1 /* 2 * util/netevent.c - event notification 3 * 4 * Copyright (c) 2007, NLnet Labs. All rights reserved. 5 * 6 * This software is open source. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * Redistributions of source code must retain the above copyright notice, 13 * this list of conditions and the following disclaimer. 14 * 15 * Redistributions in binary form must reproduce the above copyright notice, 16 * this list of conditions and the following disclaimer in the documentation 17 * and/or other materials provided with the distribution. 18 * 19 * Neither the name of the NLNET LABS nor the names of its contributors may 20 * be used to endorse or promote products derived from this software without 21 * specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36 /** 37 * \file 38 * 39 * This file contains event notification functions. 
40 */ 41 #include "config.h" 42 #include "util/netevent.h" 43 #include "util/ub_event.h" 44 #include "util/log.h" 45 #include "util/net_help.h" 46 #include "util/tcp_conn_limit.h" 47 #include "util/fptr_wlist.h" 48 #include "util/proxy_protocol.h" 49 #include "util/timeval_func.h" 50 #include "sldns/pkthdr.h" 51 #include "sldns/sbuffer.h" 52 #include "sldns/str2wire.h" 53 #include "dnstap/dnstap.h" 54 #include "dnscrypt/dnscrypt.h" 55 #include "services/listen_dnsport.h" 56 #include "util/random.h" 57 #ifdef HAVE_SYS_TYPES_H 58 #include <sys/types.h> 59 #endif 60 #ifdef HAVE_SYS_SOCKET_H 61 #include <sys/socket.h> 62 #endif 63 #ifdef HAVE_NETDB_H 64 #include <netdb.h> 65 #endif 66 #ifdef HAVE_POLL_H 67 #include <poll.h> 68 #endif 69 70 #ifdef HAVE_OPENSSL_SSL_H 71 #include <openssl/ssl.h> 72 #endif 73 #ifdef HAVE_OPENSSL_ERR_H 74 #include <openssl/err.h> 75 #endif 76 77 #ifdef HAVE_NGTCP2 78 #include <ngtcp2/ngtcp2.h> 79 #include <ngtcp2/ngtcp2_crypto.h> 80 #endif 81 82 #ifdef HAVE_LINUX_NET_TSTAMP_H 83 #include <linux/net_tstamp.h> 84 #endif 85 86 /* -------- Start of local definitions -------- */ 87 /** if CMSG_ALIGN is not defined on this platform, a workaround */ 88 #ifndef CMSG_ALIGN 89 # ifdef __CMSG_ALIGN 90 # define CMSG_ALIGN(n) __CMSG_ALIGN(n) 91 # elif defined(CMSG_DATA_ALIGN) 92 # define CMSG_ALIGN _CMSG_DATA_ALIGN 93 # else 94 # define CMSG_ALIGN(len) (((len)+sizeof(long)-1) & ~(sizeof(long)-1)) 95 # endif 96 #endif 97 98 /** if CMSG_LEN is not defined on this platform, a workaround */ 99 #ifndef CMSG_LEN 100 # define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr))+(len)) 101 #endif 102 103 /** if CMSG_SPACE is not defined on this platform, a workaround */ 104 #ifndef CMSG_SPACE 105 # ifdef _CMSG_HDR_ALIGN 106 # define CMSG_SPACE(l) (CMSG_ALIGN(l)+_CMSG_HDR_ALIGN(sizeof(struct cmsghdr))) 107 # else 108 # define CMSG_SPACE(l) (CMSG_ALIGN(l)+CMSG_ALIGN(sizeof(struct cmsghdr))) 109 # endif 110 #endif 111 112 /** The TCP writing query timeout in 
milliseconds */ 113 #define TCP_QUERY_TIMEOUT 120000 114 /** The minimum actual TCP timeout to use, regardless of what we advertise, 115 * in msec */ 116 #define TCP_QUERY_TIMEOUT_MINIMUM 200 117 118 #ifndef NONBLOCKING_IS_BROKEN 119 /** number of UDP reads to perform per read indication from select */ 120 #define NUM_UDP_PER_SELECT 100 121 #else 122 #define NUM_UDP_PER_SELECT 1 123 #endif 124 125 /** timeout in millisec to wait for write to unblock, packets dropped after.*/ 126 #define SEND_BLOCKED_WAIT_TIMEOUT 200 127 /** max number of times to wait for write to unblock, packets dropped after.*/ 128 #define SEND_BLOCKED_MAX_RETRY 5 129 130 /** Let's make timestamping code cleaner and redefine SO_TIMESTAMP* */ 131 #ifndef SO_TIMESTAMP 132 #define SO_TIMESTAMP 29 133 #endif 134 #ifndef SO_TIMESTAMPNS 135 #define SO_TIMESTAMPNS 35 136 #endif 137 #ifndef SO_TIMESTAMPING 138 #define SO_TIMESTAMPING 37 139 #endif 140 /** 141 * The internal event structure for keeping ub_event info for the event. 142 * Possibly other structures (list, tree) this is part of. 143 */ 144 struct internal_event { 145 /** the comm base */ 146 struct comm_base* base; 147 /** ub_event event type */ 148 struct ub_event* ev; 149 }; 150 151 /** 152 * Internal base structure, so that every thread has its own events. 153 */ 154 struct internal_base { 155 /** ub_event event_base type. */ 156 struct ub_event_base* base; 157 /** seconds time pointer points here */ 158 time_t secs; 159 /** timeval with current time */ 160 struct timeval now; 161 /** the event used for slow_accept timeouts */ 162 struct ub_event* slow_accept; 163 /** true if slow_accept is enabled */ 164 int slow_accept_enabled; 165 /** last log time for slow logging of file descriptor errors */ 166 time_t last_slow_log; 167 /** last log time for slow logging of write wait failures */ 168 time_t last_writewait_log; 169 }; 170 171 /** 172 * Internal timer structure, to store timer event in. 
173 */ 174 struct internal_timer { 175 /** the super struct from which derived */ 176 struct comm_timer super; 177 /** the comm base */ 178 struct comm_base* base; 179 /** ub_event event type */ 180 struct ub_event* ev; 181 /** is timer enabled */ 182 uint8_t enabled; 183 }; 184 185 /** 186 * Internal signal structure, to store signal event in. 187 */ 188 struct internal_signal { 189 /** ub_event event type */ 190 struct ub_event* ev; 191 /** next in signal list */ 192 struct internal_signal* next; 193 }; 194 195 /** create a tcp handler with a parent */ 196 static struct comm_point* comm_point_create_tcp_handler( 197 struct comm_base *base, struct comm_point* parent, size_t bufsize, 198 struct sldns_buffer* spoolbuf, comm_point_callback_type* callback, 199 void* callback_arg, struct unbound_socket* socket); 200 201 /* -------- End of local definitions -------- */ 202 203 struct comm_base* 204 comm_base_create(int sigs) 205 { 206 struct comm_base* b = (struct comm_base*)calloc(1, 207 sizeof(struct comm_base)); 208 const char *evnm="event", *evsys="", *evmethod=""; 209 210 if(!b) 211 return NULL; 212 b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base)); 213 if(!b->eb) { 214 free(b); 215 return NULL; 216 } 217 b->eb->base = ub_default_event_base(sigs, &b->eb->secs, &b->eb->now); 218 if(!b->eb->base) { 219 free(b->eb); 220 free(b); 221 return NULL; 222 } 223 ub_comm_base_now(b); 224 ub_get_event_sys(b->eb->base, &evnm, &evsys, &evmethod); 225 verbose(VERB_ALGO, "%s %s uses %s method.", evnm, evsys, evmethod); 226 return b; 227 } 228 229 struct comm_base* 230 comm_base_create_event(struct ub_event_base* base) 231 { 232 struct comm_base* b = (struct comm_base*)calloc(1, 233 sizeof(struct comm_base)); 234 if(!b) 235 return NULL; 236 b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base)); 237 if(!b->eb) { 238 free(b); 239 return NULL; 240 } 241 b->eb->base = base; 242 ub_comm_base_now(b); 243 return b; 244 } 245 246 void 247 
comm_base_delete(struct comm_base* b) 248 { 249 if(!b) 250 return; 251 if(b->eb->slow_accept_enabled) { 252 if(ub_event_del(b->eb->slow_accept) != 0) { 253 log_err("could not event_del slow_accept"); 254 } 255 ub_event_free(b->eb->slow_accept); 256 } 257 ub_event_base_free(b->eb->base); 258 b->eb->base = NULL; 259 free(b->eb); 260 free(b); 261 } 262 263 void 264 comm_base_delete_no_base(struct comm_base* b) 265 { 266 if(!b) 267 return; 268 if(b->eb->slow_accept_enabled) { 269 if(ub_event_del(b->eb->slow_accept) != 0) { 270 log_err("could not event_del slow_accept"); 271 } 272 ub_event_free(b->eb->slow_accept); 273 } 274 b->eb->base = NULL; 275 free(b->eb); 276 free(b); 277 } 278 279 void 280 comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv) 281 { 282 *tt = &b->eb->secs; 283 *tv = &b->eb->now; 284 } 285 286 void 287 comm_base_dispatch(struct comm_base* b) 288 { 289 int retval; 290 retval = ub_event_base_dispatch(b->eb->base); 291 if(retval < 0) { 292 fatal_exit("event_dispatch returned error %d, " 293 "errno is %s", retval, strerror(errno)); 294 } 295 } 296 297 void comm_base_exit(struct comm_base* b) 298 { 299 if(ub_event_base_loopexit(b->eb->base) != 0) { 300 log_err("Could not loopexit"); 301 } 302 } 303 304 void comm_base_set_slow_accept_handlers(struct comm_base* b, 305 void (*stop_acc)(void*), void (*start_acc)(void*), void* arg) 306 { 307 b->stop_accept = stop_acc; 308 b->start_accept = start_acc; 309 b->cb_arg = arg; 310 } 311 312 struct ub_event_base* comm_base_internal(struct comm_base* b) 313 { 314 return b->eb->base; 315 } 316 317 struct ub_event* comm_point_internal(struct comm_point* c) 318 { 319 return c->ev->ev; 320 } 321 322 /** see if errno for udp has to be logged or not uses globals */ 323 static int 324 udp_send_errno_needs_log(struct sockaddr* addr, socklen_t addrlen) 325 { 326 /* do not log transient errors (unless high verbosity) */ 327 #if defined(ENETUNREACH) || defined(EHOSTDOWN) || defined(EHOSTUNREACH) || 
defined(ENETDOWN) 328 switch(errno) { 329 # ifdef ENETUNREACH 330 case ENETUNREACH: 331 # endif 332 # ifdef EHOSTDOWN 333 case EHOSTDOWN: 334 # endif 335 # ifdef EHOSTUNREACH 336 case EHOSTUNREACH: 337 # endif 338 # ifdef ENETDOWN 339 case ENETDOWN: 340 # endif 341 case EPERM: 342 case EACCES: 343 if(verbosity < VERB_ALGO) 344 return 0; 345 break; 346 default: 347 break; 348 } 349 #endif 350 /* permission denied is gotten for every send if the 351 * network is disconnected (on some OS), squelch it */ 352 if( ((errno == EPERM) 353 # ifdef EADDRNOTAVAIL 354 /* 'Cannot assign requested address' also when disconnected */ 355 || (errno == EADDRNOTAVAIL) 356 # endif 357 ) && verbosity < VERB_ALGO) 358 return 0; 359 # ifdef EADDRINUSE 360 /* If SO_REUSEADDR is set, we could try to connect to the same server 361 * from the same source port twice. */ 362 if(errno == EADDRINUSE && verbosity < VERB_DETAIL) 363 return 0; 364 # endif 365 /* squelch errors where people deploy AAAA ::ffff:bla for 366 * authority servers, which we try for intranets. */ 367 if(errno == EINVAL && addr_is_ip4mapped( 368 (struct sockaddr_storage*)addr, addrlen) && 369 verbosity < VERB_DETAIL) 370 return 0; 371 /* SO_BROADCAST sockopt can give access to 255.255.255.255, 372 * but a dns cache does not need it. */ 373 if(errno == EACCES && addr_is_broadcast( 374 (struct sockaddr_storage*)addr, addrlen) && 375 verbosity < VERB_DETAIL) 376 return 0; 377 # ifdef ENOTCONN 378 /* For 0.0.0.0, ::0 targets it can return that socket is not connected. 379 * This can be ignored, and the address skipped. It remains 380 * possible to send there for completeness in configuration. 
*/ 381 if(errno == ENOTCONN && addr_is_any( 382 (struct sockaddr_storage*)addr, addrlen) && 383 verbosity < VERB_DETAIL) 384 return 0; 385 # endif 386 return 1; 387 } 388 389 int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen) 390 { 391 return udp_send_errno_needs_log(addr, addrlen); 392 } 393 394 /* send a UDP reply */ 395 int 396 comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet, 397 struct sockaddr* addr, socklen_t addrlen, int is_connected) 398 { 399 ssize_t sent; 400 log_assert(c->fd != -1); 401 #ifdef UNBOUND_DEBUG 402 if(sldns_buffer_remaining(packet) == 0) 403 log_err("error: send empty UDP packet"); 404 #endif 405 log_assert(addr && addrlen > 0); 406 if(!is_connected) { 407 sent = sendto(c->fd, (void*)sldns_buffer_begin(packet), 408 sldns_buffer_remaining(packet), 0, 409 addr, addrlen); 410 } else { 411 sent = send(c->fd, (void*)sldns_buffer_begin(packet), 412 sldns_buffer_remaining(packet), 0); 413 } 414 if(sent == -1) { 415 /* try again and block, waiting for IO to complete, 416 * we want to send the answer, and we will wait for 417 * the ethernet interface buffer to have space. 
*/ 418 #ifndef USE_WINSOCK 419 if(errno == EAGAIN || errno == EINTR || 420 # ifdef EWOULDBLOCK 421 errno == EWOULDBLOCK || 422 # endif 423 errno == ENOBUFS) { 424 #else 425 if(WSAGetLastError() == WSAEINPROGRESS || 426 WSAGetLastError() == WSAEINTR || 427 WSAGetLastError() == WSAENOBUFS || 428 WSAGetLastError() == WSAEWOULDBLOCK) { 429 #endif 430 int retries = 0; 431 /* if we set the fd blocking, other threads suddenly 432 * have a blocking fd that they operate on */ 433 while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && ( 434 #ifndef USE_WINSOCK 435 errno == EAGAIN || errno == EINTR || 436 # ifdef EWOULDBLOCK 437 errno == EWOULDBLOCK || 438 # endif 439 errno == ENOBUFS 440 #else 441 WSAGetLastError() == WSAEINPROGRESS || 442 WSAGetLastError() == WSAEINTR || 443 WSAGetLastError() == WSAENOBUFS || 444 WSAGetLastError() == WSAEWOULDBLOCK 445 #endif 446 )) { 447 #if defined(HAVE_POLL) || defined(USE_WINSOCK) 448 int send_nobufs = ( 449 #ifndef USE_WINSOCK 450 errno == ENOBUFS 451 #else 452 WSAGetLastError() == WSAENOBUFS 453 #endif 454 ); 455 struct pollfd p; 456 int pret; 457 memset(&p, 0, sizeof(p)); 458 p.fd = c->fd; 459 p.events = POLLOUT 460 #ifndef USE_WINSOCK 461 | POLLERR | POLLHUP 462 #endif 463 ; 464 # ifndef USE_WINSOCK 465 pret = poll(&p, 1, SEND_BLOCKED_WAIT_TIMEOUT); 466 # else 467 pret = WSAPoll(&p, 1, 468 SEND_BLOCKED_WAIT_TIMEOUT); 469 # endif 470 if(pret == 0) { 471 /* timer expired */ 472 struct comm_base* b = c->ev->base; 473 if(b->eb->last_writewait_log+SLOW_LOG_TIME <= 474 b->eb->secs) { 475 b->eb->last_writewait_log = b->eb->secs; 476 verbose(VERB_OPS, "send udp blocked " 477 "for long, dropping packet."); 478 } 479 return 0; 480 } else if(pret < 0 && 481 #ifndef USE_WINSOCK 482 errno != EAGAIN && errno != EINTR && 483 # ifdef EWOULDBLOCK 484 errno != EWOULDBLOCK && 485 # endif 486 errno != ENOMEM && errno != ENOBUFS 487 #else 488 WSAGetLastError() != WSAEINPROGRESS && 489 WSAGetLastError() != WSAEINTR && 490 WSAGetLastError() != WSAENOBUFS 
&& 491 WSAGetLastError() != WSAEWOULDBLOCK 492 #endif 493 ) { 494 log_err("poll udp out failed: %s", 495 sock_strerror(errno)); 496 return 0; 497 } else if((pret < 0 && 498 #ifndef USE_WINSOCK 499 ( errno == ENOBUFS /* Maybe some systems */ 500 || errno == ENOMEM /* Linux */ 501 || errno == EAGAIN) /* Macos, solaris, openbsd */ 502 #else 503 WSAGetLastError() == WSAENOBUFS 504 #endif 505 ) || (send_nobufs && retries > 0)) { 506 /* ENOBUFS/ENOMEM/EAGAIN, and poll 507 * returned without 508 * a timeout. Or the retried send call 509 * returned ENOBUFS/ENOMEM/EAGAIN. 510 * It is good to wait a bit for the 511 * error to clear. */ 512 /* The timeout is 20*(2^(retries+1)), 513 * it increases exponentially, starting 514 * at 40 msec. After 5 tries, 1240 msec 515 * have passed in total, when poll 516 * returned the error, and 1200 msec 517 * when send returned the errors. */ 518 #ifndef USE_WINSOCK 519 pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1)); 520 #else 521 Sleep((SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1)); 522 pret = 0; 523 #endif 524 if(pret < 0 525 #ifndef USE_WINSOCK 526 && errno != EAGAIN && errno != EINTR && 527 # ifdef EWOULDBLOCK 528 errno != EWOULDBLOCK && 529 # endif 530 errno != ENOMEM && errno != ENOBUFS 531 #else 532 /* Sleep does not error */ 533 #endif 534 ) { 535 log_err("poll udp out timer failed: %s", 536 sock_strerror(errno)); 537 } 538 } 539 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */ 540 retries++; 541 if (!is_connected) { 542 sent = sendto(c->fd, (void*)sldns_buffer_begin(packet), 543 sldns_buffer_remaining(packet), 0, 544 addr, addrlen); 545 } else { 546 sent = send(c->fd, (void*)sldns_buffer_begin(packet), 547 sldns_buffer_remaining(packet), 0); 548 } 549 } 550 } 551 } 552 if(sent == -1) { 553 if(!udp_send_errno_needs_log(addr, addrlen)) 554 return 0; 555 if (!is_connected) { 556 verbose(VERB_OPS, "sendto failed: %s", sock_strerror(errno)); 557 } else { 558 verbose(VERB_OPS, "send failed: %s", 
sock_strerror(errno)); 559 } 560 if(addr) 561 log_addr(VERB_OPS, "remote address is", 562 (struct sockaddr_storage*)addr, addrlen); 563 return 0; 564 } else if((size_t)sent != sldns_buffer_remaining(packet)) { 565 log_err("sent %d in place of %d bytes", 566 (int)sent, (int)sldns_buffer_remaining(packet)); 567 return 0; 568 } 569 return 1; 570 } 571 572 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && (defined(HAVE_RECVMSG) || defined(HAVE_SENDMSG)) 573 /** print debug ancillary info */ 574 static void p_ancil(const char* str, struct comm_reply* r) 575 { 576 if(r->srctype != 4 && r->srctype != 6) { 577 log_info("%s: unknown srctype %d", str, r->srctype); 578 return; 579 } 580 581 if(r->srctype == 6) { 582 #ifdef IPV6_PKTINFO 583 char buf[1024]; 584 if(inet_ntop(AF_INET6, &r->pktinfo.v6info.ipi6_addr, 585 buf, (socklen_t)sizeof(buf)) == 0) { 586 (void)strlcpy(buf, "(inet_ntop error)", sizeof(buf)); 587 } 588 buf[sizeof(buf)-1]=0; 589 log_info("%s: %s %d", str, buf, r->pktinfo.v6info.ipi6_ifindex); 590 #endif 591 } else if(r->srctype == 4) { 592 #ifdef IP_PKTINFO 593 char buf1[1024], buf2[1024]; 594 if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_addr, 595 buf1, (socklen_t)sizeof(buf1)) == 0) { 596 (void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1)); 597 } 598 buf1[sizeof(buf1)-1]=0; 599 #ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST 600 if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_spec_dst, 601 buf2, (socklen_t)sizeof(buf2)) == 0) { 602 (void)strlcpy(buf2, "(inet_ntop error)", sizeof(buf2)); 603 } 604 buf2[sizeof(buf2)-1]=0; 605 #else 606 buf2[0]=0; 607 #endif 608 log_info("%s: %d %s %s", str, r->pktinfo.v4info.ipi_ifindex, 609 buf1, buf2); 610 #elif defined(IP_RECVDSTADDR) 611 char buf1[1024]; 612 if(inet_ntop(AF_INET, &r->pktinfo.v4addr, 613 buf1, (socklen_t)sizeof(buf1)) == 0) { 614 (void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1)); 615 } 616 buf1[sizeof(buf1)-1]=0; 617 log_info("%s: %s", str, buf1); 618 #endif /* IP_PKTINFO or PI_RECVDSTDADDR */ 619 } 620 } 621 
#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG||HAVE_SENDMSG */ 622 623 /** send a UDP reply over specified interface*/ 624 static int 625 comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet, 626 struct sockaddr* addr, socklen_t addrlen, struct comm_reply* r) 627 { 628 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_SENDMSG) 629 ssize_t sent; 630 struct msghdr msg; 631 struct iovec iov[1]; 632 union { 633 struct cmsghdr hdr; 634 char buf[256]; 635 } control; 636 #ifndef S_SPLINT_S 637 struct cmsghdr *cmsg; 638 #endif /* S_SPLINT_S */ 639 640 log_assert(c->fd != -1); 641 #ifdef UNBOUND_DEBUG 642 if(sldns_buffer_remaining(packet) == 0) 643 log_err("error: send empty UDP packet"); 644 #endif 645 log_assert(addr && addrlen > 0); 646 647 msg.msg_name = addr; 648 msg.msg_namelen = addrlen; 649 iov[0].iov_base = sldns_buffer_begin(packet); 650 iov[0].iov_len = sldns_buffer_remaining(packet); 651 msg.msg_iov = iov; 652 msg.msg_iovlen = 1; 653 msg.msg_control = control.buf; 654 #ifndef S_SPLINT_S 655 msg.msg_controllen = sizeof(control.buf); 656 #endif /* S_SPLINT_S */ 657 msg.msg_flags = 0; 658 659 #ifndef S_SPLINT_S 660 cmsg = CMSG_FIRSTHDR(&msg); 661 if(r->srctype == 4) { 662 #ifdef IP_PKTINFO 663 void* cmsg_data; 664 msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo)); 665 log_assert(msg.msg_controllen <= sizeof(control.buf)); 666 cmsg->cmsg_level = IPPROTO_IP; 667 cmsg->cmsg_type = IP_PKTINFO; 668 memmove(CMSG_DATA(cmsg), &r->pktinfo.v4info, 669 sizeof(struct in_pktinfo)); 670 /* unset the ifindex to not bypass the routing tables */ 671 cmsg_data = CMSG_DATA(cmsg); 672 ((struct in_pktinfo *) cmsg_data)->ipi_ifindex = 0; 673 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); 674 /* zero the padding bytes inserted by the CMSG_LEN */ 675 if(sizeof(struct in_pktinfo) < cmsg->cmsg_len) 676 memset(((uint8_t*)(CMSG_DATA(cmsg))) + 677 sizeof(struct in_pktinfo), 0, cmsg->cmsg_len 678 - sizeof(struct in_pktinfo)); 679 #elif 
defined(IP_SENDSRCADDR) 680 msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr)); 681 log_assert(msg.msg_controllen <= sizeof(control.buf)); 682 cmsg->cmsg_level = IPPROTO_IP; 683 cmsg->cmsg_type = IP_SENDSRCADDR; 684 memmove(CMSG_DATA(cmsg), &r->pktinfo.v4addr, 685 sizeof(struct in_addr)); 686 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); 687 /* zero the padding bytes inserted by the CMSG_LEN */ 688 if(sizeof(struct in_addr) < cmsg->cmsg_len) 689 memset(((uint8_t*)(CMSG_DATA(cmsg))) + 690 sizeof(struct in_addr), 0, cmsg->cmsg_len 691 - sizeof(struct in_addr)); 692 #else 693 verbose(VERB_ALGO, "no IP_PKTINFO or IP_SENDSRCADDR"); 694 msg.msg_control = NULL; 695 #endif /* IP_PKTINFO or IP_SENDSRCADDR */ 696 } else if(r->srctype == 6) { 697 void* cmsg_data; 698 msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); 699 log_assert(msg.msg_controllen <= sizeof(control.buf)); 700 cmsg->cmsg_level = IPPROTO_IPV6; 701 cmsg->cmsg_type = IPV6_PKTINFO; 702 memmove(CMSG_DATA(cmsg), &r->pktinfo.v6info, 703 sizeof(struct in6_pktinfo)); 704 /* unset the ifindex to not bypass the routing tables */ 705 cmsg_data = CMSG_DATA(cmsg); 706 ((struct in6_pktinfo *) cmsg_data)->ipi6_ifindex = 0; 707 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); 708 /* zero the padding bytes inserted by the CMSG_LEN */ 709 if(sizeof(struct in6_pktinfo) < cmsg->cmsg_len) 710 memset(((uint8_t*)(CMSG_DATA(cmsg))) + 711 sizeof(struct in6_pktinfo), 0, cmsg->cmsg_len 712 - sizeof(struct in6_pktinfo)); 713 } else { 714 /* try to pass all 0 to use default route */ 715 msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); 716 log_assert(msg.msg_controllen <= sizeof(control.buf)); 717 cmsg->cmsg_level = IPPROTO_IPV6; 718 cmsg->cmsg_type = IPV6_PKTINFO; 719 memset(CMSG_DATA(cmsg), 0, sizeof(struct in6_pktinfo)); 720 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); 721 /* zero the padding bytes inserted by the CMSG_LEN */ 722 if(sizeof(struct in6_pktinfo) < cmsg->cmsg_len) 723 
memset(((uint8_t*)(CMSG_DATA(cmsg))) + 724 sizeof(struct in6_pktinfo), 0, cmsg->cmsg_len 725 - sizeof(struct in6_pktinfo)); 726 } 727 #endif /* S_SPLINT_S */ 728 if(verbosity >= VERB_ALGO && r->srctype != 0) 729 p_ancil("send_udp over interface", r); 730 sent = sendmsg(c->fd, &msg, 0); 731 if(sent == -1) { 732 /* try again and block, waiting for IO to complete, 733 * we want to send the answer, and we will wait for 734 * the ethernet interface buffer to have space. */ 735 #ifndef USE_WINSOCK 736 if(errno == EAGAIN || errno == EINTR || 737 # ifdef EWOULDBLOCK 738 errno == EWOULDBLOCK || 739 # endif 740 errno == ENOBUFS) { 741 #else 742 if(WSAGetLastError() == WSAEINPROGRESS || 743 WSAGetLastError() == WSAEINTR || 744 WSAGetLastError() == WSAENOBUFS || 745 WSAGetLastError() == WSAEWOULDBLOCK) { 746 #endif 747 int retries = 0; 748 while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && ( 749 #ifndef USE_WINSOCK 750 errno == EAGAIN || errno == EINTR || 751 # ifdef EWOULDBLOCK 752 errno == EWOULDBLOCK || 753 # endif 754 errno == ENOBUFS 755 #else 756 WSAGetLastError() == WSAEINPROGRESS || 757 WSAGetLastError() == WSAEINTR || 758 WSAGetLastError() == WSAENOBUFS || 759 WSAGetLastError() == WSAEWOULDBLOCK 760 #endif 761 )) { 762 #if defined(HAVE_POLL) || defined(USE_WINSOCK) 763 int send_nobufs = ( 764 #ifndef USE_WINSOCK 765 errno == ENOBUFS 766 #else 767 WSAGetLastError() == WSAENOBUFS 768 #endif 769 ); 770 struct pollfd p; 771 int pret; 772 memset(&p, 0, sizeof(p)); 773 p.fd = c->fd; 774 p.events = POLLOUT 775 #ifndef USE_WINSOCK 776 | POLLERR | POLLHUP 777 #endif 778 ; 779 # ifndef USE_WINSOCK 780 pret = poll(&p, 1, SEND_BLOCKED_WAIT_TIMEOUT); 781 # else 782 pret = WSAPoll(&p, 1, 783 SEND_BLOCKED_WAIT_TIMEOUT); 784 # endif 785 if(pret == 0) { 786 /* timer expired */ 787 struct comm_base* b = c->ev->base; 788 if(b->eb->last_writewait_log+SLOW_LOG_TIME <= 789 b->eb->secs) { 790 b->eb->last_writewait_log = b->eb->secs; 791 verbose(VERB_OPS, "send udp blocked " 792 "for 
long, dropping packet."); 793 } 794 return 0; 795 } else if(pret < 0 && 796 #ifndef USE_WINSOCK 797 errno != EAGAIN && errno != EINTR && 798 # ifdef EWOULDBLOCK 799 errno != EWOULDBLOCK && 800 # endif 801 errno != ENOMEM && errno != ENOBUFS 802 #else 803 WSAGetLastError() != WSAEINPROGRESS && 804 WSAGetLastError() != WSAEINTR && 805 WSAGetLastError() != WSAENOBUFS && 806 WSAGetLastError() != WSAEWOULDBLOCK 807 #endif 808 ) { 809 log_err("poll udp out failed: %s", 810 sock_strerror(errno)); 811 return 0; 812 } else if((pret < 0 && 813 #ifndef USE_WINSOCK 814 ( errno == ENOBUFS /* Maybe some systems */ 815 || errno == ENOMEM /* Linux */ 816 || errno == EAGAIN) /* Macos, solaris, openbsd */ 817 #else 818 WSAGetLastError() == WSAENOBUFS 819 #endif 820 ) || (send_nobufs && retries > 0)) { 821 /* ENOBUFS/ENOMEM/EAGAIN, and poll 822 * returned without 823 * a timeout. Or the retried send call 824 * returned ENOBUFS/ENOMEM/EAGAIN. 825 * It is good to wait a bit for the 826 * error to clear. */ 827 /* The timeout is 20*(2^(retries+1)), 828 * it increases exponentially, starting 829 * at 40 msec. After 5 tries, 1240 msec 830 * have passed in total, when poll 831 * returned the error, and 1200 msec 832 * when send returned the errors. 
*/ 833 #ifndef USE_WINSOCK 834 pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1)); 835 #else 836 Sleep((SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1)); 837 pret = 0; 838 #endif 839 if(pret < 0 840 #ifndef USE_WINSOCK 841 && errno != EAGAIN && errno != EINTR && 842 # ifdef EWOULDBLOCK 843 errno != EWOULDBLOCK && 844 # endif 845 errno != ENOMEM && errno != ENOBUFS 846 #else /* USE_WINSOCK */ 847 /* Sleep does not error */ 848 #endif 849 ) { 850 log_err("poll udp out timer failed: %s", 851 sock_strerror(errno)); 852 } 853 } 854 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */ 855 retries++; 856 sent = sendmsg(c->fd, &msg, 0); 857 } 858 } 859 } 860 if(sent == -1) { 861 if(!udp_send_errno_needs_log(addr, addrlen)) 862 return 0; 863 verbose(VERB_OPS, "sendmsg failed: %s", strerror(errno)); 864 log_addr(VERB_OPS, "remote address is", 865 (struct sockaddr_storage*)addr, addrlen); 866 #ifdef __NetBSD__ 867 /* netbsd 7 has IP_PKTINFO for recv but not send */ 868 if(errno == EINVAL && r->srctype == 4) 869 log_err("sendmsg: No support for sendmsg(IP_PKTINFO). 
" 870 "Please disable interface-automatic"); 871 #endif 872 return 0; 873 } else if((size_t)sent != sldns_buffer_remaining(packet)) { 874 log_err("sent %d in place of %d bytes", 875 (int)sent, (int)sldns_buffer_remaining(packet)); 876 return 0; 877 } 878 return 1; 879 #else 880 (void)c; 881 (void)packet; 882 (void)addr; 883 (void)addrlen; 884 (void)r; 885 log_err("sendmsg: IPV6_PKTINFO not supported"); 886 return 0; 887 #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_SENDMSG */ 888 } 889 890 /** return true is UDP receive error needs to be logged */ 891 static int udp_recv_needs_log(int err) 892 { 893 switch(err) { 894 case EACCES: /* some hosts send ICMP 'Permission Denied' */ 895 #ifndef USE_WINSOCK 896 case ECONNREFUSED: 897 # ifdef ENETUNREACH 898 case ENETUNREACH: 899 # endif 900 # ifdef EHOSTDOWN 901 case EHOSTDOWN: 902 # endif 903 # ifdef EHOSTUNREACH 904 case EHOSTUNREACH: 905 # endif 906 # ifdef ENETDOWN 907 case ENETDOWN: 908 # endif 909 #else /* USE_WINSOCK */ 910 case WSAECONNREFUSED: 911 case WSAENETUNREACH: 912 case WSAEHOSTDOWN: 913 case WSAEHOSTUNREACH: 914 case WSAENETDOWN: 915 #endif 916 if(verbosity >= VERB_ALGO) 917 return 1; 918 return 0; 919 default: 920 break; 921 } 922 return 1; 923 } 924 925 /** Parses the PROXYv2 header from buf and updates the comm_reply struct. 926 * Returns 1 on success, 0 on failure. */ 927 static int consume_pp2_header(struct sldns_buffer* buf, struct comm_reply* rep, 928 int stream) { 929 size_t size; 930 struct pp2_header *header; 931 int err = pp2_read_header(sldns_buffer_begin(buf), 932 sldns_buffer_remaining(buf)); 933 if(err) return 0; 934 header = (struct pp2_header*)sldns_buffer_begin(buf); 935 size = PP2_HEADER_SIZE + ntohs(header->len); 936 if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) { 937 /* A connection from the proxy itself. 938 * No need to do anything with addresses. */ 939 goto done; 940 } 941 if(header->fam_prot == PP2_UNSPEC_UNSPEC) { 942 /* Unspecified family and protocol. 
This could be used for 943 * health checks by proxies. 944 * No need to do anything with addresses. */ 945 goto done; 946 } 947 /* Read the proxied address */ 948 switch(header->fam_prot) { 949 case PP2_INET_STREAM: 950 case PP2_INET_DGRAM: 951 { 952 struct sockaddr_in* addr = 953 (struct sockaddr_in*)&rep->client_addr; 954 addr->sin_family = AF_INET; 955 addr->sin_addr.s_addr = header->addr.addr4.src_addr; 956 addr->sin_port = header->addr.addr4.src_port; 957 rep->client_addrlen = (socklen_t)sizeof(struct sockaddr_in); 958 } 959 /* Ignore the destination address; it should be us. */ 960 break; 961 case PP2_INET6_STREAM: 962 case PP2_INET6_DGRAM: 963 { 964 struct sockaddr_in6* addr = 965 (struct sockaddr_in6*)&rep->client_addr; 966 memset(addr, 0, sizeof(*addr)); 967 addr->sin6_family = AF_INET6; 968 memcpy(&addr->sin6_addr, 969 header->addr.addr6.src_addr, 16); 970 addr->sin6_port = header->addr.addr6.src_port; 971 rep->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6); 972 } 973 /* Ignore the destination address; it should be us. */ 974 break; 975 default: 976 log_err("proxy_protocol: unsupported family and " 977 "protocol 0x%x", (int)header->fam_prot); 978 return 0; 979 } 980 rep->is_proxied = 1; 981 done: 982 if(!stream) { 983 /* We are reading a whole packet; 984 * Move the rest of the data to overwrite the PROXYv2 header */ 985 /* XXX can we do better to avoid memmove? 
*/ 986 memmove(header, ((char*)header)+size, 987 sldns_buffer_limit(buf)-size); 988 sldns_buffer_set_limit(buf, sldns_buffer_limit(buf)-size); 989 } 990 return 1; 991 } 992 993 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG) 994 void 995 comm_point_udp_ancil_callback(int fd, short event, void* arg) 996 { 997 struct comm_reply rep; 998 struct msghdr msg; 999 struct iovec iov[1]; 1000 ssize_t rcv; 1001 union { 1002 struct cmsghdr hdr; 1003 char buf[256]; 1004 } ancil; 1005 int i; 1006 #ifndef S_SPLINT_S 1007 struct cmsghdr* cmsg; 1008 #endif /* S_SPLINT_S */ 1009 #ifdef HAVE_LINUX_NET_TSTAMP_H 1010 struct timespec *ts; 1011 #endif /* HAVE_LINUX_NET_TSTAMP_H */ 1012 1013 rep.c = (struct comm_point*)arg; 1014 log_assert(rep.c->type == comm_udp); 1015 1016 if(!(event&UB_EV_READ)) 1017 return; 1018 log_assert(rep.c && rep.c->buffer && rep.c->fd == fd); 1019 ub_comm_base_now(rep.c->ev->base); 1020 for(i=0; i<NUM_UDP_PER_SELECT; i++) { 1021 sldns_buffer_clear(rep.c->buffer); 1022 timeval_clear(&rep.c->recv_tv); 1023 rep.remote_addrlen = (socklen_t)sizeof(rep.remote_addr); 1024 log_assert(fd != -1); 1025 log_assert(sldns_buffer_remaining(rep.c->buffer) > 0); 1026 msg.msg_name = &rep.remote_addr; 1027 msg.msg_namelen = (socklen_t)sizeof(rep.remote_addr); 1028 iov[0].iov_base = sldns_buffer_begin(rep.c->buffer); 1029 iov[0].iov_len = sldns_buffer_remaining(rep.c->buffer); 1030 msg.msg_iov = iov; 1031 msg.msg_iovlen = 1; 1032 msg.msg_control = ancil.buf; 1033 #ifndef S_SPLINT_S 1034 msg.msg_controllen = sizeof(ancil.buf); 1035 #endif /* S_SPLINT_S */ 1036 msg.msg_flags = 0; 1037 rcv = recvmsg(fd, &msg, MSG_DONTWAIT); 1038 if(rcv == -1) { 1039 if(errno != EAGAIN && errno != EINTR 1040 && udp_recv_needs_log(errno)) { 1041 log_err("recvmsg failed: %s", strerror(errno)); 1042 } 1043 return; 1044 } 1045 rep.remote_addrlen = msg.msg_namelen; 1046 sldns_buffer_skip(rep.c->buffer, rcv); 1047 sldns_buffer_flip(rep.c->buffer); 1048 rep.srctype = 0; 1049 
rep.is_proxied = 0; 1050 #ifndef S_SPLINT_S 1051 for(cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; 1052 cmsg = CMSG_NXTHDR(&msg, cmsg)) { 1053 if( cmsg->cmsg_level == IPPROTO_IPV6 && 1054 cmsg->cmsg_type == IPV6_PKTINFO) { 1055 rep.srctype = 6; 1056 memmove(&rep.pktinfo.v6info, CMSG_DATA(cmsg), 1057 sizeof(struct in6_pktinfo)); 1058 break; 1059 #ifdef IP_PKTINFO 1060 } else if( cmsg->cmsg_level == IPPROTO_IP && 1061 cmsg->cmsg_type == IP_PKTINFO) { 1062 rep.srctype = 4; 1063 memmove(&rep.pktinfo.v4info, CMSG_DATA(cmsg), 1064 sizeof(struct in_pktinfo)); 1065 break; 1066 #elif defined(IP_RECVDSTADDR) 1067 } else if( cmsg->cmsg_level == IPPROTO_IP && 1068 cmsg->cmsg_type == IP_RECVDSTADDR) { 1069 rep.srctype = 4; 1070 memmove(&rep.pktinfo.v4addr, CMSG_DATA(cmsg), 1071 sizeof(struct in_addr)); 1072 break; 1073 #endif /* IP_PKTINFO or IP_RECVDSTADDR */ 1074 #ifdef HAVE_LINUX_NET_TSTAMP_H 1075 } else if( cmsg->cmsg_level == SOL_SOCKET && 1076 cmsg->cmsg_type == SO_TIMESTAMPNS) { 1077 ts = (struct timespec *)CMSG_DATA(cmsg); 1078 TIMESPEC_TO_TIMEVAL(&rep.c->recv_tv, ts); 1079 } else if( cmsg->cmsg_level == SOL_SOCKET && 1080 cmsg->cmsg_type == SO_TIMESTAMPING) { 1081 ts = (struct timespec *)CMSG_DATA(cmsg); 1082 TIMESPEC_TO_TIMEVAL(&rep.c->recv_tv, ts); 1083 } else if( cmsg->cmsg_level == SOL_SOCKET && 1084 cmsg->cmsg_type == SO_TIMESTAMP) { 1085 memmove(&rep.c->recv_tv, CMSG_DATA(cmsg), sizeof(struct timeval)); 1086 #endif /* HAVE_LINUX_NET_TSTAMP_H */ 1087 } 1088 } 1089 1090 if(verbosity >= VERB_ALGO && rep.srctype != 0) 1091 p_ancil("receive_udp on interface", &rep); 1092 #endif /* S_SPLINT_S */ 1093 1094 if(rep.c->pp2_enabled && !consume_pp2_header(rep.c->buffer, 1095 &rep, 0)) { 1096 log_err("proxy_protocol: could not consume PROXYv2 header"); 1097 return; 1098 } 1099 if(!rep.is_proxied) { 1100 rep.client_addrlen = rep.remote_addrlen; 1101 memmove(&rep.client_addr, &rep.remote_addr, 1102 rep.remote_addrlen); 1103 } 1104 1105 
fptr_ok(fptr_whitelist_comm_point(rep.c->callback)); 1106 if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) { 1107 /* send back immediate reply */ 1108 struct sldns_buffer *buffer; 1109 #ifdef USE_DNSCRYPT 1110 buffer = rep.c->dnscrypt_buffer; 1111 #else 1112 buffer = rep.c->buffer; 1113 #endif 1114 (void)comm_point_send_udp_msg_if(rep.c, buffer, 1115 (struct sockaddr*)&rep.remote_addr, 1116 rep.remote_addrlen, &rep); 1117 } 1118 if(!rep.c || rep.c->fd == -1) /* commpoint closed */ 1119 break; 1120 } 1121 } 1122 #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG */ 1123 1124 void 1125 comm_point_udp_callback(int fd, short event, void* arg) 1126 { 1127 struct comm_reply rep; 1128 ssize_t rcv; 1129 int i; 1130 struct sldns_buffer *buffer; 1131 1132 rep.c = (struct comm_point*)arg; 1133 log_assert(rep.c->type == comm_udp); 1134 1135 if(!(event&UB_EV_READ)) 1136 return; 1137 log_assert(rep.c && rep.c->buffer && rep.c->fd == fd); 1138 ub_comm_base_now(rep.c->ev->base); 1139 for(i=0; i<NUM_UDP_PER_SELECT; i++) { 1140 sldns_buffer_clear(rep.c->buffer); 1141 rep.remote_addrlen = (socklen_t)sizeof(rep.remote_addr); 1142 log_assert(fd != -1); 1143 log_assert(sldns_buffer_remaining(rep.c->buffer) > 0); 1144 rcv = recvfrom(fd, (void*)sldns_buffer_begin(rep.c->buffer), 1145 sldns_buffer_remaining(rep.c->buffer), MSG_DONTWAIT, 1146 (struct sockaddr*)&rep.remote_addr, &rep.remote_addrlen); 1147 if(rcv == -1) { 1148 #ifndef USE_WINSOCK 1149 if(errno != EAGAIN && errno != EINTR 1150 && udp_recv_needs_log(errno)) 1151 log_err("recvfrom %d failed: %s", 1152 fd, strerror(errno)); 1153 #else 1154 if(WSAGetLastError() != WSAEINPROGRESS && 1155 WSAGetLastError() != WSAECONNRESET && 1156 WSAGetLastError()!= WSAEWOULDBLOCK && 1157 udp_recv_needs_log(WSAGetLastError())) 1158 log_err("recvfrom failed: %s", 1159 wsa_strerror(WSAGetLastError())); 1160 #endif 1161 return; 1162 } 1163 sldns_buffer_skip(rep.c->buffer, rcv); 1164 sldns_buffer_flip(rep.c->buffer); 1165 
rep.srctype = 0; 1166 rep.is_proxied = 0; 1167 1168 if(rep.c->pp2_enabled && !consume_pp2_header(rep.c->buffer, 1169 &rep, 0)) { 1170 log_err("proxy_protocol: could not consume PROXYv2 header"); 1171 return; 1172 } 1173 if(!rep.is_proxied) { 1174 rep.client_addrlen = rep.remote_addrlen; 1175 memmove(&rep.client_addr, &rep.remote_addr, 1176 rep.remote_addrlen); 1177 } 1178 1179 fptr_ok(fptr_whitelist_comm_point(rep.c->callback)); 1180 if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) { 1181 /* send back immediate reply */ 1182 #ifdef USE_DNSCRYPT 1183 buffer = rep.c->dnscrypt_buffer; 1184 #else 1185 buffer = rep.c->buffer; 1186 #endif 1187 (void)comm_point_send_udp_msg(rep.c, buffer, 1188 (struct sockaddr*)&rep.remote_addr, 1189 rep.remote_addrlen, 0); 1190 } 1191 if(!rep.c || rep.c->fd != fd) /* commpoint closed to -1 or reused for 1192 another UDP port. Note rep.c cannot be reused with TCP fd. */ 1193 break; 1194 } 1195 } 1196 1197 #ifdef HAVE_NGTCP2 1198 void 1199 doq_pkt_addr_init(struct doq_pkt_addr* paddr) 1200 { 1201 paddr->addrlen = (socklen_t)sizeof(paddr->addr); 1202 paddr->localaddrlen = (socklen_t)sizeof(paddr->localaddr); 1203 paddr->ifindex = 0; 1204 } 1205 1206 /** set the ecn on the transmission */ 1207 static void 1208 doq_set_ecn(int fd, int family, uint32_t ecn) 1209 { 1210 unsigned int val = ecn; 1211 if(family == AF_INET6) { 1212 if(setsockopt(fd, IPPROTO_IPV6, IPV6_TCLASS, &val, 1213 (socklen_t)sizeof(val)) == -1) { 1214 log_err("setsockopt(.. IPV6_TCLASS ..): %s", 1215 strerror(errno)); 1216 } 1217 return; 1218 } 1219 if(setsockopt(fd, IPPROTO_IP, IP_TOS, &val, 1220 (socklen_t)sizeof(val)) == -1) { 1221 log_err("setsockopt(.. 
IP_TOS ..): %s", 1222 strerror(errno)); 1223 } 1224 } 1225 1226 /** set the local address in the control ancillary data */ 1227 static void 1228 doq_set_localaddr_cmsg(struct msghdr* msg, size_t control_size, 1229 struct doq_addr_storage* localaddr, socklen_t localaddrlen, 1230 int ifindex) 1231 { 1232 #ifndef S_SPLINT_S 1233 struct cmsghdr* cmsg; 1234 #endif /* S_SPLINT_S */ 1235 #ifndef S_SPLINT_S 1236 cmsg = CMSG_FIRSTHDR(msg); 1237 if(localaddr->sockaddr.in.sin_family == AF_INET) { 1238 #ifdef IP_PKTINFO 1239 struct sockaddr_in* sa = (struct sockaddr_in*)localaddr; 1240 struct in_pktinfo v4info; 1241 log_assert(localaddrlen >= sizeof(struct sockaddr_in)); 1242 msg->msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo)); 1243 memset(msg->msg_control, 0, msg->msg_controllen); 1244 log_assert(msg->msg_controllen <= control_size); 1245 cmsg->cmsg_level = IPPROTO_IP; 1246 cmsg->cmsg_type = IP_PKTINFO; 1247 memset(&v4info, 0, sizeof(v4info)); 1248 # ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST 1249 memmove(&v4info.ipi_spec_dst, &sa->sin_addr, 1250 sizeof(struct in_addr)); 1251 # else 1252 memmove(&v4info.ipi_addr, &sa->sin_addr, 1253 sizeof(struct in_addr)); 1254 # endif 1255 v4info.ipi_ifindex = ifindex; 1256 memmove(CMSG_DATA(cmsg), &v4info, sizeof(struct in_pktinfo)); 1257 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); 1258 #elif defined(IP_SENDSRCADDR) 1259 struct sockaddr_in* sa= (struct sockaddr_in*)localaddr; 1260 log_assert(localaddrlen >= sizeof(struct sockaddr_in)); 1261 msg->msg_controllen = CMSG_SPACE(sizeof(struct in_addr)); 1262 memset(msg->msg_control, 0, msg->msg_controllen); 1263 log_assert(msg->msg_controllen <= control_size); 1264 cmsg->cmsg_level = IPPROTO_IP; 1265 cmsg->cmsg_type = IP_SENDSRCADDR; 1266 memmove(CMSG_DATA(cmsg), &sa->sin_addr, 1267 sizeof(struct in_addr)); 1268 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); 1269 #endif 1270 } else { 1271 struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr; 1272 struct in6_pktinfo 
v6info; 1273 log_assert(localaddrlen >= sizeof(struct sockaddr_in6)); 1274 msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); 1275 memset(msg->msg_control, 0, msg->msg_controllen); 1276 log_assert(msg->msg_controllen <= control_size); 1277 cmsg->cmsg_level = IPPROTO_IPV6; 1278 cmsg->cmsg_type = IPV6_PKTINFO; 1279 memset(&v6info, 0, sizeof(v6info)); 1280 memmove(&v6info.ipi6_addr, &sa6->sin6_addr, 1281 sizeof(struct in6_addr)); 1282 v6info.ipi6_ifindex = ifindex; 1283 memmove(CMSG_DATA(cmsg), &v6info, sizeof(struct in6_pktinfo)); 1284 cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); 1285 } 1286 #endif /* S_SPLINT_S */ 1287 /* Ignore unused variables, if no assertions are compiled. */ 1288 (void)localaddrlen; 1289 (void)control_size; 1290 } 1291 1292 /** write address and port into strings */ 1293 static int 1294 doq_print_addr_port(struct doq_addr_storage* addr, socklen_t addrlen, 1295 char* host, size_t hostlen, char* port, size_t portlen) 1296 { 1297 if(addr->sockaddr.in.sin_family == AF_INET) { 1298 struct sockaddr_in* sa = (struct sockaddr_in*)addr; 1299 log_assert(addrlen >= sizeof(*sa)); 1300 if(inet_ntop(sa->sin_family, &sa->sin_addr, host, 1301 (socklen_t)hostlen) == 0) { 1302 log_hex("inet_ntop error: address", &sa->sin_addr, 1303 sizeof(sa->sin_addr)); 1304 return 0; 1305 } 1306 snprintf(port, portlen, "%u", (unsigned)ntohs(sa->sin_port)); 1307 } else if(addr->sockaddr.in.sin_family == AF_INET6) { 1308 struct sockaddr_in6* sa6 = (struct sockaddr_in6*)addr; 1309 log_assert(addrlen >= sizeof(*sa6)); 1310 if(inet_ntop(sa6->sin6_family, &sa6->sin6_addr, host, 1311 (socklen_t)hostlen) == 0) { 1312 log_hex("inet_ntop error: address", &sa6->sin6_addr, 1313 sizeof(sa6->sin6_addr)); 1314 return 0; 1315 } 1316 snprintf(port, portlen, "%u", (unsigned)ntohs(sa6->sin6_port)); 1317 } 1318 return 1; 1319 } 1320 1321 /** doq store the blocked packet when write has blocked */ 1322 static void 1323 doq_store_blocked_pkt(struct comm_point* c, struct 
doq_pkt_addr* paddr, 1324 uint32_t ecn) 1325 { 1326 if(c->doq_socket->have_blocked_pkt) 1327 return; /* should not happen that we write when there is 1328 already a blocked write, but if so, drop it. */ 1329 if(sldns_buffer_limit(c->doq_socket->pkt_buf) > 1330 sldns_buffer_capacity(c->doq_socket->blocked_pkt)) 1331 return; /* impossibly large, drop packet. impossible because 1332 pkt_buf and blocked_pkt are the same size. */ 1333 c->doq_socket->have_blocked_pkt = 1; 1334 c->doq_socket->blocked_pkt_pi.ecn = ecn; 1335 memcpy(c->doq_socket->blocked_paddr, paddr, 1336 sizeof(*c->doq_socket->blocked_paddr)); 1337 sldns_buffer_clear(c->doq_socket->blocked_pkt); 1338 sldns_buffer_write(c->doq_socket->blocked_pkt, 1339 sldns_buffer_begin(c->doq_socket->pkt_buf), 1340 sldns_buffer_limit(c->doq_socket->pkt_buf)); 1341 sldns_buffer_flip(c->doq_socket->blocked_pkt); 1342 } 1343 1344 void 1345 doq_send_pkt(struct comm_point* c, struct doq_pkt_addr* paddr, uint32_t ecn) 1346 { 1347 struct msghdr msg; 1348 struct iovec iov[1]; 1349 union { 1350 struct cmsghdr hdr; 1351 char buf[256]; 1352 } control; 1353 ssize_t ret; 1354 iov[0].iov_base = sldns_buffer_begin(c->doq_socket->pkt_buf); 1355 iov[0].iov_len = sldns_buffer_limit(c->doq_socket->pkt_buf); 1356 memset(&msg, 0, sizeof(msg)); 1357 msg.msg_name = (void*)&paddr->addr; 1358 msg.msg_namelen = paddr->addrlen; 1359 msg.msg_iov = iov; 1360 msg.msg_iovlen = 1; 1361 msg.msg_control = control.buf; 1362 #ifndef S_SPLINT_S 1363 msg.msg_controllen = sizeof(control.buf); 1364 #endif /* S_SPLINT_S */ 1365 msg.msg_flags = 0; 1366 1367 doq_set_localaddr_cmsg(&msg, sizeof(control.buf), &paddr->localaddr, 1368 paddr->localaddrlen, paddr->ifindex); 1369 doq_set_ecn(c->fd, paddr->addr.sockaddr.in.sin_family, ecn); 1370 1371 for(;;) { 1372 ret = sendmsg(c->fd, &msg, MSG_DONTWAIT); 1373 if(ret == -1 && errno == EINTR) 1374 continue; 1375 break; 1376 } 1377 if(ret == -1) { 1378 #ifndef USE_WINSOCK 1379 if(errno == EAGAIN || 1380 # ifdef 
EWOULDBLOCK 1381 errno == EWOULDBLOCK || 1382 # endif 1383 errno == ENOBUFS) 1384 #else 1385 if(WSAGetLastError() == WSAEINPROGRESS || 1386 WSAGetLastError() == WSAENOBUFS || 1387 WSAGetLastError() == WSAEWOULDBLOCK) 1388 #endif 1389 { 1390 /* udp send has blocked */ 1391 doq_store_blocked_pkt(c, paddr, ecn); 1392 return; 1393 } 1394 if(!udp_send_errno_needs_log((void*)&paddr->addr, 1395 paddr->addrlen)) 1396 return; 1397 if(verbosity >= VERB_OPS) { 1398 char host[256], port[32]; 1399 if(doq_print_addr_port(&paddr->addr, paddr->addrlen, 1400 host, sizeof(host), port, sizeof(port))) { 1401 verbose(VERB_OPS, "doq sendmsg to %s %s " 1402 "failed: %s", host, port, 1403 strerror(errno)); 1404 } else { 1405 verbose(VERB_OPS, "doq sendmsg failed: %s", 1406 strerror(errno)); 1407 } 1408 } 1409 return; 1410 } else if(ret != (ssize_t)sldns_buffer_limit(c->doq_socket->pkt_buf)) { 1411 char host[256], port[32]; 1412 if(doq_print_addr_port(&paddr->addr, paddr->addrlen, host, 1413 sizeof(host), port, sizeof(port))) { 1414 log_err("doq sendmsg to %s %s failed: " 1415 "sent %d in place of %d bytes", 1416 host, port, (int)ret, 1417 (int)sldns_buffer_limit(c->doq_socket->pkt_buf)); 1418 } else { 1419 log_err("doq sendmsg failed: " 1420 "sent %d in place of %d bytes", 1421 (int)ret, (int)sldns_buffer_limit(c->doq_socket->pkt_buf)); 1422 } 1423 return; 1424 } 1425 } 1426 1427 /** fetch port number */ 1428 static int 1429 doq_sockaddr_get_port(struct doq_addr_storage* addr) 1430 { 1431 if(addr->sockaddr.in.sin_family == AF_INET) { 1432 struct sockaddr_in* sa = (struct sockaddr_in*)addr; 1433 return ntohs(sa->sin_port); 1434 } else if(addr->sockaddr.in.sin_family == AF_INET6) { 1435 struct sockaddr_in6* sa6 = (struct sockaddr_in6*)addr; 1436 return ntohs(sa6->sin6_port); 1437 } 1438 return 0; 1439 } 1440 1441 /** get local address from ancillary data headers */ 1442 static int 1443 doq_get_localaddr_cmsg(struct comm_point* c, struct doq_pkt_addr* paddr, 1444 int* pkt_continue, struct 
msghdr* msg) 1445 { 1446 #ifndef S_SPLINT_S 1447 struct cmsghdr* cmsg; 1448 #endif /* S_SPLINT_S */ 1449 1450 memset(&paddr->localaddr, 0, sizeof(paddr->localaddr)); 1451 #ifndef S_SPLINT_S 1452 for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1453 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1454 if( cmsg->cmsg_level == IPPROTO_IPV6 && 1455 cmsg->cmsg_type == IPV6_PKTINFO) { 1456 struct in6_pktinfo* v6info = 1457 (struct in6_pktinfo*)CMSG_DATA(cmsg); 1458 struct sockaddr_in6* sa= (struct sockaddr_in6*) 1459 &paddr->localaddr; 1460 struct sockaddr_in6* rema = (struct sockaddr_in6*) 1461 &paddr->addr; 1462 if(rema->sin6_family != AF_INET6) { 1463 log_err("doq cmsg family mismatch cmsg is ip6"); 1464 *pkt_continue = 1; 1465 return 0; 1466 } 1467 sa->sin6_family = AF_INET6; 1468 sa->sin6_port = htons(doq_sockaddr_get_port( 1469 (void*)c->socket->addr)); 1470 paddr->ifindex = v6info->ipi6_ifindex; 1471 memmove(&sa->sin6_addr, &v6info->ipi6_addr, 1472 sizeof(struct in6_addr)); 1473 paddr->localaddrlen = sizeof(struct sockaddr_in6); 1474 break; 1475 #ifdef IP_PKTINFO 1476 } else if( cmsg->cmsg_level == IPPROTO_IP && 1477 cmsg->cmsg_type == IP_PKTINFO) { 1478 struct in_pktinfo* v4info = 1479 (struct in_pktinfo*)CMSG_DATA(cmsg); 1480 struct sockaddr_in* sa= (struct sockaddr_in*) 1481 &paddr->localaddr; 1482 struct sockaddr_in* rema = (struct sockaddr_in*) 1483 &paddr->addr; 1484 if(rema->sin_family != AF_INET) { 1485 log_err("doq cmsg family mismatch cmsg is ip4"); 1486 *pkt_continue = 1; 1487 return 0; 1488 } 1489 sa->sin_family = AF_INET; 1490 sa->sin_port = htons(doq_sockaddr_get_port( 1491 (void*)c->socket->addr)); 1492 paddr->ifindex = v4info->ipi_ifindex; 1493 memmove(&sa->sin_addr, &v4info->ipi_addr, 1494 sizeof(struct in_addr)); 1495 paddr->localaddrlen = sizeof(struct sockaddr_in); 1496 break; 1497 #elif defined(IP_RECVDSTADDR) 1498 } else if( cmsg->cmsg_level == IPPROTO_IP && 1499 cmsg->cmsg_type == IP_RECVDSTADDR) { 1500 struct sockaddr_in* sa= (struct sockaddr_in*) 1501 
&paddr->localaddr; 1502 struct sockaddr_in* rema = (struct sockaddr_in*) 1503 &paddr->addr; 1504 if(rema->sin_family != AF_INET) { 1505 log_err("doq cmsg family mismatch cmsg is ip4"); 1506 *pkt_continue = 1; 1507 return 0; 1508 } 1509 sa->sin_family = AF_INET; 1510 sa->sin_port = htons(doq_sockaddr_get_port( 1511 (void*)c->socket->addr)); 1512 paddr->ifindex = 0; 1513 memmove(&sa.sin_addr, CMSG_DATA(cmsg), 1514 sizeof(struct in_addr)); 1515 paddr->localaddrlen = sizeof(struct sockaddr_in); 1516 break; 1517 #endif /* IP_PKTINFO or IP_RECVDSTADDR */ 1518 } 1519 } 1520 #endif /* S_SPLINT_S */ 1521 1522 return 1; 1523 } 1524 1525 /** get packet ecn information */ 1526 static uint32_t 1527 msghdr_get_ecn(struct msghdr* msg, int family) 1528 { 1529 #ifndef S_SPLINT_S 1530 struct cmsghdr* cmsg; 1531 if(family == AF_INET6) { 1532 for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1533 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1534 if(cmsg->cmsg_level == IPPROTO_IPV6 && 1535 cmsg->cmsg_type == IPV6_TCLASS && 1536 cmsg->cmsg_len != 0) { 1537 uint8_t* ecn = (uint8_t*)CMSG_DATA(cmsg); 1538 return *ecn; 1539 } 1540 } 1541 return 0; 1542 } 1543 for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1544 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1545 if(cmsg->cmsg_level == IPPROTO_IP && 1546 cmsg->cmsg_type == IP_TOS && 1547 cmsg->cmsg_len != 0) { 1548 uint8_t* ecn = (uint8_t*)CMSG_DATA(cmsg); 1549 return *ecn; 1550 } 1551 } 1552 #endif /* S_SPLINT_S */ 1553 return 0; 1554 } 1555 1556 /** receive packet for DoQ on UDP. get ancillary data for addresses, 1557 * return false if failed and the callback can stop receiving UDP packets 1558 * if pkt_continue is false. 
*/ 1559 static int 1560 doq_recv(struct comm_point* c, struct doq_pkt_addr* paddr, int* pkt_continue, 1561 struct ngtcp2_pkt_info* pi) 1562 { 1563 struct msghdr msg; 1564 struct iovec iov[1]; 1565 ssize_t rcv; 1566 union { 1567 struct cmsghdr hdr; 1568 char buf[256]; 1569 } ancil; 1570 1571 msg.msg_name = &paddr->addr; 1572 msg.msg_namelen = (socklen_t)sizeof(paddr->addr); 1573 iov[0].iov_base = sldns_buffer_begin(c->doq_socket->pkt_buf); 1574 iov[0].iov_len = sldns_buffer_remaining(c->doq_socket->pkt_buf); 1575 msg.msg_iov = iov; 1576 msg.msg_iovlen = 1; 1577 msg.msg_control = ancil.buf; 1578 #ifndef S_SPLINT_S 1579 msg.msg_controllen = sizeof(ancil.buf); 1580 #endif /* S_SPLINT_S */ 1581 msg.msg_flags = 0; 1582 1583 rcv = recvmsg(c->fd, &msg, MSG_DONTWAIT); 1584 if(rcv == -1) { 1585 if(errno != EAGAIN && errno != EINTR 1586 && udp_recv_needs_log(errno)) { 1587 log_err("recvmsg failed for doq: %s", strerror(errno)); 1588 } 1589 *pkt_continue = 0; 1590 return 0; 1591 } 1592 1593 paddr->addrlen = msg.msg_namelen; 1594 sldns_buffer_skip(c->doq_socket->pkt_buf, rcv); 1595 sldns_buffer_flip(c->doq_socket->pkt_buf); 1596 if(!doq_get_localaddr_cmsg(c, paddr, pkt_continue, &msg)) 1597 return 0; 1598 pi->ecn = msghdr_get_ecn(&msg, paddr->addr.sockaddr.in.sin_family); 1599 return 1; 1600 } 1601 1602 /** send the version negotiation for doq. scid and dcid are flipped around 1603 * to send back to the client. 
*/ 1604 static void 1605 doq_send_version_negotiation(struct comm_point* c, struct doq_pkt_addr* paddr, 1606 const uint8_t* dcid, size_t dcidlen, const uint8_t* scid, 1607 size_t scidlen) 1608 { 1609 uint32_t versions[2]; 1610 size_t versions_len = 0; 1611 ngtcp2_ssize ret; 1612 uint8_t unused_random; 1613 1614 /* fill the array with supported versions */ 1615 versions[0] = NGTCP2_PROTO_VER_V1; 1616 versions_len = 1; 1617 unused_random = ub_random_max(c->doq_socket->rnd, 256); 1618 sldns_buffer_clear(c->doq_socket->pkt_buf); 1619 ret = ngtcp2_pkt_write_version_negotiation( 1620 sldns_buffer_begin(c->doq_socket->pkt_buf), 1621 sldns_buffer_capacity(c->doq_socket->pkt_buf), unused_random, 1622 dcid, dcidlen, scid, scidlen, versions, versions_len); 1623 if(ret < 0) { 1624 log_err("ngtcp2_pkt_write_version_negotiation failed: %s", 1625 ngtcp2_strerror(ret)); 1626 return; 1627 } 1628 sldns_buffer_set_position(c->doq_socket->pkt_buf, ret); 1629 sldns_buffer_flip(c->doq_socket->pkt_buf); 1630 doq_send_pkt(c, paddr, 0); 1631 } 1632 1633 /** Find the doq_conn object by remote address and dcid */ 1634 static struct doq_conn* 1635 doq_conn_find(struct doq_table* table, struct doq_addr_storage* addr, 1636 socklen_t addrlen, struct doq_addr_storage* localaddr, 1637 socklen_t localaddrlen, int ifindex, const uint8_t* dcid, 1638 size_t dcidlen) 1639 { 1640 struct rbnode_type* node; 1641 struct doq_conn key; 1642 memset(&key.node, 0, sizeof(key.node)); 1643 key.node.key = &key; 1644 memmove(&key.key.paddr.addr, addr, addrlen); 1645 key.key.paddr.addrlen = addrlen; 1646 memmove(&key.key.paddr.localaddr, localaddr, localaddrlen); 1647 key.key.paddr.localaddrlen = localaddrlen; 1648 key.key.paddr.ifindex = ifindex; 1649 key.key.dcid = (void*)dcid; 1650 key.key.dcidlen = dcidlen; 1651 node = rbtree_search(table->conn_tree, &key); 1652 if(node) 1653 return (struct doq_conn*)node->key; 1654 return NULL; 1655 } 1656 1657 /** find the doq_con by the connection id */ 1658 static struct 
doq_conn* 1659 doq_conn_find_by_id(struct doq_table* table, const uint8_t* dcid, 1660 size_t dcidlen) 1661 { 1662 struct doq_conid* conid; 1663 lock_rw_rdlock(&table->conid_lock); 1664 conid = doq_conid_find(table, dcid, dcidlen); 1665 if(conid) { 1666 /* make a copy of the key */ 1667 struct doq_conn* conn; 1668 struct doq_conn_key key = conid->key; 1669 uint8_t cid[NGTCP2_MAX_CIDLEN]; 1670 log_assert(conid->key.dcidlen <= NGTCP2_MAX_CIDLEN); 1671 memcpy(cid, conid->key.dcid, conid->key.dcidlen); 1672 key.dcid = cid; 1673 lock_rw_unlock(&table->conid_lock); 1674 1675 /* now that the conid lock is released, look up the conn */ 1676 lock_rw_rdlock(&table->lock); 1677 conn = doq_conn_find(table, &key.paddr.addr, 1678 key.paddr.addrlen, &key.paddr.localaddr, 1679 key.paddr.localaddrlen, key.paddr.ifindex, key.dcid, 1680 key.dcidlen); 1681 if(!conn) { 1682 /* The connection got deleted between the conid lookup 1683 * and the connection lock grab, it no longer exists, 1684 * so return null. */ 1685 lock_rw_unlock(&table->lock); 1686 return NULL; 1687 } 1688 lock_basic_lock(&conn->lock); 1689 if(conn->is_deleted) { 1690 lock_rw_unlock(&table->lock); 1691 lock_basic_unlock(&conn->lock); 1692 return NULL; 1693 } 1694 lock_rw_unlock(&table->lock); 1695 return conn; 1696 } 1697 lock_rw_unlock(&table->conid_lock); 1698 return NULL; 1699 } 1700 1701 /** Find the doq_conn, by addr or by connection id */ 1702 static struct doq_conn* 1703 doq_conn_find_by_addr_or_cid(struct doq_table* table, 1704 struct doq_pkt_addr* paddr, const uint8_t* dcid, size_t dcidlen) 1705 { 1706 struct doq_conn* conn; 1707 lock_rw_rdlock(&table->lock); 1708 conn = doq_conn_find(table, &paddr->addr, paddr->addrlen, 1709 &paddr->localaddr, paddr->localaddrlen, paddr->ifindex, 1710 dcid, dcidlen); 1711 if(conn && conn->is_deleted) { 1712 conn = NULL; 1713 } 1714 if(conn) { 1715 lock_basic_lock(&conn->lock); 1716 lock_rw_unlock(&table->lock); 1717 verbose(VERB_ALGO, "doq: found connection by address, 
dcid"); 1718 } else { 1719 lock_rw_unlock(&table->lock); 1720 conn = doq_conn_find_by_id(table, dcid, dcidlen); 1721 if(conn) { 1722 verbose(VERB_ALGO, "doq: found connection by dcid"); 1723 } 1724 } 1725 return conn; 1726 } 1727 1728 /** decode doq packet header, false on handled or failure, true to continue 1729 * to process the packet */ 1730 static int 1731 doq_decode_pkt_header_negotiate(struct comm_point* c, 1732 struct doq_pkt_addr* paddr, struct doq_conn** conn) 1733 { 1734 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1735 struct ngtcp2_version_cid vc; 1736 #else 1737 uint32_t version; 1738 const uint8_t *dcid, *scid; 1739 size_t dcidlen, scidlen; 1740 #endif 1741 int rv; 1742 1743 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1744 rv = ngtcp2_pkt_decode_version_cid(&vc, 1745 sldns_buffer_begin(c->doq_socket->pkt_buf), 1746 sldns_buffer_limit(c->doq_socket->pkt_buf), 1747 c->doq_socket->sv_scidlen); 1748 #else 1749 rv = ngtcp2_pkt_decode_version_cid(&version, &dcid, &dcidlen, 1750 &scid, &scidlen, sldns_buffer_begin(c->doq_socket->pkt_buf), 1751 sldns_buffer_limit(c->doq_socket->pkt_buf), c->doq_socket->sv_scidlen); 1752 #endif 1753 if(rv != 0) { 1754 if(rv == NGTCP2_ERR_VERSION_NEGOTIATION) { 1755 /* send the version negotiation */ 1756 doq_send_version_negotiation(c, paddr, 1757 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1758 vc.scid, vc.scidlen, vc.dcid, vc.dcidlen 1759 #else 1760 scid, scidlen, dcid, dcidlen 1761 #endif 1762 ); 1763 return 0; 1764 } 1765 verbose(VERB_ALGO, "doq: could not decode version " 1766 "and CID from QUIC packet header: %s", 1767 ngtcp2_strerror(rv)); 1768 return 0; 1769 } 1770 1771 if(verbosity >= VERB_ALGO) { 1772 verbose(VERB_ALGO, "ngtcp2_pkt_decode_version_cid packet has " 1773 "QUIC protocol version %u", (unsigned) 1774 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1775 vc. 
1776 #endif 1777 version 1778 ); 1779 log_hex("dcid", 1780 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1781 (void*)vc.dcid, vc.dcidlen 1782 #else 1783 (void*)dcid, dcidlen 1784 #endif 1785 ); 1786 log_hex("scid", 1787 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1788 (void*)vc.scid, vc.scidlen 1789 #else 1790 (void*)scid, scidlen 1791 #endif 1792 ); 1793 } 1794 *conn = doq_conn_find_by_addr_or_cid(c->doq_socket->table, paddr, 1795 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID 1796 vc.dcid, vc.dcidlen 1797 #else 1798 dcid, dcidlen 1799 #endif 1800 ); 1801 if(*conn) 1802 (*conn)->doq_socket = c->doq_socket; 1803 return 1; 1804 } 1805 1806 /** fill cid structure with random data */ 1807 static void doq_cid_randfill(struct ngtcp2_cid* cid, size_t datalen, 1808 struct ub_randstate* rnd) 1809 { 1810 uint8_t buf[32]; 1811 if(datalen > sizeof(buf)) 1812 datalen = sizeof(buf); 1813 doq_fill_rand(rnd, buf, datalen); 1814 ngtcp2_cid_init(cid, buf, datalen); 1815 } 1816 1817 /** send retry packet for doq connection. */ 1818 static void 1819 doq_send_retry(struct comm_point* c, struct doq_pkt_addr* paddr, 1820 struct ngtcp2_pkt_hd* hd) 1821 { 1822 char host[256], port[32]; 1823 struct ngtcp2_cid scid; 1824 uint8_t token[NGTCP2_CRYPTO_MAX_RETRY_TOKENLEN]; 1825 ngtcp2_tstamp ts; 1826 ngtcp2_ssize tokenlen, ret; 1827 1828 if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host, 1829 sizeof(host), port, sizeof(port))) { 1830 log_err("doq_send_retry failed"); 1831 return; 1832 } 1833 verbose(VERB_ALGO, "doq: sending retry packet to %s %s", host, port); 1834 1835 /* the server chosen source connection ID */ 1836 scid.datalen = c->doq_socket->sv_scidlen; 1837 doq_cid_randfill(&scid, scid.datalen, c->doq_socket->rnd); 1838 1839 ts = doq_get_timestamp_nanosec(); 1840 1841 tokenlen = ngtcp2_crypto_generate_retry_token(token, 1842 c->doq_socket->static_secret, c->doq_socket->static_secret_len, 1843 hd->version, (void*)&paddr->addr, paddr->addrlen, &scid, 1844 &hd->dcid, ts); 1845 if(tokenlen < 0) { 1846 
log_err("ngtcp2_crypto_generate_retry_token failed: %s", 1847 ngtcp2_strerror(tokenlen)); 1848 return; 1849 } 1850 1851 sldns_buffer_clear(c->doq_socket->pkt_buf); 1852 ret = ngtcp2_crypto_write_retry(sldns_buffer_begin(c->doq_socket->pkt_buf), 1853 sldns_buffer_capacity(c->doq_socket->pkt_buf), hd->version, 1854 &hd->scid, &scid, &hd->dcid, token, tokenlen); 1855 if(ret < 0) { 1856 log_err("ngtcp2_crypto_write_retry failed: %s", 1857 ngtcp2_strerror(ret)); 1858 return; 1859 } 1860 sldns_buffer_set_position(c->doq_socket->pkt_buf, ret); 1861 sldns_buffer_flip(c->doq_socket->pkt_buf); 1862 doq_send_pkt(c, paddr, 0); 1863 } 1864 1865 /** doq send stateless connection close */ 1866 static void 1867 doq_send_stateless_connection_close(struct comm_point* c, 1868 struct doq_pkt_addr* paddr, struct ngtcp2_pkt_hd* hd, 1869 uint64_t error_code) 1870 { 1871 ngtcp2_ssize ret; 1872 sldns_buffer_clear(c->doq_socket->pkt_buf); 1873 ret = ngtcp2_crypto_write_connection_close( 1874 sldns_buffer_begin(c->doq_socket->pkt_buf), 1875 sldns_buffer_capacity(c->doq_socket->pkt_buf), hd->version, &hd->scid, 1876 &hd->dcid, error_code, NULL, 0); 1877 if(ret < 0) { 1878 log_err("ngtcp2_crypto_write_connection_close failed: %s", 1879 ngtcp2_strerror(ret)); 1880 return; 1881 } 1882 sldns_buffer_set_position(c->doq_socket->pkt_buf, ret); 1883 sldns_buffer_flip(c->doq_socket->pkt_buf); 1884 doq_send_pkt(c, paddr, 0); 1885 } 1886 1887 /** doq verify retry token, false on failure */ 1888 static int 1889 doq_verify_retry_token(struct comm_point* c, struct doq_pkt_addr* paddr, 1890 struct ngtcp2_cid* ocid, struct ngtcp2_pkt_hd* hd) 1891 { 1892 char host[256], port[32]; 1893 ngtcp2_tstamp ts; 1894 if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host, 1895 sizeof(host), port, sizeof(port))) { 1896 log_err("doq_verify_retry_token failed"); 1897 return 0; 1898 } 1899 ts = doq_get_timestamp_nanosec(); 1900 verbose(VERB_ALGO, "doq: verifying retry token from %s %s", host, 1901 port); 1902 
if(ngtcp2_crypto_verify_retry_token(ocid, 1903 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 1904 hd->token, hd->tokenlen, 1905 #else 1906 hd->token.base, hd->token.len, 1907 #endif 1908 c->doq_socket->static_secret, 1909 c->doq_socket->static_secret_len, hd->version, 1910 (void*)&paddr->addr, paddr->addrlen, &hd->dcid, 1911 10*NGTCP2_SECONDS, ts) != 0) { 1912 verbose(VERB_ALGO, "doq: could not verify retry token " 1913 "from %s %s", host, port); 1914 return 0; 1915 } 1916 verbose(VERB_ALGO, "doq: verified retry token from %s %s", host, port); 1917 return 1; 1918 } 1919 1920 /** doq verify token, false on failure */ 1921 static int 1922 doq_verify_token(struct comm_point* c, struct doq_pkt_addr* paddr, 1923 struct ngtcp2_pkt_hd* hd) 1924 { 1925 char host[256], port[32]; 1926 ngtcp2_tstamp ts; 1927 if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host, 1928 sizeof(host), port, sizeof(port))) { 1929 log_err("doq_verify_token failed"); 1930 return 0; 1931 } 1932 ts = doq_get_timestamp_nanosec(); 1933 verbose(VERB_ALGO, "doq: verifying token from %s %s", host, port); 1934 if(ngtcp2_crypto_verify_regular_token( 1935 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 1936 hd->token, hd->tokenlen, 1937 #else 1938 hd->token.base, hd->token.len, 1939 #endif 1940 c->doq_socket->static_secret, c->doq_socket->static_secret_len, 1941 (void*)&paddr->addr, paddr->addrlen, 3600*NGTCP2_SECONDS, 1942 ts) != 0) { 1943 verbose(VERB_ALGO, "doq: could not verify token from %s %s", 1944 host, port); 1945 return 0; 1946 } 1947 verbose(VERB_ALGO, "doq: verified token from %s %s", host, port); 1948 return 1; 1949 } 1950 1951 /** delete and remove from the lookup tree the doq_conn connection */ 1952 static void 1953 doq_delete_connection(struct comm_point* c, struct doq_conn* conn) 1954 { 1955 struct doq_conn copy; 1956 uint8_t cid[NGTCP2_MAX_CIDLEN]; 1957 rbnode_type* node; 1958 if(!conn) 1959 return; 1960 /* Copy the key and set it deleted. 
*/ 1961 conn->is_deleted = 1; 1962 doq_conn_write_disable(conn); 1963 copy.key = conn->key; 1964 log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN); 1965 memcpy(cid, conn->key.dcid, conn->key.dcidlen); 1966 copy.key.dcid = cid; 1967 copy.node.key = © 1968 lock_basic_unlock(&conn->lock); 1969 1970 /* Now get the table lock to delete it from the tree */ 1971 lock_rw_wrlock(&c->doq_socket->table->lock); 1972 node = rbtree_delete(c->doq_socket->table->conn_tree, copy.node.key); 1973 if(node) { 1974 conn = (struct doq_conn*)node->key; 1975 lock_basic_lock(&conn->lock); 1976 doq_conn_write_list_remove(c->doq_socket->table, conn); 1977 if(conn->timer.timer_in_list) { 1978 /* Remove timer from list first, because finding the 1979 * rbnode element of the setlist of same timeouts 1980 * needs tree lookup. Edit the tree structure after 1981 * that lookup. */ 1982 doq_timer_list_remove(c->doq_socket->table, 1983 &conn->timer); 1984 } 1985 if(conn->timer.timer_in_tree) 1986 doq_timer_tree_remove(c->doq_socket->table, 1987 &conn->timer); 1988 } 1989 lock_rw_unlock(&c->doq_socket->table->lock); 1990 if(node) { 1991 lock_basic_unlock(&conn->lock); 1992 doq_table_quic_size_subtract(c->doq_socket->table, 1993 sizeof(*conn)+conn->key.dcidlen); 1994 doq_conn_delete(conn, c->doq_socket->table); 1995 } 1996 } 1997 1998 /** create and setup a new doq connection, to a new destination, or with 1999 * a new dcid. It has a new set of streams. It is inserted in the lookup tree. 2000 * Returns NULL on failure. 
*/ 2001 static struct doq_conn* 2002 doq_setup_new_conn(struct comm_point* c, struct doq_pkt_addr* paddr, 2003 struct ngtcp2_pkt_hd* hd, struct ngtcp2_cid* ocid) 2004 { 2005 struct doq_conn* conn; 2006 if(!doq_table_quic_size_available(c->doq_socket->table, 2007 c->doq_socket->cfg, sizeof(*conn)+hd->dcid.datalen 2008 + sizeof(struct doq_stream) 2009 + 100 /* estimated input query */ 2010 + 1200 /* estimated output query */)) { 2011 verbose(VERB_ALGO, "doq: no mem available for new connection"); 2012 doq_send_stateless_connection_close(c, paddr, hd, 2013 NGTCP2_CONNECTION_REFUSED); 2014 return NULL; 2015 } 2016 conn = doq_conn_create(c, paddr, hd->dcid.data, hd->dcid.datalen, 2017 hd->version); 2018 if(!conn) { 2019 log_err("doq: could not allocate doq_conn"); 2020 return NULL; 2021 } 2022 lock_rw_wrlock(&c->doq_socket->table->lock); 2023 lock_basic_lock(&conn->lock); 2024 if(!rbtree_insert(c->doq_socket->table->conn_tree, &conn->node)) { 2025 lock_rw_unlock(&c->doq_socket->table->lock); 2026 log_err("doq: duplicate connection"); 2027 /* conn has no entry in writelist, and no timer yet. */ 2028 lock_basic_unlock(&conn->lock); 2029 doq_conn_delete(conn, c->doq_socket->table); 2030 return NULL; 2031 } 2032 lock_rw_unlock(&c->doq_socket->table->lock); 2033 doq_table_quic_size_add(c->doq_socket->table, 2034 sizeof(*conn)+conn->key.dcidlen); 2035 verbose(VERB_ALGO, "doq: created new connection"); 2036 2037 /* the scid and dcid switch meaning from the accepted client 2038 * connection to the server connection. The 'source' and 'destination' 2039 * meaning is reversed. 
*/ 2040 if(!doq_conn_setup(conn, hd->scid.data, hd->scid.datalen, 2041 (ocid?ocid->data:NULL), (ocid?ocid->datalen:0), 2042 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 2043 hd->token, hd->tokenlen 2044 #else 2045 hd->token.base, hd->token.len 2046 #endif 2047 )) { 2048 log_err("doq: could not set up connection"); 2049 doq_delete_connection(c, conn); 2050 return NULL; 2051 } 2052 return conn; 2053 } 2054 2055 /** perform doq address validation */ 2056 static int 2057 doq_address_validation(struct comm_point* c, struct doq_pkt_addr* paddr, 2058 struct ngtcp2_pkt_hd* hd, struct ngtcp2_cid* ocid, 2059 struct ngtcp2_cid** pocid) 2060 { 2061 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 2062 const uint8_t* token = hd->token; 2063 size_t tokenlen = hd->tokenlen; 2064 #else 2065 const uint8_t* token = hd->token.base; 2066 size_t tokenlen = hd->token.len; 2067 #endif 2068 verbose(VERB_ALGO, "doq stateless address validation"); 2069 2070 if(tokenlen == 0 || token == NULL) { 2071 doq_send_retry(c, paddr, hd); 2072 return 0; 2073 } 2074 if(token[0] != NGTCP2_CRYPTO_TOKEN_MAGIC_RETRY && 2075 hd->dcid.datalen < NGTCP2_MIN_INITIAL_DCIDLEN) { 2076 doq_send_stateless_connection_close(c, paddr, hd, 2077 NGTCP2_INVALID_TOKEN); 2078 return 0; 2079 } 2080 if(token[0] == NGTCP2_CRYPTO_TOKEN_MAGIC_RETRY) { 2081 if(!doq_verify_retry_token(c, paddr, ocid, hd)) { 2082 doq_send_stateless_connection_close(c, paddr, hd, 2083 NGTCP2_INVALID_TOKEN); 2084 return 0; 2085 } 2086 *pocid = ocid; 2087 } else if(token[0] == NGTCP2_CRYPTO_TOKEN_MAGIC_REGULAR) { 2088 if(!doq_verify_token(c, paddr, hd)) { 2089 doq_send_retry(c, paddr, hd); 2090 return 0; 2091 } 2092 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 2093 hd->token = NULL; 2094 hd->tokenlen = 0; 2095 #else 2096 hd->token.base = NULL; 2097 hd->token.len = 0; 2098 #endif 2099 } else { 2100 verbose(VERB_ALGO, "doq address validation: unrecognised " 2101 "token in hd.token.base with magic byte 0x%2.2x", 2102 (int)token[0]); 2103 
if(c->doq_socket->validate_addr) { 2104 doq_send_retry(c, paddr, hd); 2105 return 0; 2106 } 2107 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 2108 hd->token = NULL; 2109 hd->tokenlen = 0; 2110 #else 2111 hd->token.base = NULL; 2112 hd->token.len = 0; 2113 #endif 2114 } 2115 return 1; 2116 } 2117 2118 /** the doq accept, returns false if no further processing of content */ 2119 static int 2120 doq_accept(struct comm_point* c, struct doq_pkt_addr* paddr, 2121 struct doq_conn** conn, struct ngtcp2_pkt_info* pi) 2122 { 2123 int rv; 2124 struct ngtcp2_pkt_hd hd; 2125 struct ngtcp2_cid ocid, *pocid=NULL; 2126 int err_retry; 2127 memset(&hd, 0, sizeof(hd)); 2128 rv = ngtcp2_accept(&hd, sldns_buffer_begin(c->doq_socket->pkt_buf), 2129 sldns_buffer_limit(c->doq_socket->pkt_buf)); 2130 if(rv != 0) { 2131 if(rv == NGTCP2_ERR_RETRY) { 2132 doq_send_retry(c, paddr, &hd); 2133 return 0; 2134 } 2135 log_err("doq: initial packet failed, ngtcp2_accept failed: %s", 2136 ngtcp2_strerror(rv)); 2137 return 0; 2138 } 2139 if(c->doq_socket->validate_addr || 2140 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN 2141 hd.tokenlen 2142 #else 2143 hd.token.len 2144 #endif 2145 ) { 2146 if(!doq_address_validation(c, paddr, &hd, &ocid, &pocid)) 2147 return 0; 2148 } 2149 *conn = doq_setup_new_conn(c, paddr, &hd, pocid); 2150 if(!*conn) 2151 return 0; 2152 (*conn)->doq_socket = c->doq_socket; 2153 if(!doq_conn_recv(c, paddr, *conn, pi, &err_retry, NULL)) { 2154 if(err_retry) 2155 doq_send_retry(c, paddr, &hd); 2156 doq_delete_connection(c, *conn); 2157 *conn = NULL; 2158 return 0; 2159 } 2160 return 1; 2161 } 2162 2163 /** doq pickup a timer to wait for for the worker. If any timer exists. 
*/ 2164 static void 2165 doq_pickup_timer(struct comm_point* c) 2166 { 2167 struct doq_timer* t; 2168 struct timeval tv; 2169 int have_time = 0; 2170 memset(&tv, 0, sizeof(tv)); 2171 2172 lock_rw_wrlock(&c->doq_socket->table->lock); 2173 RBTREE_FOR(t, struct doq_timer*, c->doq_socket->table->timer_tree) { 2174 if(t->worker_doq_socket == NULL || 2175 t->worker_doq_socket == c->doq_socket) { 2176 /* pick up this element */ 2177 t->worker_doq_socket = c->doq_socket; 2178 have_time = 1; 2179 memcpy(&tv, &t->time, sizeof(tv)); 2180 break; 2181 } 2182 } 2183 lock_rw_unlock(&c->doq_socket->table->lock); 2184 2185 if(have_time) { 2186 struct timeval rel; 2187 timeval_subtract(&rel, &tv, c->doq_socket->now_tv); 2188 comm_timer_set(c->doq_socket->timer, &rel); 2189 memcpy(&c->doq_socket->marked_time, &tv, 2190 sizeof(c->doq_socket->marked_time)); 2191 verbose(VERB_ALGO, "doq pickup timer at %d.%6.6d in %d.%6.6d", 2192 (int)tv.tv_sec, (int)tv.tv_usec, (int)rel.tv_sec, 2193 (int)rel.tv_usec); 2194 } else { 2195 if(comm_timer_is_set(c->doq_socket->timer)) 2196 comm_timer_disable(c->doq_socket->timer); 2197 memset(&c->doq_socket->marked_time, 0, 2198 sizeof(c->doq_socket->marked_time)); 2199 verbose(VERB_ALGO, "doq timer disabled"); 2200 } 2201 } 2202 2203 /** doq done with connection, release locks and setup timer and write */ 2204 static void 2205 doq_done_setup_timer_and_write(struct comm_point* c, struct doq_conn* conn) 2206 { 2207 struct doq_conn copy; 2208 uint8_t cid[NGTCP2_MAX_CIDLEN]; 2209 rbnode_type* node; 2210 struct timeval new_tv; 2211 int write_change = 0, timer_change = 0; 2212 2213 /* No longer in callbacks, so the pointer to doq_socket is back 2214 * to NULL. 
*/ 2215 conn->doq_socket = NULL; 2216 2217 if(doq_conn_check_timer(conn, &new_tv)) 2218 timer_change = 1; 2219 if( (conn->write_interest && !conn->on_write_list) || 2220 (!conn->write_interest && conn->on_write_list)) 2221 write_change = 1; 2222 2223 if(!timer_change && !write_change) { 2224 /* Nothing to do. */ 2225 lock_basic_unlock(&conn->lock); 2226 return; 2227 } 2228 2229 /* The table lock is needed to change the write list and timer tree. 2230 * So the connection lock is release and then the connection is 2231 * looked up again. */ 2232 copy.key = conn->key; 2233 log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN); 2234 memcpy(cid, conn->key.dcid, conn->key.dcidlen); 2235 copy.key.dcid = cid; 2236 copy.node.key = © 2237 lock_basic_unlock(&conn->lock); 2238 2239 lock_rw_wrlock(&c->doq_socket->table->lock); 2240 node = rbtree_search(c->doq_socket->table->conn_tree, copy.node.key); 2241 if(!node) { 2242 lock_rw_unlock(&c->doq_socket->table->lock); 2243 /* Must have been deleted in the mean time. */ 2244 return; 2245 } 2246 conn = (struct doq_conn*)node->key; 2247 lock_basic_lock(&conn->lock); 2248 if(conn->is_deleted) { 2249 /* It is deleted now. */ 2250 lock_rw_unlock(&c->doq_socket->table->lock); 2251 lock_basic_unlock(&conn->lock); 2252 return; 2253 } 2254 2255 if(write_change) { 2256 /* Edit the write lists, we are holding the table.lock and can 2257 * edit the list first,last and also prev,next and on_list 2258 * elements in the doq_conn structures. 
*/ 2259 doq_conn_set_write_list(c->doq_socket->table, conn); 2260 } 2261 if(timer_change) { 2262 doq_timer_set(c->doq_socket->table, &conn->timer, 2263 c->doq_socket, &new_tv); 2264 } 2265 lock_rw_unlock(&c->doq_socket->table->lock); 2266 lock_basic_unlock(&conn->lock); 2267 } 2268 2269 /** doq done with connection callbacks, release locks and setup write */ 2270 static void 2271 doq_done_with_conn_cb(struct comm_point* c, struct doq_conn* conn) 2272 { 2273 struct doq_conn copy; 2274 uint8_t cid[NGTCP2_MAX_CIDLEN]; 2275 rbnode_type* node; 2276 2277 /* no longer in callbacks, so the pointer to doq_socket is back 2278 * to NULL. */ 2279 conn->doq_socket = NULL; 2280 2281 if( (conn->write_interest && conn->on_write_list) || 2282 (!conn->write_interest && !conn->on_write_list)) { 2283 /* The connection already has the required write list 2284 * status. */ 2285 lock_basic_unlock(&conn->lock); 2286 return; 2287 } 2288 2289 /* To edit the write list of connections we have to hold the table 2290 * lock, so we release the connection and then look it up again. */ 2291 copy.key = conn->key; 2292 log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN); 2293 memcpy(cid, conn->key.dcid, conn->key.dcidlen); 2294 copy.key.dcid = cid; 2295 copy.node.key = © 2296 lock_basic_unlock(&conn->lock); 2297 2298 lock_rw_wrlock(&c->doq_socket->table->lock); 2299 node = rbtree_search(c->doq_socket->table->conn_tree, copy.node.key); 2300 if(!node) { 2301 lock_rw_unlock(&c->doq_socket->table->lock); 2302 /* must have been deleted in the mean time */ 2303 return; 2304 } 2305 conn = (struct doq_conn*)node->key; 2306 lock_basic_lock(&conn->lock); 2307 if(conn->is_deleted) { 2308 /* it is deleted now. */ 2309 lock_rw_unlock(&c->doq_socket->table->lock); 2310 lock_basic_unlock(&conn->lock); 2311 return; 2312 } 2313 2314 /* edit the write lists, we are holding the table.lock and can 2315 * edit the list first,last and also prev,next and on_list elements 2316 * in the doq_conn structures. 
*/ 2317 doq_conn_set_write_list(c->doq_socket->table, conn); 2318 lock_rw_unlock(&c->doq_socket->table->lock); 2319 lock_basic_unlock(&conn->lock); 2320 } 2321 2322 /** doq count the length of the write list */ 2323 static size_t 2324 doq_write_list_length(struct comm_point* c) 2325 { 2326 size_t count = 0; 2327 struct doq_conn* conn; 2328 lock_rw_rdlock(&c->doq_socket->table->lock); 2329 conn = c->doq_socket->table->write_list_first; 2330 while(conn) { 2331 count++; 2332 conn = conn->write_next; 2333 } 2334 lock_rw_unlock(&c->doq_socket->table->lock); 2335 return count; 2336 } 2337 2338 /** doq pop the first element from the write list to have write events */ 2339 static struct doq_conn* 2340 doq_pop_write_conn(struct comm_point* c) 2341 { 2342 struct doq_conn* conn; 2343 lock_rw_wrlock(&c->doq_socket->table->lock); 2344 conn = doq_table_pop_first(c->doq_socket->table); 2345 while(conn && conn->is_deleted) { 2346 lock_basic_unlock(&conn->lock); 2347 conn = doq_table_pop_first(c->doq_socket->table); 2348 } 2349 lock_rw_unlock(&c->doq_socket->table->lock); 2350 if(conn) 2351 conn->doq_socket = c->doq_socket; 2352 return conn; 2353 } 2354 2355 /** doq the connection is done with write callbacks, release it. 
*/ 2356 static void 2357 doq_done_with_write_cb(struct comm_point* c, struct doq_conn* conn, 2358 int delete_it) 2359 { 2360 if(delete_it) { 2361 doq_delete_connection(c, conn); 2362 return; 2363 } 2364 doq_done_setup_timer_and_write(c, conn); 2365 } 2366 2367 /** see if the doq socket wants to write packets */ 2368 static int 2369 doq_socket_want_write(struct comm_point* c) 2370 { 2371 int want_write = 0; 2372 if(c->doq_socket->have_blocked_pkt) 2373 return 1; 2374 lock_rw_rdlock(&c->doq_socket->table->lock); 2375 if(c->doq_socket->table->write_list_first) 2376 want_write = 1; 2377 lock_rw_unlock(&c->doq_socket->table->lock); 2378 return want_write; 2379 } 2380 2381 /** enable write event for the doq server socket fd */ 2382 static void 2383 doq_socket_write_enable(struct comm_point* c) 2384 { 2385 verbose(VERB_ALGO, "doq socket want write"); 2386 if(c->doq_socket->event_has_write) 2387 return; 2388 comm_point_listen_for_rw(c, 1, 1); 2389 c->doq_socket->event_has_write = 1; 2390 } 2391 2392 /** disable write event for the doq server socket fd */ 2393 static void 2394 doq_socket_write_disable(struct comm_point* c) 2395 { 2396 verbose(VERB_ALGO, "doq socket want no write"); 2397 if(!c->doq_socket->event_has_write) 2398 return; 2399 comm_point_listen_for_rw(c, 1, 0); 2400 c->doq_socket->event_has_write = 0; 2401 } 2402 2403 /** write blocked packet, if possible. returns false if failed, again. */ 2404 static int 2405 doq_write_blocked_pkt(struct comm_point* c) 2406 { 2407 struct doq_pkt_addr paddr; 2408 if(!c->doq_socket->have_blocked_pkt) 2409 return 1; 2410 c->doq_socket->have_blocked_pkt = 0; 2411 if(sldns_buffer_limit(c->doq_socket->blocked_pkt) > 2412 sldns_buffer_remaining(c->doq_socket->pkt_buf)) 2413 return 1; /* impossibly large, drop it. 2414 impossible since pkt_buf is same size as blocked_pkt buf. 
*/ 2415 sldns_buffer_clear(c->doq_socket->pkt_buf); 2416 sldns_buffer_write(c->doq_socket->pkt_buf, 2417 sldns_buffer_begin(c->doq_socket->blocked_pkt), 2418 sldns_buffer_limit(c->doq_socket->blocked_pkt)); 2419 sldns_buffer_flip(c->doq_socket->pkt_buf); 2420 memcpy(&paddr, c->doq_socket->blocked_paddr, sizeof(paddr)); 2421 doq_send_pkt(c, &paddr, c->doq_socket->blocked_pkt_pi.ecn); 2422 if(c->doq_socket->have_blocked_pkt) 2423 return 0; 2424 return 1; 2425 } 2426 2427 /** doq find a timer that timeouted and return the conn, locked. */ 2428 static struct doq_conn* 2429 doq_timer_timeout_conn(struct doq_server_socket* doq_socket) 2430 { 2431 struct doq_conn* conn = NULL; 2432 struct rbnode_type* node; 2433 lock_rw_wrlock(&doq_socket->table->lock); 2434 node = rbtree_first(doq_socket->table->timer_tree); 2435 if(node && node != RBTREE_NULL) { 2436 struct doq_timer* t = (struct doq_timer*)node; 2437 conn = t->conn; 2438 2439 /* If now < timer then no further timeouts in tree. */ 2440 if(timeval_smaller(doq_socket->now_tv, &t->time)) { 2441 lock_rw_unlock(&doq_socket->table->lock); 2442 return NULL; 2443 } 2444 2445 lock_basic_lock(&conn->lock); 2446 conn->doq_socket = doq_socket; 2447 2448 /* Now that the timer is fired, remove it. */ 2449 doq_timer_unset(doq_socket->table, t); 2450 lock_rw_unlock(&doq_socket->table->lock); 2451 return conn; 2452 } 2453 lock_rw_unlock(&doq_socket->table->lock); 2454 return NULL; 2455 } 2456 2457 /** doq timer erase the marker that said which timer the worker uses. 
*/ 2458 static void 2459 doq_timer_erase_marker(struct doq_server_socket* doq_socket) 2460 { 2461 struct doq_timer* t; 2462 lock_rw_wrlock(&doq_socket->table->lock); 2463 t = doq_timer_find_time(doq_socket->table, &doq_socket->marked_time); 2464 if(t && t->worker_doq_socket == doq_socket) 2465 t->worker_doq_socket = NULL; 2466 lock_rw_unlock(&doq_socket->table->lock); 2467 memset(&doq_socket->marked_time, 0, sizeof(doq_socket->marked_time)); 2468 } 2469 2470 void 2471 doq_timer_cb(void* arg) 2472 { 2473 struct doq_server_socket* doq_socket = (struct doq_server_socket*)arg; 2474 struct doq_conn* conn; 2475 verbose(VERB_ALGO, "doq timer callback"); 2476 2477 doq_timer_erase_marker(doq_socket); 2478 2479 while((conn = doq_timer_timeout_conn(doq_socket)) != NULL) { 2480 if(conn->is_deleted || 2481 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD 2482 ngtcp2_conn_in_closing_period(conn->conn) || 2483 #else 2484 ngtcp2_conn_is_in_closing_period(conn->conn) || 2485 #endif 2486 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD 2487 ngtcp2_conn_in_draining_period(conn->conn) 2488 #else 2489 ngtcp2_conn_is_in_draining_period(conn->conn) 2490 #endif 2491 ) { 2492 if(verbosity >= VERB_ALGO) { 2493 char remotestr[256]; 2494 addr_to_str((void*)&conn->key.paddr.addr, 2495 conn->key.paddr.addrlen, remotestr, 2496 sizeof(remotestr)); 2497 verbose(VERB_ALGO, "doq conn %s is deleted " 2498 "after timeout", remotestr); 2499 } 2500 doq_delete_connection(doq_socket->cp, conn); 2501 continue; 2502 } 2503 if(!doq_conn_handle_timeout(conn)) 2504 doq_delete_connection(doq_socket->cp, conn); 2505 else doq_done_setup_timer_and_write(doq_socket->cp, conn); 2506 } 2507 2508 if(doq_socket_want_write(doq_socket->cp)) 2509 doq_socket_write_enable(doq_socket->cp); 2510 else doq_socket_write_disable(doq_socket->cp); 2511 doq_pickup_timer(doq_socket->cp); 2512 } 2513 2514 void 2515 comm_point_doq_callback(int fd, short event, void* arg) 2516 { 2517 struct comm_point* c; 2518 struct doq_pkt_addr paddr; 2519 int i, 
pkt_continue, err_drop; 2520 struct doq_conn* conn; 2521 struct ngtcp2_pkt_info pi; 2522 size_t count, num_len; 2523 2524 c = (struct comm_point*)arg; 2525 log_assert(c->type == comm_doq); 2526 2527 log_assert(c && c->doq_socket->pkt_buf && c->fd == fd); 2528 ub_comm_base_now(c->ev->base); 2529 2530 /* see if there is a blocked packet, and send that if possible. 2531 * do not attempt to read yet, even if possible, that would just 2532 * push more answers in reply to those read packets onto the list 2533 * of written replies. First attempt to clear the write content out. 2534 * That keeps the memory usage from bloating up. */ 2535 if(c->doq_socket->have_blocked_pkt) { 2536 if(!doq_write_blocked_pkt(c)) { 2537 /* this write has also blocked, attempt to write 2538 * later. Make sure the event listens to write 2539 * events. */ 2540 if(!c->doq_socket->event_has_write) 2541 doq_socket_write_enable(c); 2542 doq_pickup_timer(c); 2543 return; 2544 } 2545 } 2546 2547 /* see if there is write interest */ 2548 count = 0; 2549 num_len = doq_write_list_length(c); 2550 while((conn = doq_pop_write_conn(c)) != NULL) { 2551 if(conn->is_deleted || 2552 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD 2553 ngtcp2_conn_in_closing_period(conn->conn) || 2554 #else 2555 ngtcp2_conn_is_in_closing_period(conn->conn) || 2556 #endif 2557 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD 2558 ngtcp2_conn_in_draining_period(conn->conn) 2559 #else 2560 ngtcp2_conn_is_in_draining_period(conn->conn) 2561 #endif 2562 ) { 2563 conn->doq_socket = NULL; 2564 lock_basic_unlock(&conn->lock); 2565 if(c->doq_socket->have_blocked_pkt) { 2566 if(!c->doq_socket->event_has_write) 2567 doq_socket_write_enable(c); 2568 doq_pickup_timer(c); 2569 return; 2570 } 2571 if(++count > num_len*2) 2572 break; 2573 continue; 2574 } 2575 if(verbosity >= VERB_ALGO) { 2576 char remotestr[256]; 2577 addr_to_str((void*)&conn->key.paddr.addr, 2578 conn->key.paddr.addrlen, remotestr, 2579 sizeof(remotestr)); 2580 verbose(VERB_ALGO, "doq 
write connection %s %d", 2581 remotestr, doq_sockaddr_get_port( 2582 &conn->key.paddr.addr)); 2583 } 2584 if(doq_conn_write_streams(c, conn, &err_drop)) 2585 err_drop = 0; 2586 doq_done_with_write_cb(c, conn, err_drop); 2587 if(c->doq_socket->have_blocked_pkt) { 2588 if(!c->doq_socket->event_has_write) 2589 doq_socket_write_enable(c); 2590 doq_pickup_timer(c); 2591 return; 2592 } 2593 /* Stop overly long write lists that are created 2594 * while we are processing. Do those next time there 2595 * is a write callback. Stops long loops, and keeps 2596 * fair for other events. */ 2597 if(++count > num_len*2) 2598 break; 2599 } 2600 2601 /* check for data to read */ 2602 if((event&UB_EV_READ)!=0) 2603 for(i=0; i<NUM_UDP_PER_SELECT; i++) { 2604 /* there may be a blocked write packet and if so, stop 2605 * reading because the reply cannot get written. The 2606 * blocked packet could be written during the conn_recv 2607 * handling of replies, or for a connection close. */ 2608 if(c->doq_socket->have_blocked_pkt) { 2609 if(!c->doq_socket->event_has_write) 2610 doq_socket_write_enable(c); 2611 doq_pickup_timer(c); 2612 return; 2613 } 2614 sldns_buffer_clear(c->doq_socket->pkt_buf); 2615 doq_pkt_addr_init(&paddr); 2616 log_assert(fd != -1); 2617 log_assert(sldns_buffer_remaining(c->doq_socket->pkt_buf) > 0); 2618 if(!doq_recv(c, &paddr, &pkt_continue, &pi)) { 2619 if(pkt_continue) 2620 continue; 2621 break; 2622 } 2623 2624 /* handle incoming packet from remote addr to localaddr */ 2625 if(verbosity >= VERB_ALGO) { 2626 char remotestr[256], localstr[256]; 2627 addr_to_str((void*)&paddr.addr, paddr.addrlen, 2628 remotestr, sizeof(remotestr)); 2629 addr_to_str((void*)&paddr.localaddr, 2630 paddr.localaddrlen, localstr, 2631 sizeof(localstr)); 2632 log_info("incoming doq packet from %s port %d on " 2633 "%s port %d ifindex %d", 2634 remotestr, doq_sockaddr_get_port(&paddr.addr), 2635 localstr, 2636 doq_sockaddr_get_port(&paddr.localaddr), 2637 paddr.ifindex); 2638 
log_info("doq_recv length %d ecn 0x%x", 2639 (int)sldns_buffer_limit(c->doq_socket->pkt_buf), 2640 (int)pi.ecn); 2641 } 2642 2643 if(sldns_buffer_limit(c->doq_socket->pkt_buf) == 0) 2644 continue; 2645 2646 conn = NULL; 2647 if(!doq_decode_pkt_header_negotiate(c, &paddr, &conn)) 2648 continue; 2649 if(!conn) { 2650 if(!doq_accept(c, &paddr, &conn, &pi)) 2651 continue; 2652 if(!doq_conn_write_streams(c, conn, NULL)) { 2653 doq_delete_connection(c, conn); 2654 continue; 2655 } 2656 doq_done_setup_timer_and_write(c, conn); 2657 continue; 2658 } 2659 if( 2660 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD 2661 ngtcp2_conn_in_closing_period(conn->conn) 2662 #else 2663 ngtcp2_conn_is_in_closing_period(conn->conn) 2664 #endif 2665 ) { 2666 if(!doq_conn_send_close(c, conn)) { 2667 doq_delete_connection(c, conn); 2668 } else { 2669 doq_done_setup_timer_and_write(c, conn); 2670 } 2671 continue; 2672 } 2673 if( 2674 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD 2675 ngtcp2_conn_in_draining_period(conn->conn) 2676 #else 2677 ngtcp2_conn_is_in_draining_period(conn->conn) 2678 #endif 2679 ) { 2680 doq_done_setup_timer_and_write(c, conn); 2681 continue; 2682 } 2683 if(!doq_conn_recv(c, &paddr, conn, &pi, NULL, &err_drop)) { 2684 /* The receive failed, and if it also failed to send 2685 * a close, drop the connection. That means it is not 2686 * in the closing period. 
*/ 2687 if(err_drop) { 2688 doq_delete_connection(c, conn); 2689 } else { 2690 doq_done_setup_timer_and_write(c, conn); 2691 } 2692 continue; 2693 } 2694 if(!doq_conn_write_streams(c, conn, &err_drop)) { 2695 if(err_drop) { 2696 doq_delete_connection(c, conn); 2697 } else { 2698 doq_done_setup_timer_and_write(c, conn); 2699 } 2700 continue; 2701 } 2702 doq_done_setup_timer_and_write(c, conn); 2703 } 2704 2705 /* see if we want to have more write events */ 2706 verbose(VERB_ALGO, "doq check write enable"); 2707 if(doq_socket_want_write(c)) 2708 doq_socket_write_enable(c); 2709 else doq_socket_write_disable(c); 2710 doq_pickup_timer(c); 2711 } 2712 2713 /** create new doq server socket structure */ 2714 static struct doq_server_socket* 2715 doq_server_socket_create(struct doq_table* table, struct ub_randstate* rnd, 2716 const void* quic_sslctx, struct comm_point* c, struct comm_base* base, 2717 struct config_file* cfg) 2718 { 2719 size_t doq_buffer_size = 4096; /* bytes buffer size, for one packet. 
*/ 2720 struct doq_server_socket* doq_socket; 2721 doq_socket = calloc(1, sizeof(*doq_socket)); 2722 if(!doq_socket) { 2723 return NULL; 2724 } 2725 doq_socket->table = table; 2726 doq_socket->rnd = rnd; 2727 doq_socket->validate_addr = 1; 2728 /* the doq_socket has its own copy of the static secret, as 2729 * well as other config values, so that they do not need table.lock */ 2730 doq_socket->static_secret_len = table->static_secret_len; 2731 doq_socket->static_secret = memdup(table->static_secret, 2732 table->static_secret_len); 2733 if(!doq_socket->static_secret) { 2734 free(doq_socket); 2735 return NULL; 2736 } 2737 doq_socket->ctx = (SSL_CTX*)quic_sslctx; 2738 doq_socket->idle_timeout = table->idle_timeout; 2739 doq_socket->sv_scidlen = table->sv_scidlen; 2740 doq_socket->cp = c; 2741 doq_socket->pkt_buf = sldns_buffer_new(doq_buffer_size); 2742 if(!doq_socket->pkt_buf) { 2743 free(doq_socket->static_secret); 2744 free(doq_socket); 2745 return NULL; 2746 } 2747 doq_socket->blocked_pkt = sldns_buffer_new( 2748 sldns_buffer_capacity(doq_socket->pkt_buf)); 2749 if(!doq_socket->pkt_buf) { 2750 free(doq_socket->static_secret); 2751 sldns_buffer_free(doq_socket->pkt_buf); 2752 free(doq_socket); 2753 return NULL; 2754 } 2755 doq_socket->blocked_paddr = calloc(1, 2756 sizeof(*doq_socket->blocked_paddr)); 2757 if(!doq_socket->blocked_paddr) { 2758 free(doq_socket->static_secret); 2759 sldns_buffer_free(doq_socket->pkt_buf); 2760 sldns_buffer_free(doq_socket->blocked_pkt); 2761 free(doq_socket); 2762 return NULL; 2763 } 2764 doq_socket->timer = comm_timer_create(base, doq_timer_cb, doq_socket); 2765 if(!doq_socket->timer) { 2766 free(doq_socket->static_secret); 2767 sldns_buffer_free(doq_socket->pkt_buf); 2768 sldns_buffer_free(doq_socket->blocked_pkt); 2769 free(doq_socket->blocked_paddr); 2770 free(doq_socket); 2771 return NULL; 2772 } 2773 memset(&doq_socket->marked_time, 0, sizeof(doq_socket->marked_time)); 2774 comm_base_timept(base, &doq_socket->now_tt, 
&doq_socket->now_tv); 2775 doq_socket->cfg = cfg; 2776 return doq_socket; 2777 } 2778 2779 /** delete doq server socket structure */ 2780 static void 2781 doq_server_socket_delete(struct doq_server_socket* doq_socket) 2782 { 2783 if(!doq_socket) 2784 return; 2785 free(doq_socket->static_secret); 2786 #ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT 2787 free(doq_socket->quic_method); 2788 #endif 2789 sldns_buffer_free(doq_socket->pkt_buf); 2790 sldns_buffer_free(doq_socket->blocked_pkt); 2791 free(doq_socket->blocked_paddr); 2792 comm_timer_delete(doq_socket->timer); 2793 free(doq_socket); 2794 } 2795 2796 /** find repinfo in the doq table */ 2797 static struct doq_conn* 2798 doq_lookup_repinfo(struct doq_table* table, struct comm_reply* repinfo) 2799 { 2800 struct doq_conn* conn; 2801 struct doq_conn_key key; 2802 doq_conn_key_from_repinfo(&key, repinfo); 2803 lock_rw_rdlock(&table->lock); 2804 conn = doq_conn_find(table, &key.paddr.addr, 2805 key.paddr.addrlen, &key.paddr.localaddr, 2806 key.paddr.localaddrlen, key.paddr.ifindex, key.dcid, 2807 key.dcidlen); 2808 if(conn) { 2809 lock_basic_lock(&conn->lock); 2810 lock_rw_unlock(&table->lock); 2811 return conn; 2812 } 2813 lock_rw_unlock(&table->lock); 2814 return NULL; 2815 } 2816 2817 /** doq find connection and stream. From inside callbacks from worker. 
*/ 2818 static int 2819 doq_lookup_conn_stream(struct comm_reply* repinfo, struct comm_point* c, 2820 struct doq_conn** conn, struct doq_stream** stream) 2821 { 2822 log_assert(c->doq_socket); 2823 if(c->doq_socket->current_conn) { 2824 *conn = c->doq_socket->current_conn; 2825 } else { 2826 *conn = doq_lookup_repinfo(c->doq_socket->table, repinfo); 2827 if((*conn) && (*conn)->is_deleted) { 2828 lock_basic_unlock(&(*conn)->lock); 2829 *conn = NULL; 2830 } 2831 if(*conn) { 2832 (*conn)->doq_socket = c->doq_socket; 2833 } 2834 } 2835 if(!*conn) { 2836 *stream = NULL; 2837 return 0; 2838 } 2839 *stream = doq_stream_find(*conn, repinfo->doq_streamid); 2840 if(!*stream) { 2841 if(!c->doq_socket->current_conn) { 2842 /* Not inside callbacks, we have our own lock on conn. 2843 * Release it. */ 2844 lock_basic_unlock(&(*conn)->lock); 2845 } 2846 return 0; 2847 } 2848 if((*stream)->is_closed) { 2849 /* stream is closed, ignore reply or drop */ 2850 if(!c->doq_socket->current_conn) { 2851 /* Not inside callbacks, we have our own lock on conn. 2852 * Release it. */ 2853 lock_basic_unlock(&(*conn)->lock); 2854 } 2855 return 0; 2856 } 2857 return 1; 2858 } 2859 2860 /** doq send a reply from a comm reply */ 2861 static void 2862 doq_socket_send_reply(struct comm_reply* repinfo) 2863 { 2864 struct doq_conn* conn; 2865 struct doq_stream* stream; 2866 log_assert(repinfo->c->type == comm_doq); 2867 if(!doq_lookup_conn_stream(repinfo, repinfo->c, &conn, &stream)) { 2868 verbose(VERB_ALGO, "doq: send_reply but %s is gone", 2869 (conn?"stream":"connection")); 2870 /* No stream, it may have been closed. */ 2871 /* Drop the reply, it cannot be sent. */ 2872 return; 2873 } 2874 if(!doq_stream_send_reply(conn, stream, repinfo->c->buffer)) 2875 doq_stream_close(conn, stream, 1); 2876 if(!repinfo->c->doq_socket->current_conn) { 2877 /* Not inside callbacks, we have our own lock on conn. 2878 * Release it. 
*/ 2879 doq_done_with_conn_cb(repinfo->c, conn); 2880 /* since we sent a reply, or closed it, the assumption is 2881 * that there is something to write, so enable write event. 2882 * It waits until the write event happens to write the 2883 * streams with answers, this allows some answers to be 2884 * answered before the event loop reaches the doq fd, in 2885 * repinfo->c->fd, and that collates answers. That would 2886 * not happen if we write doq packets right now. */ 2887 doq_socket_write_enable(repinfo->c); 2888 } 2889 } 2890 2891 /** doq drop a reply from a comm reply */ 2892 static void 2893 doq_socket_drop_reply(struct comm_reply* repinfo) 2894 { 2895 struct doq_conn* conn; 2896 struct doq_stream* stream; 2897 log_assert(repinfo->c->type == comm_doq); 2898 if(!doq_lookup_conn_stream(repinfo, repinfo->c, &conn, &stream)) { 2899 verbose(VERB_ALGO, "doq: drop_reply but %s is gone", 2900 (conn?"stream":"connection")); 2901 /* The connection or stream is already gone. */ 2902 return; 2903 } 2904 doq_stream_close(conn, stream, 1); 2905 if(!repinfo->c->doq_socket->current_conn) { 2906 /* Not inside callbacks, we have our own lock on conn. 2907 * Release it. 
*/ 2908 doq_done_with_conn_cb(repinfo->c, conn); 2909 doq_socket_write_enable(repinfo->c); 2910 } 2911 } 2912 #endif /* HAVE_NGTCP2 */ 2913 2914 int adjusted_tcp_timeout(struct comm_point* c) 2915 { 2916 if(c->tcp_timeout_msec < TCP_QUERY_TIMEOUT_MINIMUM) 2917 return TCP_QUERY_TIMEOUT_MINIMUM; 2918 return c->tcp_timeout_msec; 2919 } 2920 2921 /** Use a new tcp handler for new query fd, set to read query */ 2922 static void 2923 setup_tcp_handler(struct comm_point* c, int fd, int cur, int max) 2924 { 2925 int handler_usage; 2926 log_assert(c->type == comm_tcp || c->type == comm_http); 2927 log_assert(c->fd == -1); 2928 sldns_buffer_clear(c->buffer); 2929 #ifdef USE_DNSCRYPT 2930 if (c->dnscrypt) 2931 sldns_buffer_clear(c->dnscrypt_buffer); 2932 #endif 2933 c->tcp_is_reading = 1; 2934 c->tcp_byte_count = 0; 2935 c->tcp_keepalive = 0; 2936 /* if more than half the tcp handlers are in use, use a shorter 2937 * timeout for this TCP connection, we need to make space for 2938 * other connections to be able to get attention */ 2939 /* If > 50% TCP handler structures in use, set timeout to 1/100th 2940 * configured value. 2941 * If > 65%TCP handler structures in use, set to 1/500th configured 2942 * value. 2943 * If > 80% TCP handler structures in use, set to 0. 2944 * 2945 * If the timeout to use falls below 200 milliseconds, an actual 2946 * timeout of 200ms is used. 
*/
	handler_usage = (cur * 100) / max;
	if(handler_usage > 50 && handler_usage <= 65)
		c->tcp_timeout_msec /= 100;
	else if (handler_usage > 65 && handler_usage <= 80)
		c->tcp_timeout_msec /= 500;
	else if (handler_usage > 80)
		c->tcp_timeout_msec = 0;
	comm_point_start_listening(c, fd, adjusted_tcp_timeout(c));
}

/**
 * Timer callback that ends a "slow accept" period.
 * comm_point_perform_accept() registers it as a UB_EV_TIMEOUT event after
 * accept() failed with ENFILE/EMFILE and accepting was stopped.
 * @param fd: unused.
 * @param event: unused.
 * @param arg: the struct comm_base whose accepts were stopped.
 */
void comm_base_handle_slow_accept(int ATTR_UNUSED(fd),
	short ATTR_UNUSED(event), void* arg)
{
	struct comm_base* b = (struct comm_base*)arg;
	/* timeout for the slow accept, re-enable accepts again */
	if(b->start_accept) {
		verbose(VERB_ALGO, "wait is over, slow accept disabled");
		fptr_ok(fptr_whitelist_start_accept(b->start_accept));
		(*b->start_accept)(b->cb_arg);
		b->eb->slow_accept_enabled = 0;
	}
}

/**
 * Accept one connection on the listening socket c->fd.
 * The returned socket is nonblocking: either accept4() is called with
 * SOCK_NONBLOCK, or fd_set_nonblock() is called on the result.
 * @param c: the accepting comm point; c->fd is the listening socket.
 * @param addr: filled by accept with the address of the peer.
 * @param addrlen: set to sizeof(*addr) before the call, updated by accept.
 * @return the new file descriptor, or -1 if nothing was accepted. -1 is
 *	returned without logging for the transient errno values listed
 *	below, after starting a slow-accept pause on ENFILE/EMFILE, and
 *	when the tcp connection limit rejects the peer (in that case the
 *	accepted socket is closed again).
 */
int comm_point_perform_accept(struct comm_point* c,
	struct sockaddr_storage* addr, socklen_t* addrlen)
{
	int new_fd;
	*addrlen = (socklen_t)sizeof(*addr);
#ifndef HAVE_ACCEPT4
	new_fd = accept(c->fd, (struct sockaddr*)addr, addrlen);
#else
	/* SOCK_NONBLOCK saves extra calls to fcntl for the same result */
	new_fd = accept4(c->fd, (struct sockaddr*)addr, addrlen, SOCK_NONBLOCK);
#endif
	if(new_fd == -1) {
#ifndef USE_WINSOCK
		/* EINTR is signal interrupt. others are closed connection. */
		if( errno == EINTR || errno == EAGAIN
#ifdef EWOULDBLOCK
			|| errno == EWOULDBLOCK
#endif
#ifdef ECONNABORTED
			|| errno == ECONNABORTED
#endif
#ifdef EPROTO
			|| errno == EPROTO
#endif /* EPROTO */
			)
			return -1;
#if defined(ENFILE) && defined(EMFILE)
		if(errno == ENFILE || errno == EMFILE) {
			/* out of file descriptors, likely outside of our
			 * control. stop accept() calls for some time */
			if(c->ev->base->stop_accept) {
				struct comm_base* b = c->ev->base;
				struct timeval tv;
				verbose(VERB_ALGO, "out of file descriptors: "
					"slow accept");
				ub_comm_base_now(b);
				/* rate limit the VERB_OPS message: it is
				 * printed only if SLOW_LOG_TIME seconds passed
				 * since the previous one */
				if(b->eb->last_slow_log+SLOW_LOG_TIME <=
					b->eb->secs) {
					b->eb->last_slow_log = b->eb->secs;
					verbose(VERB_OPS, "accept failed, "
						"slow down accept for %d "
						"msec: %s",
						NETEVENT_SLOW_ACCEPT_TIME,
						sock_strerror(errno));
				}
				b->eb->slow_accept_enabled = 1;
				fptr_ok(fptr_whitelist_stop_accept(
					b->stop_accept));
				(*b->stop_accept)(b->cb_arg);
				/* set timeout, no mallocs */
				tv.tv_sec = NETEVENT_SLOW_ACCEPT_TIME/1000;
				tv.tv_usec = (NETEVENT_SLOW_ACCEPT_TIME%1000)*1000;
				/* the timer fires comm_base_handle_slow_accept,
				 * which calls start_accept again */
				b->eb->slow_accept = ub_event_new(b->eb->base,
					-1, UB_EV_TIMEOUT,
					comm_base_handle_slow_accept, b);
				if(b->eb->slow_accept == NULL) {
					/* we do not want to log here, because
					 * that would spam the logfiles.
					 * error: "event_base_set failed." */
				}
				else if(ub_event_add(b->eb->slow_accept, &tv)
					!= 0) {
					/* we do not want to log here,
					 * error: "event_add failed." */
				}
			} else {
				log_err("accept, with no slow down, "
					"failed: %s", sock_strerror(errno));
			}
			return -1;
		}
#endif
#else /* USE_WINSOCK */
		if(WSAGetLastError() == WSAEINPROGRESS ||
			WSAGetLastError() == WSAECONNRESET)
			return -1;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
			return -1;
		}
#endif
		/* any other error is logged together with the address */
		log_err_addr("accept failed", sock_strerror(errno), addr,
			*addrlen);
		return -1;
	}
	/* the connection limit is only consulted for comm_tcp_accept
	 * points that have a tcp_conn_limit configured */
	if(c->tcp_conn_limit && c->type == comm_tcp_accept) {
		c->tcl_addr = tcl_addr_lookup(c->tcp_conn_limit, addr, *addrlen);
		if(!tcl_new_connection(c->tcl_addr)) {
			if(verbosity >= 3)
				log_err_addr("accept rejected",
				"connection limit exceeded", addr, *addrlen);
			sock_close(new_fd);
			return -1;
		}
	}
#ifndef HAVE_ACCEPT4
	fd_set_nonblock(new_fd);
#endif
	return new_fd;
}

#ifdef USE_WINSOCK
/**
 * BIO callback used on Windows. After a read/gets or write/puts
 * operation returned with argl == 0 while the winsock error is
 * WSAEWOULDBLOCK, the ub_event stored as the BIO callback argument is
 * passed to ub_winsock_tcp_wouldblock() for the matching direction.
 * The prototype depends on HAVE_BIO_SET_CALLBACK_EX.
 * @return the retvalue argument, unmodified.
 */
static long win_bio_cb(BIO *b, int oper, const char* ATTR_UNUSED(argp),
#ifdef HAVE_BIO_SET_CALLBACK_EX
	size_t ATTR_UNUSED(len),
#endif
	int ATTR_UNUSED(argi), long argl,
#ifndef HAVE_BIO_SET_CALLBACK_EX
	long retvalue
#else
	int retvalue, size_t* ATTR_UNUSED(processed)
#endif
	)
{
	int wsa_err = WSAGetLastError(); /* store errcode before it is gone */
	verbose(VERB_ALGO, "bio_cb %d, %s %s %s", oper,
		(oper&BIO_CB_RETURN)?"return":"before",
		(oper&BIO_CB_READ)?"read":((oper&BIO_CB_WRITE)?"write":"other"),
		wsa_err==WSAEWOULDBLOCK?"wsawb":"");
	/* on windows, check if previous operation caused EWOULDBLOCK */
	if( (oper == (BIO_CB_READ|BIO_CB_RETURN) && argl == 0) ||
		(oper == (BIO_CB_GETS|BIO_CB_RETURN) && argl == 0)) {
		if(wsa_err == WSAEWOULDBLOCK)
			ub_winsock_tcp_wouldblock((struct ub_event*)
				BIO_get_callback_arg(b), UB_EV_READ);
	}
	if( (oper == (BIO_CB_WRITE|BIO_CB_RETURN) && argl == 0) ||
		(oper == (BIO_CB_PUTS|BIO_CB_RETURN) && argl == 0)) {
		if(wsa_err == WSAEWOULDBLOCK)
			ub_winsock_tcp_wouldblock((struct ub_event*)
				BIO_get_callback_arg(b), UB_EV_WRITE);
	}
	/* return original return value */
	return retvalue;
}

/**
 * set win bio callbacks for nonblocking operations
 * @param c: comm point; its c->ev->ev is stored as the callback argument.
 * @param thessl: the SSL object, passed as void pointer.
 */
void
comm_point_tcp_win_bio_cb(struct comm_point* c, void* thessl)
{
	SSL* ssl = (SSL*)thessl;
	/* set them both just in case, but usually they are the same BIO */
#ifdef HAVE_BIO_SET_CALLBACK_EX
	BIO_set_callback_ex(SSL_get_rbio(ssl), &win_bio_cb);
#else
	BIO_set_callback(SSL_get_rbio(ssl), &win_bio_cb);
#endif
	BIO_set_callback_arg(SSL_get_rbio(ssl), (char*)c->ev->ev);
#ifdef HAVE_BIO_SET_CALLBACK_EX
	BIO_set_callback_ex(SSL_get_wbio(ssl), &win_bio_cb);
#else
	BIO_set_callback(SSL_get_wbio(ssl), &win_bio_cb);
#endif
	BIO_set_callback_arg(SSL_get_wbio(ssl), (char*)c->ev->ev);
}
#endif

#ifdef HAVE_NGHTTP2
/**
 * Create http2 session server. Per connection, after TCP accepted.
 * Clears is_drop and creates h2_session->session with the callbacks
 * stored in h2_session, passing h2_session as the user data.
 * @return 0 if nghttp2 reports NGHTTP2_ERR_NOMEM, otherwise 1.
 */
static int http2_session_server_create(struct http2_session* h2_session)
{
	log_assert(h2_session->callbacks);
	h2_session->is_drop = 0;
	if(nghttp2_session_server_new(&h2_session->session,
			h2_session->callbacks,
		h2_session) == NGHTTP2_ERR_NOMEM) {
		log_err("failed to create nghttp2 session server");
		return 0;
	}

	return 1;
}

/** Submit http2 setting to session. Once per session.
*/ 3145 static int http2_submit_settings(struct http2_session* h2_session) 3146 { 3147 int ret; 3148 nghttp2_settings_entry settings[1] = { 3149 {NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS, 3150 h2_session->c->http2_max_streams}}; 3151 3152 ret = nghttp2_submit_settings(h2_session->session, NGHTTP2_FLAG_NONE, 3153 settings, 1); 3154 if(ret) { 3155 verbose(VERB_QUERY, "http2: submit_settings failed, " 3156 "error: %s", nghttp2_strerror(ret)); 3157 return 0; 3158 } 3159 return 1; 3160 } 3161 #endif /* HAVE_NGHTTP2 */ 3162 3163 #ifdef HAVE_NGHTTP2 3164 /** Delete http2 stream. After session delete or stream close callback */ 3165 static void http2_stream_delete(struct http2_session* h2_session, 3166 struct http2_stream* h2_stream) 3167 { 3168 if(h2_stream->mesh_state) { 3169 mesh_state_remove_reply(h2_stream->mesh, h2_stream->mesh_state, 3170 h2_session->c); 3171 h2_stream->mesh_state = NULL; 3172 } 3173 http2_req_stream_clear(h2_stream); 3174 free(h2_stream); 3175 } 3176 #endif /* HAVE_NGHTTP2 */ 3177 3178 /** delete http2 session server. After closing connection. 
*/
/* Without HAVE_NGHTTP2 the body is empty. With it: the nghttp2 session is
 * deleted, every stream on the first_stream list is deleted, and the
 * drop flags and the comm point's h2_stream pointer are reset. */
static void http2_session_server_delete(struct http2_session* h2_session)
{
#ifdef HAVE_NGHTTP2
	struct http2_stream* h2_stream, *next;
	nghttp2_session_del(h2_session->session); /* NULL input is fine */
	h2_session->session = NULL;
	/* fetch next before the delete; the stream is freed by it */
	for(h2_stream = h2_session->first_stream; h2_stream;) {
		next = h2_stream->next;
		http2_stream_delete(h2_session, h2_stream);
		h2_stream = next;
	}
	h2_session->first_stream = NULL;
	h2_session->is_drop = 0;
	h2_session->postpone_drop = 0;
	h2_session->c->h2_stream = NULL;
#endif
	(void)h2_session;
}

/**
 * Event callback for a comm_tcp_accept comm point.
 * Takes the first handler from the c->tcp_free list, gives it a new
 * event (http or tcp handle callback, depending on the handler type),
 * accepts the connection, optionally wraps it in SSL, and hands the new
 * socket to setup_tcp_handler(). The handler is only removed from the
 * free list after accept (and SSL setup) succeeded.
 * @param fd: the listening socket; only asserted, the accept uses c->fd.
 * @param event: event flags; anything without UB_EV_READ is ignored.
 * @param arg: the accepting struct comm_point.
 */
void
comm_point_tcp_accept_callback(int fd, short event, void* arg)
{
	struct comm_point* c = (struct comm_point*)arg, *c_hdl;
	int new_fd;
	log_assert(c->type == comm_tcp_accept);
	if(!(event & UB_EV_READ)) {
		log_info("ignoring tcp accept event %d", (int)event);
		return;
	}
	ub_comm_base_now(c->ev->base);
	/* find free tcp handler. */
	if(!c->tcp_free) {
		log_warn("accepted too many tcp, connections full");
		return;
	}
	/* accept incoming connection. */
	c_hdl = c->tcp_free;
	/* clear leftover flags from previous use, and then set the
	 * correct event base for the event structure for libevent */
	ub_event_free(c_hdl->ev->ev);
	c_hdl->ev->ev = NULL;
	if((c_hdl->type == comm_tcp && c_hdl->tcp_req_info) ||
		c_hdl->type == comm_local || c_hdl->type == comm_raw)
		c_hdl->tcp_do_toggle_rw = 0;
	else	c_hdl->tcp_do_toggle_rw = 1;

	if(c_hdl->type == comm_http) {
#ifdef HAVE_NGHTTP2
		/* on the two early returns below nothing is accepted and
		 * c_hdl->ev->ev is still NULL */
		if(!c_hdl->h2_session ||
			!http2_session_server_create(c_hdl->h2_session)) {
			log_warn("failed to create nghttp2");
			return;
		}
		if(!c_hdl->h2_session ||
			!http2_submit_settings(c_hdl->h2_session)) {
			log_warn("failed to submit http2 settings");
			if(c_hdl->h2_session)
				http2_session_server_delete(c_hdl->h2_session);
			return;
		}
		/* without SSL on the accept point, http2 is used right
		 * away; with SSL, use_h2 is set in ssl_handshake() once
		 * "h2" was selected with ALPN */
		if(!c->ssl) {
			c_hdl->tcp_do_toggle_rw = 0;
			c_hdl->use_h2 = 1;
		}
#endif
		c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1,
			UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT,
			comm_point_http_handle_callback, c_hdl);
	} else {
		c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1,
			UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT,
			comm_point_tcp_handle_callback, c_hdl);
	}
	if(!c_hdl->ev->ev) {
		log_warn("could not ub_event_new, dropped tcp");
#ifdef HAVE_NGHTTP2
		if(c_hdl->type == comm_http && c_hdl->h2_session)
			http2_session_server_delete(c_hdl->h2_session);
#endif
		return;
	}
	log_assert(fd != -1);
	(void)fd;
	/* the peer address is stored directly in the handler's repinfo */
	new_fd = comm_point_perform_accept(c, &c_hdl->repinfo.remote_addr,
		&c_hdl->repinfo.remote_addrlen);
	if(new_fd == -1) {
#ifdef HAVE_NGHTTP2
		if(c_hdl->type == comm_http && c_hdl->h2_session)
			http2_session_server_delete(c_hdl->h2_session);
#endif
		return;
	}
	/* Copy remote_address to client_address.
	 * Simplest way/time for streams to do that. */
	c_hdl->repinfo.client_addrlen = c_hdl->repinfo.remote_addrlen;
	memmove(&c_hdl->repinfo.client_addr,
		&c_hdl->repinfo.remote_addr,
		c_hdl->repinfo.remote_addrlen);
	if(c->ssl) {
		c_hdl->ssl = incoming_ssl_fd(c->ssl, new_fd);
		if(!c_hdl->ssl) {
			/* store the fd in the handler so that
			 * comm_point_close operates on the new socket */
			c_hdl->fd = new_fd;
			comm_point_close(c_hdl);
			return;
		}
		c_hdl->ssl_shake_state = comm_ssl_shake_read;
#ifdef USE_WINSOCK
		comm_point_tcp_win_bio_cb(c_hdl, c_hdl->ssl);
#endif
	}

	/* grab the tcp handler buffers */
	c->cur_tcp_count++;
	c->tcp_free = c_hdl->tcp_free;
	c_hdl->tcp_free = NULL;
	if(!c->tcp_free) {
		/* stop accepting incoming queries for now. */
		comm_point_stop_listening(c);
	}
	setup_tcp_handler(c_hdl, new_fd, c->cur_tcp_count, c->max_tcp_count);
}

/**
 * Make tcp handler free for next assignment.
 * Shuts down and frees the SSL object (with HAVE_SSL), closes the comm
 * point, pushes it on the tcp_free list of its tcp_parent unless it is
 * already the head of that list, and resets the per-connection read and
 * PROXYv2 state.
 */
static void
reclaim_tcp_handler(struct comm_point* c)
{
	log_assert(c->type == comm_tcp);
	if(c->ssl) {
#ifdef HAVE_SSL
		SSL_shutdown(c->ssl);
		SSL_free(c->ssl);
		c->ssl = NULL;
#endif
	}
	comm_point_close(c);
	if(c->tcp_parent) {
		if(c != c->tcp_parent->tcp_free) {
			c->tcp_parent->cur_tcp_count--;
			c->tcp_free = c->tcp_parent->tcp_free;
			c->tcp_parent->tcp_free = c;
		}
		/* c->tcp_free is NULL when the free list held no other
		 * handler; the accept callback stops listening when its
		 * free list becomes empty */
		if(!c->tcp_free) {
			/* re-enable listening on accept socket */
			comm_point_start_listening(c->tcp_parent, -1, -1);
		}
	}
	c->tcp_more_read_again = NULL;
	c->tcp_more_write_again = NULL;
	c->tcp_byte_count = 0;
	c->pp2_header_state = pp2_header_none;
	sldns_buffer_clear(c->buffer);
}

/**
 * do the callback when writing is done.
 * With tcp_req_info set, the write-done handling is delegated to it.
 * Otherwise listening is stopped and restarted with the adjusted tcp
 * timeout; in tcp_write_and_read mode the restart only happens if the
 * user callback (called with NETEVENT_PKT_WRITTEN) returns nonzero.
 */
static void
tcp_callback_writer(struct comm_point* c)
{
	log_assert(c->type == comm_tcp);
	/* in write_and_read mode the buffer and tcp_byte_count are left
	 * alone here */
	if(!c->tcp_write_and_read) {
		sldns_buffer_clear(c->buffer);
		c->tcp_byte_count = 0;
	}
	if(c->tcp_do_toggle_rw)
		c->tcp_is_reading = 1;
	/* switch from listening(write) to listening(read) */
	if(c->tcp_req_info) {
		tcp_req_info_handle_writedone(c->tcp_req_info);
	} else {
		comm_point_stop_listening(c);
		if(c->tcp_write_and_read) {
			fptr_ok(fptr_whitelist_comm_point(c->callback));
			if( (*c->callback)(c, c->cb_arg, NETEVENT_PKT_WRITTEN,
				&c->repinfo) ) {
				comm_point_start_listening(c, -1,
					adjusted_tcp_timeout(c));
			}
		} else {
			comm_point_start_listening(c, -1,
				adjusted_tcp_timeout(c));
		}
	}
}

/**
 * do the callback when reading is done.
 * Flips the buffer and resets tcp_byte_count. With tcp_req_info set the
 * read-done handling is delegated to it. Otherwise the user callback is
 * called with NETEVENT_NOERROR and, if it returns nonzero, listening is
 * started again with the adjusted tcp timeout.
 */
static void
tcp_callback_reader(struct comm_point* c)
{
	log_assert(c->type == comm_tcp || c->type == comm_local);
	sldns_buffer_flip(c->buffer);
	if(c->tcp_do_toggle_rw)
		c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	if(c->tcp_req_info) {
		tcp_req_info_handle_readdone(c->tcp_req_info);
	} else {
		/* listening is only stopped for comm_tcp, not comm_local */
		if(c->type == comm_tcp)
			comm_point_stop_listening(c);
		fptr_ok(fptr_whitelist_comm_point(c->callback));
		if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
			comm_point_start_listening(c, -1,
				adjusted_tcp_timeout(c));
		}
	}
}

#ifdef HAVE_SSL
/**
 * true if the ssl handshake error has to be squelched from the logs
 * @param err: error code, as returned by ERR_get_error().
 * @return 1 if verbosity is below VERB_QUERY and the error is one of the
 *	SSL library reasons listed below, otherwise 0.
 */
int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= VERB_QUERY)
		return 0; /* only squelch on low verbosity */
	if(ERR_GET_LIB(err) == ERR_LIB_SSL &&
		(ERR_GET_REASON(err) == SSL_R_HTTPS_PROXY_REQUEST ||
		 ERR_GET_REASON(err) == SSL_R_HTTP_REQUEST ||
		 ERR_GET_REASON(err) == SSL_R_WRONG_VERSION_NUMBER ||
		 ERR_GET_REASON(err) == SSL_R_SSLV3_ALERT_BAD_CERTIFICATE
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		 || ERR_GET_REASON(err) == SSL_R_NO_SHARED_CIPHER
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		 || ERR_GET_REASON(err) == SSL_R_UNKNOWN_PROTOCOL
		 || ERR_GET_REASON(err) == SSL_R_UNSUPPORTED_PROTOCOL
#  ifdef SSL_R_VERSION_TOO_LOW
		 || ERR_GET_REASON(err) == SSL_R_VERSION_TOO_LOW
#  endif
#endif
		))
		return 1;
	return 0;
}
#endif /* HAVE_SSL */

/**
 * continue ssl handshake
 * Also handles the comm_ssl_shake_hs_read and comm_ssl_shake_hs_write
 * states that ssl_handle_write() and ssl_handle_read() set when SSL_write
 * wants to read or SSL_read wants to write: the event direction is
 * switched back and the state is cleared.
 * @param c: comm point with c->ssl set.
 * @return 0 if the connection failed or was closed; 1 otherwise. After a
 *	return of 1 the handshake is only complete if c->ssl_shake_state is
 *	comm_ssl_shake_none.
 */
#ifdef HAVE_SSL
static int
ssl_handshake(struct comm_point* c)
{
	int r;
	if(c->ssl_shake_state == comm_ssl_shake_hs_read) {
		/* read condition satisfied back to writing */
		comm_point_listen_for_rw(c, 0, 1);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}
	if(c->ssl_shake_state == comm_ssl_shake_hs_write) {
		/* write condition satisfied, back to reading */
		comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}

	/* clear the error queue so that ERR_get_error below reports an
	 * error from this handshake call */
	ERR_clear_error();
	r = SSL_do_handshake(c->ssl);
	if(r != 1) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_WANT_READ) {
			/* already waiting for read: no need to change the
			 * event registration */
			if(c->ssl_shake_state == comm_ssl_shake_read)
				return 1;
			c->ssl_shake_state = comm_ssl_shake_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(c->ssl_shake_state == comm_ssl_shake_write)
				return 1;
			c->ssl_shake_state = comm_ssl_shake_write;
			comm_point_listen_for_rw(c, 0, 1);
			return 1;
		} else if(r == 0) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_SYSCALL) {
			/* SYSCALL and errno==0 means closed uncleanly */
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
#ifdef ECONNRESET
			if(errno == ECONNRESET && verbosity < 2)
				return 0; /* silence reset by peer */
#endif
			if(!tcp_connect_errno_needs_log(
				(struct sockaddr*)&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen))
				return 0; /* silence connect failures that
				show up because after connect this is the
				first system call that accesses the socket */
			if(errno != 0)
				log_err("SSL_handshake syscall: %s",
					strerror(errno));
			return 0;
		} else {
			unsigned long err = ERR_get_error();
			if(!squelch_err_ssl_handshake(err)) {
				long vr;
				log_crypto_err_io_code("ssl handshake failed",
					want, err);
				if((vr=SSL_get_verify_result(c->ssl)) != 0)
					log_err("ssl handshake cert error: %s",
						X509_verify_cert_error_string(
						vr));
				log_addr(VERB_OPS, "ssl handshake failed",
					&c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
			}
			return 0;
		}
	}
	/* this is where peer verification could take place */
	if((SSL_get_verify_mode(c->ssl)&SSL_VERIFY_PEER)) {
		/* verification */
		if(SSL_get_verify_result(c->ssl) == X509_V_OK) {
#ifdef HAVE_SSL_GET1_PEER_CERTIFICATE
			X509* x = SSL_get1_peer_certificate(c->ssl);
#else
			X509* x = SSL_get_peer_certificate(c->ssl);
#endif
			/* a verify result of X509_V_OK without a peer
			 * certificate is treated as a failure */
			if(!x) {
				log_addr(VERB_ALGO, "SSL connection failed: "
					"no certificate",
					&c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
				return 0;
			}
			log_cert(VERB_ALGO, "peer certificate", x);
#ifdef HAVE_SSL_GET0_PEERNAME
			if(SSL_get0_peername(c->ssl)) {
				char buf[255];
				snprintf(buf, sizeof(buf), "SSL connection "
					"to %s authenticated",
					SSL_get0_peername(c->ssl));
				log_addr(VERB_ALGO, buf, &c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
			} else {
#endif
				log_addr(VERB_ALGO, "SSL connection "
					"authenticated", &c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
#ifdef HAVE_SSL_GET0_PEERNAME
			}
#endif
			X509_free(x);
		} else {
#ifdef HAVE_SSL_GET1_PEER_CERTIFICATE
			X509* x = SSL_get1_peer_certificate(c->ssl);
#else
			X509* x = SSL_get_peer_certificate(c->ssl);
#endif
			if(x) {
				log_cert(VERB_ALGO, "peer certificate", x);
				X509_free(x);
			}
			log_addr(VERB_ALGO, "SSL connection failed: "
				"failed to authenticate",
				&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
			return 0;
		}
	} else {
		/* unauthenticated, the verify peer flag was not set
		 * in c->ssl when the ssl object was created from ssl_ctx */
		log_addr(VERB_ALGO, "SSL connection", &c->repinfo.remote_addr,
			c->repinfo.remote_addrlen);
	}

#ifdef HAVE_SSL_GET0_ALPN_SELECTED
	/* check if http2 use is negotiated */
	if(c->type == comm_http && c->h2_session) {
		const unsigned char *alpn;
		unsigned int alpnlen = 0;
		SSL_get0_alpn_selected(c->ssl, &alpn, &alpnlen);
		/* alpn is not NUL terminated, so compare by length */
		if(alpnlen == 2 && memcmp("h2", alpn, 2) == 0) {
			/* connection upgraded to HTTP2 */
			c->tcp_do_toggle_rw = 0;
			c->use_h2 = 1;
		} else {
			verbose(VERB_ALGO, "client doesn't support HTTP/2");
			return 0;
		}
	}
#endif

	/* setup listen rw correctly */
	if(c->tcp_is_reading) {
		if(c->ssl_shake_state != comm_ssl_shake_read)
			comm_point_listen_for_rw(c, 1, 0);
	} else {
		comm_point_listen_for_rw(c, 0, 1);
	}
	c->ssl_shake_state = comm_ssl_shake_none;
	return 1;
}
#endif /* HAVE_SSL */

/**
 * ssl read callback on TCP
 * Reads, in this order and possibly spread over several calls: the
 * PROXYv2 header (only if c->pp2_enabled), the two byte length prefix,
 * and the DNS message of that length. c->tcp_byte_count keeps the number
 * of bytes read so far of the header or of the length prefix. Once the
 * message is complete tcp_callback_reader() is called.
 * @param c: comm point with c->ssl set.
 * @return 0 if the connection has to be closed (error, shutdown or bad
 *	data), 1 otherwise. Without HAVE_SSL always 0.
 */
static int
ssl_handle_read(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake still in progress, wait for the next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	if(c->pp2_enabled && c->pp2_header_state != pp2_header_done) {
		struct pp2_header* header = NULL;
		size_t want_read_size = 0;
		size_t current_read_size = 0;
		/* state pp2_header_none: read the fixed size part */
		if(c->pp2_header_state == pp2_header_none) {
			want_read_size = PP2_HEADER_SIZE;
			if(sldns_buffer_remaining(c->buffer)<want_read_size) {
				log_err_addr("proxy_protocol: not enough "
					"buffer size to read PROXYv2 header", "",
					&c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
				return 0;
			}
			verbose(VERB_ALGO, "proxy_protocol: reading fixed "
				"part of PROXYv2 header (len %lu)",
				(unsigned long)want_read_size);
			current_read_size = want_read_size;
			if(c->tcp_byte_count < current_read_size) {
				ERR_clear_error();
				if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(
					c->buffer, c->tcp_byte_count),
					current_read_size -
					c->tcp_byte_count)) <= 0) {
					int want = SSL_get_error(c->ssl, r);
					if(want == SSL_ERROR_ZERO_RETURN) {
						if(c->tcp_req_info)
							return tcp_req_info_handle_read_close(c->tcp_req_info);
						return 0; /* shutdown, closed */
					} else if(want == SSL_ERROR_WANT_READ) {
#ifdef USE_WINSOCK
						ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
#endif
						return 1; /* read more later */
					} else if(want == SSL_ERROR_WANT_WRITE) {
						/* wait for writability; ssl_handshake
						 * switches back to reading */
						c->ssl_shake_state = comm_ssl_shake_hs_write;
						comm_point_listen_for_rw(c, 0, 1);
						return 1;
					} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
						if(errno == ECONNRESET && verbosity < 2)
							return 0; /* silence reset by peer */
#endif
						if(errno != 0)
							log_err("SSL_read syscall: %s",
								strerror(errno));
						return 0;
					}
					log_crypto_err_io("could not SSL_read",
						want);
					return 0;
				}
				c->tcp_byte_count += r;
				sldns_buffer_skip(c->buffer, r);
				/* partial read, continue on the next event */
				if(c->tcp_byte_count != current_read_size) return 1;
				c->pp2_header_state = pp2_header_init;
			}
		}
		/* state pp2_header_init: the fixed part is in the buffer,
		 * parse it and read the variable part of header->len bytes */
		if(c->pp2_header_state == pp2_header_init) {
			int err;
			err = pp2_read_header(
				sldns_buffer_begin(c->buffer),
				sldns_buffer_limit(c->buffer));
			if(err) {
				log_err("proxy_protocol: could not parse "
					"PROXYv2 header (%s)",
					pp_lookup_error(err));
				return 0;
			}
			header = (struct pp2_header*)sldns_buffer_begin(c->buffer);
			want_read_size = ntohs(header->len);
			if(sldns_buffer_limit(c->buffer) <
				PP2_HEADER_SIZE + want_read_size) {
				log_err_addr("proxy_protocol: not enough "
					"buffer size to read PROXYv2 header", "",
					&c->repinfo.remote_addr,
					c->repinfo.remote_addrlen);
				return 0;
			}
			verbose(VERB_ALGO, "proxy_protocol: reading variable "
				"part of PROXYv2 header (len %lu)",
				(unsigned long)want_read_size);
			current_read_size = PP2_HEADER_SIZE + want_read_size;
			if(want_read_size == 0) {
				/* nothing more to read; header is complete */
				c->pp2_header_state = pp2_header_done;
			} else if(c->tcp_byte_count < current_read_size) {
				ERR_clear_error();
				if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(
					c->buffer, c->tcp_byte_count),
					current_read_size -
					c->tcp_byte_count)) <= 0) {
					int want = SSL_get_error(c->ssl, r);
					if(want == SSL_ERROR_ZERO_RETURN) {
						if(c->tcp_req_info)
							return tcp_req_info_handle_read_close(c->tcp_req_info);
						return 0; /* shutdown, closed */
					} else if(want == SSL_ERROR_WANT_READ) {
#ifdef USE_WINSOCK
						ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
#endif
						return 1; /* read more later */
					} else if(want == SSL_ERROR_WANT_WRITE) {
						c->ssl_shake_state = comm_ssl_shake_hs_write;
						comm_point_listen_for_rw(c, 0, 1);
						return 1;
					} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
						if(errno == ECONNRESET && verbosity < 2)
							return 0; /* silence reset by peer */
#endif
						if(errno != 0)
							log_err("SSL_read syscall: %s",
								strerror(errno));
						return 0;
					}
					log_crypto_err_io("could not SSL_read",
						want);
					return 0;
				}
				c->tcp_byte_count += r;
				sldns_buffer_skip(c->buffer, r);
				if(c->tcp_byte_count != current_read_size) return 1;
				c->pp2_header_state = pp2_header_done;
			}
		}
		/* header is only set in the pp2_header_init branch above */
		if(c->pp2_header_state != pp2_header_done || !header) {
			log_err_addr("proxy_protocol: wrong state for the "
				"PROXYv2 header", "", &c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
			return 0;
		}
		sldns_buffer_flip(c->buffer);
		if(!consume_pp2_header(c->buffer, &c->repinfo, 1)) {
			log_err_addr("proxy_protocol: could not consume "
				"PROXYv2 header", "", &c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
			return 0;
		}
		verbose(VERB_ALGO, "proxy_protocol: successful read of "
			"PROXYv2 header");
		/* Clear and reset the buffer to read the following
		 * DNS packet(s). */
		sldns_buffer_clear(c->buffer);
		c->tcp_byte_count = 0;
		return 1;
	}
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* read length bytes */
		ERR_clear_error();
		if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(c->buffer,
			c->tcp_byte_count), (int)(sizeof(uint16_t) -
			c->tcp_byte_count))) <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				if(c->tcp_req_info)
					return tcp_req_info_handle_read_close(c->tcp_req_info);
				return 0; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
#ifdef USE_WINSOCK
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
#endif
				return 1; /* read more later */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
				if(errno == ECONNRESET && verbosity < 2)
					return 0; /* silence reset by peer */
#endif
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err_io("could not SSL_read", want);
			return 0;
		}
		c->tcp_byte_count += r;
		if(c->tcp_byte_count < sizeof(uint16_t))
			return 1;
		/* the length prefix sits at the start of the buffer; it
		 * becomes the buffer limit for the message that follows */
		if(sldns_buffer_read_u16_at(c->buffer, 0) >
			sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "ssl: dropped larger than buffer");
			return 0;
		}
		sldns_buffer_set_limit(c->buffer,
			sldns_buffer_read_u16_at(c->buffer, 0));
		if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "ssl: dropped bogus too short.");
			return 0;
		}
		sldns_buffer_skip(c->buffer, (ssize_t)(c->tcp_byte_count-sizeof(uint16_t)));
		verbose(VERB_ALGO, "Reading ssl tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}
	if(sldns_buffer_remaining(c->buffer) > 0) {
		ERR_clear_error();
		r = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
			(int)sldns_buffer_remaining(c->buffer));
		if(r <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				if(c->tcp_req_info)
					return tcp_req_info_handle_read_close(c->tcp_req_info);
				return 0; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
#ifdef USE_WINSOCK
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
#endif
				return 1; /* read more later */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
				if(errno == ECONNRESET && verbosity < 2)
					return 0; /* silence reset by peer */
#endif
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err_io("could not SSL_read", want);
			return 0;
		}
		sldns_buffer_skip(c->buffer, (ssize_t)r);
	}
	/* message complete, hand it to the reader callback */
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		tcp_callback_reader(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}

/**
 * ssl write callback on TCP
 * Writes the two byte length prefix followed by the message. The data
 * comes from c->buffer (progress in c->tcp_byte_count) or, in
 * tcp_write_and_read mode, from c->tcp_write_pkt (progress in
 * c->tcp_write_byte_count). Both counters include the two prefix bytes.
 * When everything is written tcp_callback_writer() is called.
 * @param c: comm point with c->ssl set.
 * @return 0 if the connection has to be closed, 1 otherwise. Without
 *	HAVE_SSL always 0.
 */
static int
ssl_handle_write(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	/* ignore return, if fails we may simply block */
	(void)SSL_set_mode(c->ssl, (long)SSL_MODE_ENABLE_PARTIAL_WRITE);
	if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) {
		/* the length prefix has not been written completely */
		uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(c->buffer));
		ERR_clear_error();
		if(c->tcp_write_and_read) {
			if(c->tcp_write_pkt_len + 2 < LDNS_RR_BUF_SIZE) {
				/* combine the tcp length and the query for
				 * write, this emulates writev */
				uint8_t buf[LDNS_RR_BUF_SIZE];
				memmove(buf, &len, sizeof(uint16_t));
				memmove(buf+sizeof(uint16_t),
					c->tcp_write_pkt,
					c->tcp_write_pkt_len);
				r = SSL_write(c->ssl,
					(void*)(buf+c->tcp_write_byte_count),
					c->tcp_write_pkt_len + 2 -
					c->tcp_write_byte_count);
			} else {
				/* too large for the stack buffer: write
				 * only the rest of the length prefix */
				r = SSL_write(c->ssl,
					(void*)(((uint8_t*)&len)+c->tcp_write_byte_count),
					(int)(sizeof(uint16_t)-c->tcp_write_byte_count));
			}
		} else if(sizeof(uint16_t)+sldns_buffer_remaining(c->buffer) <
			LDNS_RR_BUF_SIZE) {
			/* combine the tcp length and the query for write,
			 * this emulates writev */
			uint8_t buf[LDNS_RR_BUF_SIZE];
			memmove(buf, &len, sizeof(uint16_t));
			memmove(buf+sizeof(uint16_t),
				sldns_buffer_current(c->buffer),
				sldns_buffer_remaining(c->buffer));
			r = SSL_write(c->ssl, (void*)(buf+c->tcp_byte_count),
				(int)(sizeof(uint16_t)+
				sldns_buffer_remaining(c->buffer)
				- c->tcp_byte_count));
		} else {
			r = SSL_write(c->ssl,
				(void*)(((uint8_t*)&len)+c->tcp_byte_count),
				(int)(sizeof(uint16_t)-c->tcp_byte_count));
		}
		if(r <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				return 0; /* closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wait for readability; ssl_handshake
				 * switches back to writing */
				c->ssl_shake_state = comm_ssl_shake_hs_read;
				comm_point_listen_for_rw(c, 1, 0);
				return 1; /* wait for read condition */
			} else if(want == SSL_ERROR_WANT_WRITE) {
#ifdef USE_WINSOCK
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
#endif
				return 1; /* write more later */
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
				if(errno == EPIPE && verbosity < 2)
					return 0; /* silence 'broken pipe' */
#endif
				if(errno != 0)
					log_err("SSL_write syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err_io("could not SSL_write", want);
			return 0;
		}
		if(c->tcp_write_and_read) {
			c->tcp_write_byte_count += r;
			if(c->tcp_write_byte_count < sizeof(uint16_t))
				return 1;
		} else {
			c->tcp_byte_count += r;
			if(c->tcp_byte_count < sizeof(uint16_t))
				return 1;
			/* the counter includes the prefix, the buffer
			 * position does not */
			sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
				sizeof(uint16_t));
		}
		if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
			tcp_callback_writer(c);
			return 1;
		}
	}
	/* the length prefix is out; write the rest of the message */
	log_assert(c->tcp_write_and_read || sldns_buffer_remaining(c->buffer) > 0);
	log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2);
	ERR_clear_error();
	if(c->tcp_write_and_read) {
		/* minus 2 converts the counter into an offset in the pkt */
		r = SSL_write(c->ssl, (void*)(c->tcp_write_pkt + c->tcp_write_byte_count - 2),
			(int)(c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count));
	} else {
		r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
			(int)sldns_buffer_remaining(c->buffer));
	}
	if(r <= 0) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_ZERO_RETURN) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			c->ssl_shake_state = comm_ssl_shake_hs_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1; /* wait for read condition */
		} else if(want == SSL_ERROR_WANT_WRITE) {
#ifdef USE_WINSOCK
			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
#endif
			return 1; /* write more later */
		} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
			if(errno != 0)
				log_err("SSL_write syscall: %s",
					strerror(errno));
			return 0;
		}
		log_crypto_err_io("could not SSL_write", want);
		return 0;
	}
	if(c->tcp_write_and_read) {
		c->tcp_write_byte_count += r;
	} else {
		sldns_buffer_skip(c->buffer, (ssize_t)r);
	}

	if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
		tcp_callback_writer(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}

/**
 * handle ssl tcp connection with dns contents
 * @param c: comm point with c->ssl set.
 * @param is_write: nonzero for a write event, zero for a read event.
 * @return the result of ssl_handle_read() or ssl_handle_write().
 */
static int
ssl_handle_it(struct comm_point* c, int is_write)
{
	/* handle case where renegotiation wants read during write call
	 * or write during read calls */
	if(is_write && c->ssl_shake_state == comm_ssl_shake_hs_write)
		return ssl_handle_read(c);
	else if(!is_write && c->ssl_shake_state == comm_ssl_shake_hs_read)
		return ssl_handle_write(c);
	/* handle read events for read operation and write events for a
	 * write operation */
	else if(!is_write)
		return ssl_handle_read(c);
	return ssl_handle_write(c);
}

/**
 * Handle tcp reading callback.
 * @param fd: file descriptor of socket.
 * @param c: comm point to read from into buffer.
 * @param short_ok: if true, very short packets are OK (for comm_local).
3994 * @return: 0 on error 3995 */ 3996 static int 3997 comm_point_tcp_handle_read(int fd, struct comm_point* c, int short_ok) 3998 { 3999 ssize_t r; 4000 int recv_initial = 0; 4001 log_assert(c->type == comm_tcp || c->type == comm_local); 4002 if(c->ssl) 4003 return ssl_handle_it(c, 0); 4004 if(!c->tcp_is_reading && !c->tcp_write_and_read) 4005 return 0; 4006 4007 log_assert(fd != -1); 4008 if(c->pp2_enabled && c->pp2_header_state != pp2_header_done) { 4009 struct pp2_header* header = NULL; 4010 size_t want_read_size = 0; 4011 size_t current_read_size = 0; 4012 if(c->pp2_header_state == pp2_header_none) { 4013 want_read_size = PP2_HEADER_SIZE; 4014 if(sldns_buffer_remaining(c->buffer)<want_read_size) { 4015 log_err_addr("proxy_protocol: not enough " 4016 "buffer size to read PROXYv2 header", "", 4017 &c->repinfo.remote_addr, 4018 c->repinfo.remote_addrlen); 4019 return 0; 4020 } 4021 verbose(VERB_ALGO, "proxy_protocol: reading fixed " 4022 "part of PROXYv2 header (len %lu)", 4023 (unsigned long)want_read_size); 4024 current_read_size = want_read_size; 4025 if(c->tcp_byte_count < current_read_size) { 4026 r = recv(fd, (void*)sldns_buffer_at(c->buffer, 4027 c->tcp_byte_count), 4028 current_read_size-c->tcp_byte_count, MSG_DONTWAIT); 4029 if(r == 0) { 4030 if(c->tcp_req_info) 4031 return tcp_req_info_handle_read_close(c->tcp_req_info); 4032 return 0; 4033 } else if(r == -1) { 4034 goto recv_error_initial; 4035 } 4036 c->tcp_byte_count += r; 4037 sldns_buffer_skip(c->buffer, r); 4038 if(c->tcp_byte_count != current_read_size) return 1; 4039 c->pp2_header_state = pp2_header_init; 4040 } 4041 } 4042 if(c->pp2_header_state == pp2_header_init) { 4043 int err; 4044 err = pp2_read_header( 4045 sldns_buffer_begin(c->buffer), 4046 sldns_buffer_limit(c->buffer)); 4047 if(err) { 4048 log_err("proxy_protocol: could not parse " 4049 "PROXYv2 header (%s)", 4050 pp_lookup_error(err)); 4051 return 0; 4052 } 4053 header = (struct pp2_header*)sldns_buffer_begin(c->buffer); 4054 
want_read_size = ntohs(header->len); 4055 if(sldns_buffer_limit(c->buffer) < 4056 PP2_HEADER_SIZE + want_read_size) { 4057 log_err_addr("proxy_protocol: not enough " 4058 "buffer size to read PROXYv2 header", "", 4059 &c->repinfo.remote_addr, 4060 c->repinfo.remote_addrlen); 4061 return 0; 4062 } 4063 verbose(VERB_ALGO, "proxy_protocol: reading variable " 4064 "part of PROXYv2 header (len %lu)", 4065 (unsigned long)want_read_size); 4066 current_read_size = PP2_HEADER_SIZE + want_read_size; 4067 if(want_read_size == 0) { 4068 /* nothing more to read; header is complete */ 4069 c->pp2_header_state = pp2_header_done; 4070 } else if(c->tcp_byte_count < current_read_size) { 4071 r = recv(fd, (void*)sldns_buffer_at(c->buffer, 4072 c->tcp_byte_count), 4073 current_read_size-c->tcp_byte_count, MSG_DONTWAIT); 4074 if(r == 0) { 4075 if(c->tcp_req_info) 4076 return tcp_req_info_handle_read_close(c->tcp_req_info); 4077 return 0; 4078 } else if(r == -1) { 4079 goto recv_error; 4080 } 4081 c->tcp_byte_count += r; 4082 sldns_buffer_skip(c->buffer, r); 4083 if(c->tcp_byte_count != current_read_size) return 1; 4084 c->pp2_header_state = pp2_header_done; 4085 } 4086 } 4087 if(c->pp2_header_state != pp2_header_done || !header) { 4088 log_err_addr("proxy_protocol: wrong state for the " 4089 "PROXYv2 header", "", &c->repinfo.remote_addr, 4090 c->repinfo.remote_addrlen); 4091 return 0; 4092 } 4093 sldns_buffer_flip(c->buffer); 4094 if(!consume_pp2_header(c->buffer, &c->repinfo, 1)) { 4095 log_err_addr("proxy_protocol: could not consume " 4096 "PROXYv2 header", "", &c->repinfo.remote_addr, 4097 c->repinfo.remote_addrlen); 4098 return 0; 4099 } 4100 verbose(VERB_ALGO, "proxy_protocol: successful read of " 4101 "PROXYv2 header"); 4102 /* Clear and reset the buffer to read the following 4103 * DNS packet(s). 
*/ 4104 sldns_buffer_clear(c->buffer); 4105 c->tcp_byte_count = 0; 4106 return 1; 4107 } 4108 4109 if(c->tcp_byte_count < sizeof(uint16_t)) { 4110 /* read length bytes */ 4111 r = recv(fd,(void*)sldns_buffer_at(c->buffer,c->tcp_byte_count), 4112 sizeof(uint16_t)-c->tcp_byte_count, MSG_DONTWAIT); 4113 if(r == 0) { 4114 if(c->tcp_req_info) 4115 return tcp_req_info_handle_read_close(c->tcp_req_info); 4116 return 0; 4117 } else if(r == -1) { 4118 if(c->pp2_enabled) goto recv_error; 4119 goto recv_error_initial; 4120 } 4121 c->tcp_byte_count += r; 4122 if(c->tcp_byte_count != sizeof(uint16_t)) 4123 return 1; 4124 if(sldns_buffer_read_u16_at(c->buffer, 0) > 4125 sldns_buffer_capacity(c->buffer)) { 4126 verbose(VERB_QUERY, "tcp: dropped larger than buffer"); 4127 return 0; 4128 } 4129 sldns_buffer_set_limit(c->buffer, 4130 sldns_buffer_read_u16_at(c->buffer, 0)); 4131 if(!short_ok && 4132 sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) { 4133 verbose(VERB_QUERY, "tcp: dropped bogus too short."); 4134 return 0; 4135 } 4136 verbose(VERB_ALGO, "Reading tcp query of length %d", 4137 (int)sldns_buffer_limit(c->buffer)); 4138 } 4139 4140 if(sldns_buffer_remaining(c->buffer) == 0) 4141 log_err("in comm_point_tcp_handle_read buffer_remaining is " 4142 "not > 0 as expected, continuing with (harmless) 0 " 4143 "length recv"); 4144 r = recv(fd, (void*)sldns_buffer_current(c->buffer), 4145 sldns_buffer_remaining(c->buffer), MSG_DONTWAIT); 4146 if(r == 0) { 4147 if(c->tcp_req_info) 4148 return tcp_req_info_handle_read_close(c->tcp_req_info); 4149 return 0; 4150 } else if(r == -1) { 4151 goto recv_error; 4152 } 4153 sldns_buffer_skip(c->buffer, r); 4154 if(sldns_buffer_remaining(c->buffer) <= 0) { 4155 tcp_callback_reader(c); 4156 } 4157 return 1; 4158 4159 recv_error_initial: 4160 recv_initial = 1; 4161 recv_error: 4162 #ifndef USE_WINSOCK 4163 if(errno == EINTR || errno == EAGAIN) 4164 return 1; 4165 #ifdef ECONNRESET 4166 if(errno == ECONNRESET && verbosity < 2) 4167 return 0; /* 
silence reset by peer */ 4168 #endif 4169 if(recv_initial) { 4170 #ifdef ECONNREFUSED 4171 if(errno == ECONNREFUSED && verbosity < 2) 4172 return 0; /* silence reset by peer */ 4173 #endif 4174 #ifdef ENETUNREACH 4175 if(errno == ENETUNREACH && verbosity < 2) 4176 return 0; /* silence it */ 4177 #endif 4178 #ifdef EHOSTDOWN 4179 if(errno == EHOSTDOWN && verbosity < 2) 4180 return 0; /* silence it */ 4181 #endif 4182 #ifdef EHOSTUNREACH 4183 if(errno == EHOSTUNREACH && verbosity < 2) 4184 return 0; /* silence it */ 4185 #endif 4186 #ifdef ENETDOWN 4187 if(errno == ENETDOWN && verbosity < 2) 4188 return 0; /* silence it */ 4189 #endif 4190 #ifdef EACCES 4191 if(errno == EACCES && verbosity < 2) 4192 return 0; /* silence it */ 4193 #endif 4194 #ifdef ENOTCONN 4195 if(errno == ENOTCONN) { 4196 log_err_addr("read (in tcp initial) failed and this " 4197 "could be because TCP Fast Open is " 4198 "enabled [--disable-tfo-client " 4199 "--disable-tfo-server] but does not " 4200 "work", sock_strerror(errno), 4201 &c->repinfo.remote_addr, 4202 c->repinfo.remote_addrlen); 4203 return 0; 4204 } 4205 #endif 4206 } 4207 #else /* USE_WINSOCK */ 4208 if(recv_initial) { 4209 if(WSAGetLastError() == WSAECONNREFUSED && verbosity < 2) 4210 return 0; 4211 if(WSAGetLastError() == WSAEHOSTDOWN && verbosity < 2) 4212 return 0; 4213 if(WSAGetLastError() == WSAEHOSTUNREACH && verbosity < 2) 4214 return 0; 4215 if(WSAGetLastError() == WSAENETDOWN && verbosity < 2) 4216 return 0; 4217 if(WSAGetLastError() == WSAENETUNREACH && verbosity < 2) 4218 return 0; 4219 } 4220 if(WSAGetLastError() == WSAECONNRESET) 4221 return 0; 4222 if(WSAGetLastError() == WSAEINPROGRESS) 4223 return 1; 4224 if(WSAGetLastError() == WSAEWOULDBLOCK) { 4225 ub_winsock_tcp_wouldblock(c->ev->ev, 4226 UB_EV_READ); 4227 return 1; 4228 } 4229 #endif 4230 log_err_addr((recv_initial?"read (in tcp initial)":"read (in tcp)"), 4231 sock_strerror(errno), &c->repinfo.remote_addr, 4232 c->repinfo.remote_addrlen); 4233 return 0; 4234 } 
4235 4236 /** 4237 * Handle tcp writing callback. 4238 * @param fd: file descriptor of socket. 4239 * @param c: comm point to write buffer out of. 4240 * @return: 0 on error 4241 */ 4242 static int 4243 comm_point_tcp_handle_write(int fd, struct comm_point* c) 4244 { 4245 ssize_t r; 4246 struct sldns_buffer *buffer; 4247 log_assert(c->type == comm_tcp); 4248 #ifdef USE_DNSCRYPT 4249 buffer = c->dnscrypt_buffer; 4250 #else 4251 buffer = c->buffer; 4252 #endif 4253 if(c->tcp_is_reading && !c->ssl && !c->tcp_write_and_read) 4254 return 0; 4255 log_assert(fd != -1); 4256 if(((!c->tcp_write_and_read && c->tcp_byte_count == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == 0)) && c->tcp_check_nb_connect) { 4257 /* check for pending error from nonblocking connect */ 4258 /* from Stevens, unix network programming, vol1, 3rd ed, p450*/ 4259 int error = 0; 4260 socklen_t len = (socklen_t)sizeof(error); 4261 if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error, 4262 &len) < 0){ 4263 #ifndef USE_WINSOCK 4264 error = errno; /* on solaris errno is error */ 4265 #else /* USE_WINSOCK */ 4266 error = WSAGetLastError(); 4267 #endif 4268 } 4269 #ifndef USE_WINSOCK 4270 #if defined(EINPROGRESS) && defined(EWOULDBLOCK) 4271 if(error == EINPROGRESS || error == EWOULDBLOCK) 4272 return 1; /* try again later */ 4273 else 4274 #endif 4275 if(error != 0 && verbosity < 2) 4276 return 0; /* silence lots of chatter in the logs */ 4277 else if(error != 0) { 4278 log_err_addr("tcp connect", strerror(error), 4279 &c->repinfo.remote_addr, 4280 c->repinfo.remote_addrlen); 4281 #else /* USE_WINSOCK */ 4282 /* examine error */ 4283 if(error == WSAEINPROGRESS) 4284 return 1; 4285 else if(error == WSAEWOULDBLOCK) { 4286 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); 4287 return 1; 4288 } else if(error != 0 && verbosity < 2) 4289 return 0; 4290 else if(error != 0) { 4291 log_err_addr("tcp connect", wsa_strerror(error), 4292 &c->repinfo.remote_addr, 4293 c->repinfo.remote_addrlen); 4294 
#endif /* USE_WINSOCK */ 4295 return 0; 4296 } 4297 } 4298 if(c->ssl) 4299 return ssl_handle_it(c, 1); 4300 4301 #ifdef USE_MSG_FASTOPEN 4302 /* Only try this on first use of a connection that uses tfo, 4303 otherwise fall through to normal write */ 4304 /* Also, TFO support on WINDOWS not implemented at the moment */ 4305 if(c->tcp_do_fastopen == 1) { 4306 /* this form of sendmsg() does both a connect() and send() so need to 4307 look for various flavours of error*/ 4308 uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer)); 4309 struct msghdr msg; 4310 struct iovec iov[2]; 4311 c->tcp_do_fastopen = 0; 4312 memset(&msg, 0, sizeof(msg)); 4313 if(c->tcp_write_and_read) { 4314 iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count; 4315 iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count; 4316 iov[1].iov_base = c->tcp_write_pkt; 4317 iov[1].iov_len = c->tcp_write_pkt_len; 4318 } else { 4319 iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count; 4320 iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count; 4321 iov[1].iov_base = sldns_buffer_begin(buffer); 4322 iov[1].iov_len = sldns_buffer_limit(buffer); 4323 } 4324 log_assert(iov[0].iov_len > 0); 4325 msg.msg_name = &c->repinfo.remote_addr; 4326 msg.msg_namelen = c->repinfo.remote_addrlen; 4327 msg.msg_iov = iov; 4328 msg.msg_iovlen = 2; 4329 r = sendmsg(fd, &msg, MSG_FASTOPEN); 4330 if (r == -1) { 4331 #if defined(EINPROGRESS) && defined(EWOULDBLOCK) 4332 /* Handshake is underway, maybe because no TFO cookie available. 
4333 Come back to write the message*/ 4334 if(errno == EINPROGRESS || errno == EWOULDBLOCK) 4335 return 1; 4336 #endif 4337 if(errno == EINTR || errno == EAGAIN) 4338 return 1; 4339 /* Not handling EISCONN here as shouldn't ever hit that case.*/ 4340 if(errno != EPIPE 4341 #ifdef EOPNOTSUPP 4342 /* if /proc/sys/net/ipv4/tcp_fastopen is 4343 * disabled on Linux, sendmsg may return 4344 * 'Operation not supported', if so 4345 * fallthrough to ordinary connect. */ 4346 && errno != EOPNOTSUPP 4347 #endif 4348 && errno != 0) { 4349 if(verbosity < 2) 4350 return 0; /* silence lots of chatter in the logs */ 4351 log_err_addr("tcp sendmsg", strerror(errno), 4352 &c->repinfo.remote_addr, 4353 c->repinfo.remote_addrlen); 4354 return 0; 4355 } 4356 verbose(VERB_ALGO, "tcp sendmsg for fastopen failed (with %s), try normal connect", strerror(errno)); 4357 /* fallthrough to nonFASTOPEN 4358 * (MSG_FASTOPEN on Linux 3 produces EPIPE) 4359 * we need to perform connect() */ 4360 if(connect(fd, (struct sockaddr *)&c->repinfo.remote_addr, 4361 c->repinfo.remote_addrlen) == -1) { 4362 #ifdef EINPROGRESS 4363 if(errno == EINPROGRESS) 4364 return 1; /* wait until connect done*/ 4365 #endif 4366 #ifdef USE_WINSOCK 4367 if(WSAGetLastError() == WSAEINPROGRESS || 4368 WSAGetLastError() == WSAEWOULDBLOCK) 4369 return 1; /* wait until connect done*/ 4370 #endif 4371 if(tcp_connect_errno_needs_log( 4372 (struct sockaddr *)&c->repinfo.remote_addr, 4373 c->repinfo.remote_addrlen)) { 4374 log_err_addr("outgoing tcp: connect after EPIPE for fastopen", 4375 strerror(errno), 4376 &c->repinfo.remote_addr, 4377 c->repinfo.remote_addrlen); 4378 } 4379 return 0; 4380 } 4381 4382 } else { 4383 if(c->tcp_write_and_read) { 4384 c->tcp_write_byte_count += r; 4385 if(c->tcp_write_byte_count < sizeof(uint16_t)) 4386 return 1; 4387 } else { 4388 c->tcp_byte_count += r; 4389 if(c->tcp_byte_count < sizeof(uint16_t)) 4390 return 1; 4391 sldns_buffer_set_position(buffer, c->tcp_byte_count - 4392 sizeof(uint16_t)); 
4393 } 4394 if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { 4395 tcp_callback_writer(c); 4396 return 1; 4397 } 4398 } 4399 } 4400 #endif /* USE_MSG_FASTOPEN */ 4401 4402 if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) { 4403 uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer)); 4404 #ifdef HAVE_WRITEV 4405 struct iovec iov[2]; 4406 if(c->tcp_write_and_read) { 4407 iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count; 4408 iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count; 4409 iov[1].iov_base = c->tcp_write_pkt; 4410 iov[1].iov_len = c->tcp_write_pkt_len; 4411 } else { 4412 iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count; 4413 iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count; 4414 iov[1].iov_base = sldns_buffer_begin(buffer); 4415 iov[1].iov_len = sldns_buffer_limit(buffer); 4416 } 4417 log_assert(iov[0].iov_len > 0); 4418 r = writev(fd, iov, 2); 4419 #else /* HAVE_WRITEV */ 4420 if(c->tcp_write_and_read) { 4421 r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_write_byte_count), 4422 sizeof(uint16_t)-c->tcp_write_byte_count, 0); 4423 } else { 4424 r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_byte_count), 4425 sizeof(uint16_t)-c->tcp_byte_count, 0); 4426 } 4427 #endif /* HAVE_WRITEV */ 4428 if(r == -1) { 4429 #ifndef USE_WINSOCK 4430 # ifdef EPIPE 4431 if(errno == EPIPE && verbosity < 2) 4432 return 0; /* silence 'broken pipe' */ 4433 #endif 4434 if(errno == EINTR || errno == EAGAIN) 4435 return 1; 4436 #ifdef ECONNRESET 4437 if(errno == ECONNRESET && verbosity < 2) 4438 return 0; /* silence reset by peer */ 4439 #endif 4440 # ifdef HAVE_WRITEV 4441 log_err_addr("tcp writev", strerror(errno), 4442 &c->repinfo.remote_addr, 4443 c->repinfo.remote_addrlen); 4444 # else /* HAVE_WRITEV */ 4445 log_err_addr("tcp send s", strerror(errno), 4446 &c->repinfo.remote_addr, 
4447 c->repinfo.remote_addrlen); 4448 # endif /* HAVE_WRITEV */ 4449 #else 4450 if(WSAGetLastError() == WSAENOTCONN) 4451 return 1; 4452 if(WSAGetLastError() == WSAEINPROGRESS) 4453 return 1; 4454 if(WSAGetLastError() == WSAEWOULDBLOCK) { 4455 ub_winsock_tcp_wouldblock(c->ev->ev, 4456 UB_EV_WRITE); 4457 return 1; 4458 } 4459 if(WSAGetLastError() == WSAECONNRESET && verbosity < 2) 4460 return 0; /* silence reset by peer */ 4461 log_err_addr("tcp send s", 4462 wsa_strerror(WSAGetLastError()), 4463 &c->repinfo.remote_addr, 4464 c->repinfo.remote_addrlen); 4465 #endif 4466 return 0; 4467 } 4468 if(c->tcp_write_and_read) { 4469 c->tcp_write_byte_count += r; 4470 if(c->tcp_write_byte_count < sizeof(uint16_t)) 4471 return 1; 4472 } else { 4473 c->tcp_byte_count += r; 4474 if(c->tcp_byte_count < sizeof(uint16_t)) 4475 return 1; 4476 sldns_buffer_set_position(buffer, c->tcp_byte_count - 4477 sizeof(uint16_t)); 4478 } 4479 if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { 4480 tcp_callback_writer(c); 4481 return 1; 4482 } 4483 } 4484 log_assert(c->tcp_write_and_read || sldns_buffer_remaining(buffer) > 0); 4485 log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2); 4486 if(c->tcp_write_and_read) { 4487 r = send(fd, (void*)(c->tcp_write_pkt + c->tcp_write_byte_count - 2), 4488 c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count, 0); 4489 } else { 4490 r = send(fd, (void*)sldns_buffer_current(buffer), 4491 sldns_buffer_remaining(buffer), 0); 4492 } 4493 if(r == -1) { 4494 #ifndef USE_WINSOCK 4495 if(errno == EINTR || errno == EAGAIN) 4496 return 1; 4497 #ifdef ECONNRESET 4498 if(errno == ECONNRESET && verbosity < 2) 4499 return 0; /* silence reset by peer */ 4500 #endif 4501 #else 4502 if(WSAGetLastError() == WSAEINPROGRESS) 4503 return 1; 4504 if(WSAGetLastError() == WSAEWOULDBLOCK) { 4505 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); 
4506 return 1; 4507 } 4508 if(WSAGetLastError() == WSAECONNRESET && verbosity < 2) 4509 return 0; /* silence reset by peer */ 4510 #endif 4511 log_err_addr("tcp send r", sock_strerror(errno), 4512 &c->repinfo.remote_addr, 4513 c->repinfo.remote_addrlen); 4514 return 0; 4515 } 4516 if(c->tcp_write_and_read) { 4517 c->tcp_write_byte_count += r; 4518 } else { 4519 sldns_buffer_skip(buffer, r); 4520 } 4521 4522 if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { 4523 tcp_callback_writer(c); 4524 } 4525 4526 return 1; 4527 } 4528 4529 /** read again to drain buffers when there could be more to read, returns 0 4530 * on failure which means the comm point is closed. */ 4531 static int 4532 tcp_req_info_read_again(int fd, struct comm_point* c) 4533 { 4534 while(c->tcp_req_info->read_again) { 4535 int r; 4536 c->tcp_req_info->read_again = 0; 4537 if(c->tcp_is_reading) 4538 r = comm_point_tcp_handle_read(fd, c, 0); 4539 else r = comm_point_tcp_handle_write(fd, c); 4540 if(!r) { 4541 reclaim_tcp_handler(c); 4542 if(!c->tcp_do_close) { 4543 fptr_ok(fptr_whitelist_comm_point( 4544 c->callback)); 4545 (void)(*c->callback)(c, c->cb_arg, 4546 NETEVENT_CLOSED, NULL); 4547 } 4548 return 0; 4549 } 4550 } 4551 return 1; 4552 } 4553 4554 /** read again to drain buffers when there could be more to read */ 4555 static void 4556 tcp_more_read_again(int fd, struct comm_point* c) 4557 { 4558 /* if the packet is done, but another one could be waiting on 4559 * the connection, the callback signals this, and we try again */ 4560 /* this continues until the read routines get EAGAIN or so, 4561 * and thus does not call the callback, and the bool is 0 */ 4562 int* moreread = c->tcp_more_read_again; 4563 while(moreread && *moreread) { 4564 *moreread = 0; 4565 if(!comm_point_tcp_handle_read(fd, c, 0)) { 4566 reclaim_tcp_handler(c); 4567 if(!c->tcp_do_close) { 4568 fptr_ok(fptr_whitelist_comm_point( 
4569 c->callback)); 4570 (void)(*c->callback)(c, c->cb_arg, 4571 NETEVENT_CLOSED, NULL); 4572 } 4573 return; 4574 } 4575 } 4576 } 4577 4578 /** write again to fill up when there could be more to write */ 4579 static void 4580 tcp_more_write_again(int fd, struct comm_point* c) 4581 { 4582 /* if the packet is done, but another is waiting to be written, 4583 * the callback signals it and we try again. */ 4584 /* this continues until the write routines get EAGAIN or so, 4585 * and thus does not call the callback, and the bool is 0 */ 4586 int* morewrite = c->tcp_more_write_again; 4587 while(morewrite && *morewrite) { 4588 *morewrite = 0; 4589 if(!comm_point_tcp_handle_write(fd, c)) { 4590 reclaim_tcp_handler(c); 4591 if(!c->tcp_do_close) { 4592 fptr_ok(fptr_whitelist_comm_point( 4593 c->callback)); 4594 (void)(*c->callback)(c, c->cb_arg, 4595 NETEVENT_CLOSED, NULL); 4596 } 4597 return; 4598 } 4599 } 4600 } 4601 4602 void 4603 comm_point_tcp_handle_callback(int fd, short event, void* arg) 4604 { 4605 struct comm_point* c = (struct comm_point*)arg; 4606 log_assert(c->type == comm_tcp); 4607 ub_comm_base_now(c->ev->base); 4608 4609 if(c->fd == -1 || c->fd != fd) 4610 return; /* duplicate event, but commpoint closed. 
*/ 4611 4612 #ifdef USE_DNSCRYPT 4613 /* Initialize if this is a dnscrypt socket */ 4614 if(c->tcp_parent) { 4615 c->dnscrypt = c->tcp_parent->dnscrypt; 4616 } 4617 if(c->dnscrypt && c->dnscrypt_buffer == c->buffer) { 4618 c->dnscrypt_buffer = sldns_buffer_new(sldns_buffer_capacity(c->buffer)); 4619 if(!c->dnscrypt_buffer) { 4620 log_err("Could not allocate dnscrypt buffer"); 4621 reclaim_tcp_handler(c); 4622 if(!c->tcp_do_close) { 4623 fptr_ok(fptr_whitelist_comm_point( 4624 c->callback)); 4625 (void)(*c->callback)(c, c->cb_arg, 4626 NETEVENT_CLOSED, NULL); 4627 } 4628 return; 4629 } 4630 } 4631 #endif 4632 4633 if(event&UB_EV_TIMEOUT) { 4634 verbose(VERB_QUERY, "tcp took too long, dropped"); 4635 reclaim_tcp_handler(c); 4636 if(!c->tcp_do_close) { 4637 fptr_ok(fptr_whitelist_comm_point(c->callback)); 4638 (void)(*c->callback)(c, c->cb_arg, 4639 NETEVENT_TIMEOUT, NULL); 4640 } 4641 return; 4642 } 4643 if(event&UB_EV_READ 4644 #ifdef USE_MSG_FASTOPEN 4645 && !(c->tcp_do_fastopen && (event&UB_EV_WRITE)) 4646 #endif 4647 ) { 4648 int has_tcpq = (c->tcp_req_info != NULL); 4649 int* moreread = c->tcp_more_read_again; 4650 if(!comm_point_tcp_handle_read(fd, c, 0)) { 4651 reclaim_tcp_handler(c); 4652 if(!c->tcp_do_close) { 4653 fptr_ok(fptr_whitelist_comm_point( 4654 c->callback)); 4655 (void)(*c->callback)(c, c->cb_arg, 4656 NETEVENT_CLOSED, NULL); 4657 } 4658 return; 4659 } 4660 if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) { 4661 if(!tcp_req_info_read_again(fd, c)) 4662 return; 4663 } 4664 if(moreread && *moreread) 4665 tcp_more_read_again(fd, c); 4666 return; 4667 } 4668 if(event&UB_EV_WRITE) { 4669 int has_tcpq = (c->tcp_req_info != NULL); 4670 int* morewrite = c->tcp_more_write_again; 4671 if(!comm_point_tcp_handle_write(fd, c)) { 4672 reclaim_tcp_handler(c); 4673 if(!c->tcp_do_close) { 4674 fptr_ok(fptr_whitelist_comm_point( 4675 c->callback)); 4676 (void)(*c->callback)(c, c->cb_arg, 4677 NETEVENT_CLOSED, NULL); 4678 } 4679 return; 4680 } 4681 
if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) { 4682 if(!tcp_req_info_read_again(fd, c)) 4683 return; 4684 } 4685 if(morewrite && *morewrite) 4686 tcp_more_write_again(fd, c); 4687 return; 4688 } 4689 log_err("Ignored event %d for tcphdl.", event); 4690 } 4691 4692 /** Make http handler free for next assignment */ 4693 static void 4694 reclaim_http_handler(struct comm_point* c) 4695 { 4696 log_assert(c->type == comm_http); 4697 if(c->ssl) { 4698 #ifdef HAVE_SSL 4699 SSL_shutdown(c->ssl); 4700 SSL_free(c->ssl); 4701 c->ssl = NULL; 4702 #endif 4703 } 4704 comm_point_close(c); 4705 if(c->tcp_parent) { 4706 if(c != c->tcp_parent->tcp_free) { 4707 c->tcp_parent->cur_tcp_count--; 4708 c->tcp_free = c->tcp_parent->tcp_free; 4709 c->tcp_parent->tcp_free = c; 4710 } 4711 if(!c->tcp_free) { 4712 /* re-enable listening on accept socket */ 4713 comm_point_start_listening(c->tcp_parent, -1, -1); 4714 } 4715 } 4716 } 4717 4718 /** read more data for http (with ssl) */ 4719 static int 4720 ssl_http_read_more(struct comm_point* c) 4721 { 4722 #ifdef HAVE_SSL 4723 int r; 4724 log_assert(sldns_buffer_remaining(c->buffer) > 0); 4725 ERR_clear_error(); 4726 r = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer), 4727 (int)sldns_buffer_remaining(c->buffer)); 4728 if(r <= 0) { 4729 int want = SSL_get_error(c->ssl, r); 4730 if(want == SSL_ERROR_ZERO_RETURN) { 4731 return 0; /* shutdown, closed */ 4732 } else if(want == SSL_ERROR_WANT_READ) { 4733 return 1; /* read more later */ 4734 } else if(want == SSL_ERROR_WANT_WRITE) { 4735 c->ssl_shake_state = comm_ssl_shake_hs_write; 4736 comm_point_listen_for_rw(c, 0, 1); 4737 return 1; 4738 } else if(want == SSL_ERROR_SYSCALL) { 4739 #ifdef ECONNRESET 4740 if(errno == ECONNRESET && verbosity < 2) 4741 return 0; /* silence reset by peer */ 4742 #endif 4743 if(errno != 0) 4744 log_err("SSL_read syscall: %s", 4745 strerror(errno)); 4746 return 0; 4747 } 4748 log_crypto_err_io("could not SSL_read", want); 4749 return 0; 4750 } 
4751 verbose(VERB_ALGO, "ssl http read more skip to %d + %d", 4752 (int)sldns_buffer_position(c->buffer), (int)r); 4753 sldns_buffer_skip(c->buffer, (ssize_t)r); 4754 return 1; 4755 #else 4756 (void)c; 4757 return 0; 4758 #endif /* HAVE_SSL */ 4759 } 4760 4761 /** read more data for http */ 4762 static int 4763 http_read_more(int fd, struct comm_point* c) 4764 { 4765 ssize_t r; 4766 log_assert(sldns_buffer_remaining(c->buffer) > 0); 4767 r = recv(fd, (void*)sldns_buffer_current(c->buffer), 4768 sldns_buffer_remaining(c->buffer), MSG_DONTWAIT); 4769 if(r == 0) { 4770 return 0; 4771 } else if(r == -1) { 4772 #ifndef USE_WINSOCK 4773 if(errno == EINTR || errno == EAGAIN) 4774 return 1; 4775 #else /* USE_WINSOCK */ 4776 if(WSAGetLastError() == WSAECONNRESET) 4777 return 0; 4778 if(WSAGetLastError() == WSAEINPROGRESS) 4779 return 1; 4780 if(WSAGetLastError() == WSAEWOULDBLOCK) { 4781 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ); 4782 return 1; 4783 } 4784 #endif 4785 log_err_addr("read (in http r)", sock_strerror(errno), 4786 &c->repinfo.remote_addr, c->repinfo.remote_addrlen); 4787 return 0; 4788 } 4789 verbose(VERB_ALGO, "http read more skip to %d + %d", 4790 (int)sldns_buffer_position(c->buffer), (int)r); 4791 sldns_buffer_skip(c->buffer, r); 4792 return 1; 4793 } 4794 4795 /** return true if http header has been read (one line complete) */ 4796 static int 4797 http_header_done(sldns_buffer* buf) 4798 { 4799 size_t i; 4800 for(i=sldns_buffer_position(buf); i<sldns_buffer_limit(buf); i++) { 4801 /* there was a \r before the \n, but we ignore that */ 4802 if((char)sldns_buffer_read_u8_at(buf, i) == '\n') 4803 return 1; 4804 } 4805 return 0; 4806 } 4807 4808 /** return character string into buffer for header line, moves buffer 4809 * past that line and puts zero terminator into linefeed-newline */ 4810 static char* 4811 http_header_line(sldns_buffer* buf) 4812 { 4813 char* result = (char*)sldns_buffer_current(buf); 4814 size_t i; 4815 
for(i=sldns_buffer_position(buf); i<sldns_buffer_limit(buf); i++) { 4816 /* terminate the string on the \r */ 4817 if((char)sldns_buffer_read_u8_at(buf, i) == '\r') 4818 sldns_buffer_write_u8_at(buf, i, 0); 4819 /* terminate on the \n and skip past the it and done */ 4820 if((char)sldns_buffer_read_u8_at(buf, i) == '\n') { 4821 sldns_buffer_write_u8_at(buf, i, 0); 4822 sldns_buffer_set_position(buf, i+1); 4823 return result; 4824 } 4825 } 4826 return NULL; 4827 } 4828 4829 /** move unread buffer to start and clear rest for putting the rest into it */ 4830 static void 4831 http_moveover_buffer(sldns_buffer* buf) 4832 { 4833 size_t pos = sldns_buffer_position(buf); 4834 size_t len = sldns_buffer_remaining(buf); 4835 sldns_buffer_clear(buf); 4836 memmove(sldns_buffer_begin(buf), sldns_buffer_at(buf, pos), len); 4837 sldns_buffer_set_position(buf, len); 4838 } 4839 4840 /** a http header is complete, process it */ 4841 static int 4842 http_process_initial_header(struct comm_point* c) 4843 { 4844 char* line = http_header_line(c->buffer); 4845 if(!line) return 1; 4846 verbose(VERB_ALGO, "http header: %s", line); 4847 if(strncasecmp(line, "HTTP/1.1 ", 9) == 0) { 4848 /* check returncode */ 4849 if(line[9] != '2') { 4850 verbose(VERB_ALGO, "http bad status %s", line+9); 4851 return 0; 4852 } 4853 } else if(strncasecmp(line, "Content-Length: ", 16) == 0) { 4854 if(!c->http_is_chunked) 4855 c->tcp_byte_count = (size_t)atoi(line+16); 4856 } else if(strncasecmp(line, "Transfer-Encoding: chunked", 19+7) == 0) { 4857 c->tcp_byte_count = 0; 4858 c->http_is_chunked = 1; 4859 } else if(line[0] == 0) { 4860 /* end of initial headers */ 4861 c->http_in_headers = 0; 4862 if(c->http_is_chunked) 4863 c->http_in_chunk_headers = 1; 4864 /* remove header text from front of buffer 4865 * the buffer is going to be used to return the data segment 4866 * itself and we don't want the header to get returned 4867 * prepended with it */ 4868 http_moveover_buffer(c->buffer); 4869 
sldns_buffer_flip(c->buffer); 4870 return 1; 4871 } 4872 /* ignore other headers */ 4873 return 1; 4874 } 4875 4876 /** a chunk header is complete, process it, return 0=fail, 1=continue next 4877 * header line, 2=done with chunked transfer*/ 4878 static int 4879 http_process_chunk_header(struct comm_point* c) 4880 { 4881 char* line = http_header_line(c->buffer); 4882 if(!line) return 1; 4883 if(c->http_in_chunk_headers == 3) { 4884 verbose(VERB_ALGO, "http chunk trailer: %s", line); 4885 /* are we done ? */ 4886 if(line[0] == 0 && c->tcp_byte_count == 0) { 4887 /* callback of http reader when NETEVENT_DONE, 4888 * end of data, with no data in buffer */ 4889 sldns_buffer_set_position(c->buffer, 0); 4890 sldns_buffer_set_limit(c->buffer, 0); 4891 fptr_ok(fptr_whitelist_comm_point(c->callback)); 4892 (void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL); 4893 /* return that we are done */ 4894 return 2; 4895 } 4896 if(line[0] == 0) { 4897 /* continue with header of the next chunk */ 4898 c->http_in_chunk_headers = 1; 4899 /* remove header text from front of buffer */ 4900 http_moveover_buffer(c->buffer); 4901 sldns_buffer_flip(c->buffer); 4902 return 1; 4903 } 4904 /* ignore further trail headers */ 4905 return 1; 4906 } 4907 verbose(VERB_ALGO, "http chunk header: %s", line); 4908 if(c->http_in_chunk_headers == 1) { 4909 /* read chunked start line */ 4910 char* end = NULL; 4911 c->tcp_byte_count = (size_t)strtol(line, &end, 16); 4912 if(end == line) 4913 return 0; 4914 c->http_in_chunk_headers = 0; 4915 /* remove header text from front of buffer */ 4916 http_moveover_buffer(c->buffer); 4917 sldns_buffer_flip(c->buffer); 4918 if(c->tcp_byte_count == 0) { 4919 /* done with chunks, process chunk_trailer lines */ 4920 c->http_in_chunk_headers = 3; 4921 } 4922 return 1; 4923 } 4924 /* ignore other headers */ 4925 return 1; 4926 } 4927 4928 /** handle nonchunked data segment, 0=fail, 1=wait */ 4929 static int 4930 http_nonchunk_segment(struct comm_point* c) 4931 { 4932 /* 
c->buffer at position..limit has new data we read in. 4933 * the buffer itself is full of nonchunked data. 4934 * we are looking to read tcp_byte_count more data 4935 * and then the transfer is done. */ 4936 size_t remainbufferlen; 4937 size_t got_now = sldns_buffer_limit(c->buffer); 4938 if(c->tcp_byte_count <= got_now) { 4939 /* done, this is the last data fragment */ 4940 c->http_stored = 0; 4941 sldns_buffer_set_position(c->buffer, 0); 4942 fptr_ok(fptr_whitelist_comm_point(c->callback)); 4943 (void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL); 4944 return 1; 4945 } 4946 /* if we have the buffer space, 4947 * read more data collected into the buffer */ 4948 remainbufferlen = sldns_buffer_capacity(c->buffer) - 4949 sldns_buffer_limit(c->buffer); 4950 if(remainbufferlen+got_now >= c->tcp_byte_count || 4951 remainbufferlen >= (size_t)(c->ssl?16384:2048)) { 4952 size_t total = sldns_buffer_limit(c->buffer); 4953 sldns_buffer_clear(c->buffer); 4954 sldns_buffer_set_position(c->buffer, total); 4955 c->http_stored = total; 4956 /* return and wait to read more */ 4957 return 1; 4958 } 4959 /* call callback with this data amount, then 4960 * wait for more */ 4961 c->tcp_byte_count -= got_now; 4962 c->http_stored = 0; 4963 sldns_buffer_set_position(c->buffer, 0); 4964 fptr_ok(fptr_whitelist_comm_point(c->callback)); 4965 (void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL); 4966 /* c->callback has to buffer_clear(c->buffer). */ 4967 /* return and wait to read more */ 4968 return 1; 4969 } 4970 4971 /** handle chunked data segment, return 0=fail, 1=wait, 2=process more */ 4972 static int 4973 http_chunked_segment(struct comm_point* c) 4974 { 4975 /* the c->buffer has from position..limit new data we read. */ 4976 /* the current chunk has length tcp_byte_count. 4977 * once we read that read more chunk headers. 
4978 */ 4979 size_t remainbufferlen; 4980 size_t got_now = sldns_buffer_limit(c->buffer) - c->http_stored; 4981 verbose(VERB_ALGO, "http_chunked_segment: got now %d, tcpbytcount %d, http_stored %d, buffer pos %d, buffer limit %d", (int)got_now, (int)c->tcp_byte_count, (int)c->http_stored, (int)sldns_buffer_position(c->buffer), (int)sldns_buffer_limit(c->buffer)); 4982 if(c->tcp_byte_count <= got_now) { 4983 /* the chunk has completed (with perhaps some extra data 4984 * from next chunk header and next chunk) */ 4985 /* save too much info into temp buffer */ 4986 size_t fraglen; 4987 struct comm_reply repinfo; 4988 c->http_stored = 0; 4989 sldns_buffer_skip(c->buffer, (ssize_t)c->tcp_byte_count); 4990 sldns_buffer_clear(c->http_temp); 4991 sldns_buffer_write(c->http_temp, 4992 sldns_buffer_current(c->buffer), 4993 sldns_buffer_remaining(c->buffer)); 4994 sldns_buffer_flip(c->http_temp); 4995 4996 /* callback with this fragment */ 4997 fraglen = sldns_buffer_position(c->buffer); 4998 sldns_buffer_set_position(c->buffer, 0); 4999 sldns_buffer_set_limit(c->buffer, fraglen); 5000 repinfo = c->repinfo; 5001 fptr_ok(fptr_whitelist_comm_point(c->callback)); 5002 (void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &repinfo); 5003 /* c->callback has to buffer_clear(). */ 5004 5005 /* is commpoint deleted? 
*/ 5006 if(!repinfo.c) { 5007 return 1; 5008 } 5009 /* copy waiting info */ 5010 sldns_buffer_clear(c->buffer); 5011 sldns_buffer_write(c->buffer, 5012 sldns_buffer_begin(c->http_temp), 5013 sldns_buffer_remaining(c->http_temp)); 5014 sldns_buffer_flip(c->buffer); 5015 /* process end of chunk trailer header lines, until 5016 * an empty line */ 5017 c->http_in_chunk_headers = 3; 5018 /* process more data in buffer (if any) */ 5019 return 2; 5020 } 5021 c->tcp_byte_count -= got_now; 5022 5023 /* if we have the buffer space, 5024 * read more data collected into the buffer */ 5025 remainbufferlen = sldns_buffer_capacity(c->buffer) - 5026 sldns_buffer_limit(c->buffer); 5027 if(remainbufferlen >= c->tcp_byte_count || 5028 remainbufferlen >= 2048) { 5029 size_t total = sldns_buffer_limit(c->buffer); 5030 sldns_buffer_clear(c->buffer); 5031 sldns_buffer_set_position(c->buffer, total); 5032 c->http_stored = total; 5033 /* return and wait to read more */ 5034 return 1; 5035 } 5036 5037 /* callback of http reader for a new part of the data */ 5038 c->http_stored = 0; 5039 sldns_buffer_set_position(c->buffer, 0); 5040 fptr_ok(fptr_whitelist_comm_point(c->callback)); 5041 (void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL); 5042 /* c->callback has to buffer_clear(c->buffer). */ 5043 /* return and wait to read more */ 5044 return 1; 5045 } 5046 5047 #ifdef HAVE_NGHTTP2 5048 /** Create new http2 session. Called when creating handling comm point. */ 5049 static struct http2_session* http2_session_create(struct comm_point* c) 5050 { 5051 struct http2_session* session = calloc(1, sizeof(*session)); 5052 if(!session) { 5053 log_err("malloc failure while creating http2 session"); 5054 return NULL; 5055 } 5056 session->c = c; 5057 5058 return session; 5059 } 5060 #endif 5061 5062 /** Delete http2 session. 
After closing connection or on error */ 5063 static void http2_session_delete(struct http2_session* h2_session) 5064 { 5065 #ifdef HAVE_NGHTTP2 5066 if(h2_session->callbacks) 5067 nghttp2_session_callbacks_del(h2_session->callbacks); 5068 free(h2_session); 5069 #else 5070 (void)h2_session; 5071 #endif 5072 } 5073 5074 #ifdef HAVE_NGHTTP2 5075 struct http2_stream* http2_stream_create(int32_t stream_id) 5076 { 5077 struct http2_stream* h2_stream = calloc(1, sizeof(*h2_stream)); 5078 if(!h2_stream) { 5079 log_err("malloc failure while creating http2 stream"); 5080 return NULL; 5081 } 5082 h2_stream->stream_id = stream_id; 5083 return h2_stream; 5084 } 5085 #endif 5086 5087 void http2_stream_add_meshstate(struct http2_stream* h2_stream, 5088 struct mesh_area* mesh, struct mesh_state* m) 5089 { 5090 h2_stream->mesh = mesh; 5091 h2_stream->mesh_state = m; 5092 } 5093 5094 void http2_stream_remove_mesh_state(struct http2_stream* h2_stream) 5095 { 5096 if(!h2_stream) 5097 return; 5098 h2_stream->mesh_state = NULL; 5099 } 5100 5101 #ifdef HAVE_NGHTTP2 5102 void http2_session_add_stream(struct http2_session* h2_session, 5103 struct http2_stream* h2_stream) 5104 { 5105 if(h2_session->first_stream) 5106 h2_session->first_stream->prev = h2_stream; 5107 h2_stream->next = h2_session->first_stream; 5108 h2_session->first_stream = h2_stream; 5109 } 5110 5111 /** remove stream from session linked list. 
After stream close callback or 5112 * closing connection */ 5113 static void http2_session_remove_stream(struct http2_session* h2_session, 5114 struct http2_stream* h2_stream) 5115 { 5116 if(h2_stream->prev) 5117 h2_stream->prev->next = h2_stream->next; 5118 else 5119 h2_session->first_stream = h2_stream->next; 5120 if(h2_stream->next) 5121 h2_stream->next->prev = h2_stream->prev; 5122 5123 } 5124 5125 int http2_stream_close_cb(nghttp2_session* ATTR_UNUSED(session), 5126 int32_t stream_id, uint32_t ATTR_UNUSED(error_code), void* cb_arg) 5127 { 5128 struct http2_stream* h2_stream; 5129 struct http2_session* h2_session = (struct http2_session*)cb_arg; 5130 if(!(h2_stream = nghttp2_session_get_stream_user_data( 5131 h2_session->session, stream_id))) { 5132 return 0; 5133 } 5134 http2_session_remove_stream(h2_session, h2_stream); 5135 http2_stream_delete(h2_session, h2_stream); 5136 return 0; 5137 } 5138 5139 ssize_t http2_recv_cb(nghttp2_session* ATTR_UNUSED(session), uint8_t* buf, 5140 size_t len, int ATTR_UNUSED(flags), void* cb_arg) 5141 { 5142 struct http2_session* h2_session = (struct http2_session*)cb_arg; 5143 ssize_t ret; 5144 5145 log_assert(h2_session->c->type == comm_http); 5146 log_assert(h2_session->c->h2_session); 5147 5148 #ifdef HAVE_SSL 5149 if(h2_session->c->ssl) { 5150 int r; 5151 ERR_clear_error(); 5152 r = SSL_read(h2_session->c->ssl, buf, len); 5153 if(r <= 0) { 5154 int want = SSL_get_error(h2_session->c->ssl, r); 5155 if(want == SSL_ERROR_ZERO_RETURN) { 5156 return NGHTTP2_ERR_EOF; 5157 } else if(want == SSL_ERROR_WANT_READ) { 5158 return NGHTTP2_ERR_WOULDBLOCK; 5159 } else if(want == SSL_ERROR_WANT_WRITE) { 5160 h2_session->c->ssl_shake_state = comm_ssl_shake_hs_write; 5161 comm_point_listen_for_rw(h2_session->c, 0, 1); 5162 return NGHTTP2_ERR_WOULDBLOCK; 5163 } else if(want == SSL_ERROR_SYSCALL) { 5164 #ifdef ECONNRESET 5165 if(errno == ECONNRESET && verbosity < 2) 5166 return NGHTTP2_ERR_CALLBACK_FAILURE; 5167 #endif 5168 if(errno != 0) 5169 
log_err("SSL_read syscall: %s", 5170 strerror(errno)); 5171 return NGHTTP2_ERR_CALLBACK_FAILURE; 5172 } 5173 log_crypto_err_io("could not SSL_read", want); 5174 return NGHTTP2_ERR_CALLBACK_FAILURE; 5175 } 5176 return r; 5177 } 5178 #endif /* HAVE_SSL */ 5179 5180 ret = recv(h2_session->c->fd, buf, len, MSG_DONTWAIT); 5181 if(ret == 0) { 5182 return NGHTTP2_ERR_EOF; 5183 } else if(ret < 0) { 5184 #ifndef USE_WINSOCK 5185 if(errno == EINTR || errno == EAGAIN) 5186 return NGHTTP2_ERR_WOULDBLOCK; 5187 #ifdef ECONNRESET 5188 if(errno == ECONNRESET && verbosity < 2) 5189 return NGHTTP2_ERR_CALLBACK_FAILURE; 5190 #endif 5191 log_err_addr("could not http2 recv: %s", strerror(errno), 5192 &h2_session->c->repinfo.remote_addr, 5193 h2_session->c->repinfo.remote_addrlen); 5194 #else /* USE_WINSOCK */ 5195 if(WSAGetLastError() == WSAECONNRESET) 5196 return NGHTTP2_ERR_CALLBACK_FAILURE; 5197 if(WSAGetLastError() == WSAEINPROGRESS) 5198 return NGHTTP2_ERR_WOULDBLOCK; 5199 if(WSAGetLastError() == WSAEWOULDBLOCK) { 5200 ub_winsock_tcp_wouldblock(h2_session->c->ev->ev, 5201 UB_EV_READ); 5202 return NGHTTP2_ERR_WOULDBLOCK; 5203 } 5204 log_err_addr("could not http2 recv: %s", 5205 wsa_strerror(WSAGetLastError()), 5206 &h2_session->c->repinfo.remote_addr, 5207 h2_session->c->repinfo.remote_addrlen); 5208 #endif 5209 return NGHTTP2_ERR_CALLBACK_FAILURE; 5210 } 5211 return ret; 5212 } 5213 #endif /* HAVE_NGHTTP2 */ 5214 5215 /** Handle http2 read */ 5216 static int 5217 comm_point_http2_handle_read(int ATTR_UNUSED(fd), struct comm_point* c) 5218 { 5219 #ifdef HAVE_NGHTTP2 5220 int ret; 5221 log_assert(c->h2_session); 5222 5223 /* reading until recv cb returns NGHTTP2_ERR_WOULDBLOCK */ 5224 ret = nghttp2_session_recv(c->h2_session->session); 5225 if(ret) { 5226 if(ret != NGHTTP2_ERR_EOF && 5227 ret != NGHTTP2_ERR_CALLBACK_FAILURE) { 5228 char a[256]; 5229 addr_to_str(&c->repinfo.remote_addr, 5230 c->repinfo.remote_addrlen, a, sizeof(a)); 5231 verbose(VERB_QUERY, "http2: session_recv from 
%s failed, " 5232 "error: %s", a, nghttp2_strerror(ret)); 5233 } 5234 return 0; 5235 } 5236 if(nghttp2_session_want_write(c->h2_session->session)) { 5237 c->tcp_is_reading = 0; 5238 comm_point_stop_listening(c); 5239 comm_point_start_listening(c, -1, adjusted_tcp_timeout(c)); 5240 } else if(!nghttp2_session_want_read(c->h2_session->session)) 5241 return 0; /* connection can be closed */ 5242 return 1; 5243 #else 5244 (void)c; 5245 return 0; 5246 #endif 5247 } 5248 5249 /** 5250 * Handle http reading callback. 5251 * @param fd: file descriptor of socket. 5252 * @param c: comm point to read from into buffer. 5253 * @return: 0 on error 5254 */ 5255 static int 5256 comm_point_http_handle_read(int fd, struct comm_point* c) 5257 { 5258 log_assert(c->type == comm_http); 5259 log_assert(fd != -1); 5260 5261 /* if we are in ssl handshake, handle SSL handshake */ 5262 #ifdef HAVE_SSL 5263 if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) { 5264 if(!ssl_handshake(c)) 5265 return 0; 5266 if(c->ssl_shake_state != comm_ssl_shake_none) 5267 return 1; 5268 } 5269 #endif /* HAVE_SSL */ 5270 5271 if(!c->tcp_is_reading) 5272 return 1; 5273 5274 if(c->use_h2) { 5275 return comm_point_http2_handle_read(fd, c); 5276 } 5277 5278 /* http version is <= http/1.1 */ 5279 5280 if(c->http_min_version >= http_version_2) { 5281 /* HTTP/2 failed, not allowed to use lower version. */ 5282 return 0; 5283 } 5284 5285 /* read more data */ 5286 if(c->ssl) { 5287 if(!ssl_http_read_more(c)) 5288 return 0; 5289 } else { 5290 if(!http_read_more(fd, c)) 5291 return 0; 5292 } 5293 5294 if(c->http_stored >= sldns_buffer_position(c->buffer)) { 5295 /* read did not work but we wanted more data, there is 5296 * no bytes to process now. 
*/ 5297 return 1; 5298 } 5299 sldns_buffer_flip(c->buffer); 5300 /* if we are partway in a segment of data, position us at the point 5301 * where we left off previously */ 5302 if(c->http_stored < sldns_buffer_limit(c->buffer)) 5303 sldns_buffer_set_position(c->buffer, c->http_stored); 5304 else sldns_buffer_set_position(c->buffer, sldns_buffer_limit(c->buffer)); 5305 5306 while(sldns_buffer_remaining(c->buffer) > 0) { 5307 /* Handle HTTP/1.x data */ 5308 /* if we are reading headers, read more headers */ 5309 if(c->http_in_headers || c->http_in_chunk_headers) { 5310 /* if header is done, process the header */ 5311 if(!http_header_done(c->buffer)) { 5312 /* copy remaining data to front of buffer 5313 * and set rest for writing into it */ 5314 http_moveover_buffer(c->buffer); 5315 /* return and wait to read more */ 5316 return 1; 5317 } 5318 if(!c->http_in_chunk_headers) { 5319 /* process initial headers */ 5320 if(!http_process_initial_header(c)) 5321 return 0; 5322 } else { 5323 /* process chunk headers */ 5324 int r = http_process_chunk_header(c); 5325 if(r == 0) return 0; 5326 if(r == 2) return 1; /* done */ 5327 /* r == 1, continue */ 5328 } 5329 /* see if we have more to process */ 5330 continue; 5331 } 5332 5333 if(!c->http_is_chunked) { 5334 /* if we are reading nonchunks, process that*/ 5335 return http_nonchunk_segment(c); 5336 } else { 5337 /* if we are reading chunks, read the chunk */ 5338 int r = http_chunked_segment(c); 5339 if(r == 0) return 0; 5340 if(r == 1) return 1; 5341 continue; 5342 } 5343 } 5344 /* broke out of the loop; could not process header instead need 5345 * to read more */ 5346 /* moveover any remaining data and read more data */ 5347 http_moveover_buffer(c->buffer); 5348 /* return and wait to read more */ 5349 return 1; 5350 } 5351 5352 /** check pending connect for http */ 5353 static int 5354 http_check_connect(int fd, struct comm_point* c) 5355 { 5356 /* check for pending error from nonblocking connect */ 5357 /* from Stevens, 
unix network programming, vol1, 3rd ed, p450*/ 5358 int error = 0; 5359 socklen_t len = (socklen_t)sizeof(error); 5360 if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error, 5361 &len) < 0){ 5362 #ifndef USE_WINSOCK 5363 error = errno; /* on solaris errno is error */ 5364 #else /* USE_WINSOCK */ 5365 error = WSAGetLastError(); 5366 #endif 5367 } 5368 #ifndef USE_WINSOCK 5369 #if defined(EINPROGRESS) && defined(EWOULDBLOCK) 5370 if(error == EINPROGRESS || error == EWOULDBLOCK) 5371 return 1; /* try again later */ 5372 else 5373 #endif 5374 if(error != 0 && verbosity < 2) 5375 return 0; /* silence lots of chatter in the logs */ 5376 else if(error != 0) { 5377 log_err_addr("http connect", strerror(error), 5378 &c->repinfo.remote_addr, c->repinfo.remote_addrlen); 5379 #else /* USE_WINSOCK */ 5380 /* examine error */ 5381 if(error == WSAEINPROGRESS) 5382 return 1; 5383 else if(error == WSAEWOULDBLOCK) { 5384 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); 5385 return 1; 5386 } else if(error != 0 && verbosity < 2) 5387 return 0; 5388 else if(error != 0) { 5389 log_err_addr("http connect", wsa_strerror(error), 5390 &c->repinfo.remote_addr, c->repinfo.remote_addrlen); 5391 #endif /* USE_WINSOCK */ 5392 return 0; 5393 } 5394 /* keep on processing this socket */ 5395 return 2; 5396 } 5397 5398 /** write more data for http (with ssl) */ 5399 static int 5400 ssl_http_write_more(struct comm_point* c) 5401 { 5402 #ifdef HAVE_SSL 5403 int r; 5404 log_assert(sldns_buffer_remaining(c->buffer) > 0); 5405 ERR_clear_error(); 5406 r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer), 5407 (int)sldns_buffer_remaining(c->buffer)); 5408 if(r <= 0) { 5409 int want = SSL_get_error(c->ssl, r); 5410 if(want == SSL_ERROR_ZERO_RETURN) { 5411 return 0; /* closed */ 5412 } else if(want == SSL_ERROR_WANT_READ) { 5413 c->ssl_shake_state = comm_ssl_shake_hs_read; 5414 comm_point_listen_for_rw(c, 1, 0); 5415 return 1; /* wait for read condition */ 5416 } else if(want == 
SSL_ERROR_WANT_WRITE) { 5417 return 1; /* write more later */ 5418 } else if(want == SSL_ERROR_SYSCALL) { 5419 #ifdef EPIPE 5420 if(errno == EPIPE && verbosity < 2) 5421 return 0; /* silence 'broken pipe' */ 5422 #endif 5423 if(errno != 0) 5424 log_err("SSL_write syscall: %s", 5425 strerror(errno)); 5426 return 0; 5427 } 5428 log_crypto_err_io("could not SSL_write", want); 5429 return 0; 5430 } 5431 sldns_buffer_skip(c->buffer, (ssize_t)r); 5432 return 1; 5433 #else 5434 (void)c; 5435 return 0; 5436 #endif /* HAVE_SSL */ 5437 } 5438 5439 /** write more data for http */ 5440 static int 5441 http_write_more(int fd, struct comm_point* c) 5442 { 5443 ssize_t r; 5444 log_assert(sldns_buffer_remaining(c->buffer) > 0); 5445 r = send(fd, (void*)sldns_buffer_current(c->buffer), 5446 sldns_buffer_remaining(c->buffer), 0); 5447 if(r == -1) { 5448 #ifndef USE_WINSOCK 5449 if(errno == EINTR || errno == EAGAIN) 5450 return 1; 5451 #else 5452 if(WSAGetLastError() == WSAEINPROGRESS) 5453 return 1; 5454 if(WSAGetLastError() == WSAEWOULDBLOCK) { 5455 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); 5456 return 1; 5457 } 5458 #endif 5459 log_err_addr("http send r", sock_strerror(errno), 5460 &c->repinfo.remote_addr, c->repinfo.remote_addrlen); 5461 return 0; 5462 } 5463 sldns_buffer_skip(c->buffer, r); 5464 return 1; 5465 } 5466 5467 #ifdef HAVE_NGHTTP2 5468 ssize_t http2_send_cb(nghttp2_session* ATTR_UNUSED(session), const uint8_t* buf, 5469 size_t len, int ATTR_UNUSED(flags), void* cb_arg) 5470 { 5471 ssize_t ret; 5472 struct http2_session* h2_session = (struct http2_session*)cb_arg; 5473 log_assert(h2_session->c->type == comm_http); 5474 log_assert(h2_session->c->h2_session); 5475 5476 #ifdef HAVE_SSL 5477 if(h2_session->c->ssl) { 5478 int r; 5479 ERR_clear_error(); 5480 r = SSL_write(h2_session->c->ssl, buf, len); 5481 if(r <= 0) { 5482 int want = SSL_get_error(h2_session->c->ssl, r); 5483 if(want == SSL_ERROR_ZERO_RETURN) { 5484 return NGHTTP2_ERR_CALLBACK_FAILURE; 5485 } else 
if(want == SSL_ERROR_WANT_READ) { 5486 h2_session->c->ssl_shake_state = comm_ssl_shake_hs_read; 5487 comm_point_listen_for_rw(h2_session->c, 1, 0); 5488 return NGHTTP2_ERR_WOULDBLOCK; 5489 } else if(want == SSL_ERROR_WANT_WRITE) { 5490 return NGHTTP2_ERR_WOULDBLOCK; 5491 } else if(want == SSL_ERROR_SYSCALL) { 5492 #ifdef EPIPE 5493 if(errno == EPIPE && verbosity < 2) 5494 return NGHTTP2_ERR_CALLBACK_FAILURE; 5495 #endif 5496 if(errno != 0) 5497 log_err("SSL_write syscall: %s", 5498 strerror(errno)); 5499 return NGHTTP2_ERR_CALLBACK_FAILURE; 5500 } 5501 log_crypto_err_io("could not SSL_write", want); 5502 return NGHTTP2_ERR_CALLBACK_FAILURE; 5503 } 5504 return r; 5505 } 5506 #endif /* HAVE_SSL */ 5507 5508 ret = send(h2_session->c->fd, buf, len, 0); 5509 if(ret == 0) { 5510 return NGHTTP2_ERR_CALLBACK_FAILURE; 5511 } else if(ret < 0) { 5512 #ifndef USE_WINSOCK 5513 if(errno == EINTR || errno == EAGAIN) 5514 return NGHTTP2_ERR_WOULDBLOCK; 5515 #ifdef EPIPE 5516 if(errno == EPIPE && verbosity < 2) 5517 return NGHTTP2_ERR_CALLBACK_FAILURE; 5518 #endif 5519 #ifdef ECONNRESET 5520 if(errno == ECONNRESET && verbosity < 2) 5521 return NGHTTP2_ERR_CALLBACK_FAILURE; 5522 #endif 5523 log_err_addr("could not http2 write: %s", strerror(errno), 5524 &h2_session->c->repinfo.remote_addr, 5525 h2_session->c->repinfo.remote_addrlen); 5526 #else /* USE_WINSOCK */ 5527 if(WSAGetLastError() == WSAENOTCONN) 5528 return NGHTTP2_ERR_WOULDBLOCK; 5529 if(WSAGetLastError() == WSAEINPROGRESS) 5530 return NGHTTP2_ERR_WOULDBLOCK; 5531 if(WSAGetLastError() == WSAEWOULDBLOCK) { 5532 ub_winsock_tcp_wouldblock(h2_session->c->ev->ev, 5533 UB_EV_WRITE); 5534 return NGHTTP2_ERR_WOULDBLOCK; 5535 } 5536 if(WSAGetLastError() == WSAECONNRESET && verbosity < 2) 5537 return NGHTTP2_ERR_CALLBACK_FAILURE; 5538 log_err_addr("could not http2 write: %s", 5539 wsa_strerror(WSAGetLastError()), 5540 &h2_session->c->repinfo.remote_addr, 5541 h2_session->c->repinfo.remote_addrlen); 5542 #endif 5543 return 
NGHTTP2_ERR_CALLBACK_FAILURE; 5544 } 5545 return ret; 5546 } 5547 #endif /* HAVE_NGHTTP2 */ 5548 5549 /** Handle http2 writing */ 5550 static int 5551 comm_point_http2_handle_write(int ATTR_UNUSED(fd), struct comm_point* c) 5552 { 5553 #ifdef HAVE_NGHTTP2 5554 int ret; 5555 log_assert(c->h2_session); 5556 5557 ret = nghttp2_session_send(c->h2_session->session); 5558 if(ret) { 5559 verbose(VERB_QUERY, "http2: session_send failed, " 5560 "error: %s", nghttp2_strerror(ret)); 5561 return 0; 5562 } 5563 5564 if(nghttp2_session_want_read(c->h2_session->session)) { 5565 c->tcp_is_reading = 1; 5566 comm_point_stop_listening(c); 5567 comm_point_start_listening(c, -1, adjusted_tcp_timeout(c)); 5568 } else if(!nghttp2_session_want_write(c->h2_session->session)) 5569 return 0; /* connection can be closed */ 5570 return 1; 5571 #else 5572 (void)c; 5573 return 0; 5574 #endif 5575 } 5576 5577 /** 5578 * Handle http writing callback. 5579 * @param fd: file descriptor of socket. 5580 * @param c: comm point to write buffer out of. 
5581 * @return: 0 on error 5582 */ 5583 static int 5584 comm_point_http_handle_write(int fd, struct comm_point* c) 5585 { 5586 log_assert(c->type == comm_http); 5587 log_assert(fd != -1); 5588 5589 /* check pending connect errors, if that fails, we wait for more, 5590 * or we can continue to write contents */ 5591 if(c->tcp_check_nb_connect) { 5592 int r = http_check_connect(fd, c); 5593 if(r == 0) return 0; 5594 if(r == 1) return 1; 5595 c->tcp_check_nb_connect = 0; 5596 } 5597 /* if we are in ssl handshake, handle SSL handshake */ 5598 #ifdef HAVE_SSL 5599 if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) { 5600 if(!ssl_handshake(c)) 5601 return 0; 5602 if(c->ssl_shake_state != comm_ssl_shake_none) 5603 return 1; 5604 } 5605 #endif /* HAVE_SSL */ 5606 if(c->tcp_is_reading) 5607 return 1; 5608 5609 if(c->use_h2) { 5610 return comm_point_http2_handle_write(fd, c); 5611 } 5612 5613 /* http version is <= http/1.1 */ 5614 5615 if(c->http_min_version >= http_version_2) { 5616 /* HTTP/2 failed, not allowed to use lower version. 
*/ 5617 return 0; 5618 } 5619 5620 /* if we are writing, write more */ 5621 if(c->ssl) { 5622 if(!ssl_http_write_more(c)) 5623 return 0; 5624 } else { 5625 if(!http_write_more(fd, c)) 5626 return 0; 5627 } 5628 5629 /* we write a single buffer contents, that can contain 5630 * the http request, and then flip to read the results */ 5631 /* see if write is done */ 5632 if(sldns_buffer_remaining(c->buffer) == 0) { 5633 sldns_buffer_clear(c->buffer); 5634 if(c->tcp_do_toggle_rw) 5635 c->tcp_is_reading = 1; 5636 c->tcp_byte_count = 0; 5637 /* switch from listening(write) to listening(read) */ 5638 comm_point_stop_listening(c); 5639 comm_point_start_listening(c, -1, -1); 5640 } 5641 return 1; 5642 } 5643 5644 void 5645 comm_point_http_handle_callback(int fd, short event, void* arg) 5646 { 5647 struct comm_point* c = (struct comm_point*)arg; 5648 log_assert(c->type == comm_http); 5649 ub_comm_base_now(c->ev->base); 5650 5651 if(event&UB_EV_TIMEOUT) { 5652 verbose(VERB_QUERY, "http took too long, dropped"); 5653 reclaim_http_handler(c); 5654 if(!c->tcp_do_close) { 5655 fptr_ok(fptr_whitelist_comm_point(c->callback)); 5656 (void)(*c->callback)(c, c->cb_arg, 5657 NETEVENT_TIMEOUT, NULL); 5658 } 5659 return; 5660 } 5661 if(event&UB_EV_READ) { 5662 if(!comm_point_http_handle_read(fd, c)) { 5663 reclaim_http_handler(c); 5664 if(!c->tcp_do_close) { 5665 fptr_ok(fptr_whitelist_comm_point( 5666 c->callback)); 5667 (void)(*c->callback)(c, c->cb_arg, 5668 NETEVENT_CLOSED, NULL); 5669 } 5670 } 5671 return; 5672 } 5673 if(event&UB_EV_WRITE) { 5674 if(!comm_point_http_handle_write(fd, c)) { 5675 reclaim_http_handler(c); 5676 if(!c->tcp_do_close) { 5677 fptr_ok(fptr_whitelist_comm_point( 5678 c->callback)); 5679 (void)(*c->callback)(c, c->cb_arg, 5680 NETEVENT_CLOSED, NULL); 5681 } 5682 } 5683 return; 5684 } 5685 log_err("Ignored event %d for httphdl.", event); 5686 } 5687 5688 void comm_point_local_handle_callback(int fd, short event, void* arg) 5689 { 5690 struct comm_point* c = 
(struct comm_point*)arg; 5691 log_assert(c->type == comm_local); 5692 ub_comm_base_now(c->ev->base); 5693 5694 if(event&UB_EV_READ) { 5695 if(!comm_point_tcp_handle_read(fd, c, 1)) { 5696 fptr_ok(fptr_whitelist_comm_point(c->callback)); 5697 (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, 5698 NULL); 5699 } 5700 return; 5701 } 5702 log_err("Ignored event %d for localhdl.", event); 5703 } 5704 5705 void comm_point_raw_handle_callback(int ATTR_UNUSED(fd), 5706 short event, void* arg) 5707 { 5708 struct comm_point* c = (struct comm_point*)arg; 5709 int err = NETEVENT_NOERROR; 5710 log_assert(c->type == comm_raw); 5711 ub_comm_base_now(c->ev->base); 5712 5713 if(event&UB_EV_TIMEOUT) 5714 err = NETEVENT_TIMEOUT; 5715 fptr_ok(fptr_whitelist_comm_point_raw(c->callback)); 5716 (void)(*c->callback)(c, c->cb_arg, err, NULL); 5717 } 5718 5719 struct comm_point* 5720 comm_point_create_udp(struct comm_base *base, int fd, sldns_buffer* buffer, 5721 int pp2_enabled, comm_point_callback_type* callback, 5722 void* callback_arg, struct unbound_socket* socket) 5723 { 5724 struct comm_point* c = (struct comm_point*)calloc(1, 5725 sizeof(struct comm_point)); 5726 short evbits; 5727 if(!c) 5728 return NULL; 5729 c->ev = (struct internal_event*)calloc(1, 5730 sizeof(struct internal_event)); 5731 if(!c->ev) { 5732 free(c); 5733 return NULL; 5734 } 5735 c->ev->base = base; 5736 c->fd = fd; 5737 c->buffer = buffer; 5738 c->timeout = NULL; 5739 c->tcp_is_reading = 0; 5740 c->tcp_byte_count = 0; 5741 c->tcp_parent = NULL; 5742 c->max_tcp_count = 0; 5743 c->cur_tcp_count = 0; 5744 c->tcp_handlers = NULL; 5745 c->tcp_free = NULL; 5746 c->type = comm_udp; 5747 c->tcp_do_close = 0; 5748 c->do_not_close = 0; 5749 c->tcp_do_toggle_rw = 0; 5750 c->tcp_check_nb_connect = 0; 5751 #ifdef USE_MSG_FASTOPEN 5752 c->tcp_do_fastopen = 0; 5753 #endif 5754 #ifdef USE_DNSCRYPT 5755 c->dnscrypt = 0; 5756 c->dnscrypt_buffer = buffer; 5757 #endif 5758 c->inuse = 0; 5759 c->callback = callback; 5760 c->cb_arg 
= callback_arg; 5761 c->socket = socket; 5762 c->pp2_enabled = pp2_enabled; 5763 c->pp2_header_state = pp2_header_none; 5764 evbits = UB_EV_READ | UB_EV_PERSIST; 5765 /* ub_event stuff */ 5766 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 5767 comm_point_udp_callback, c); 5768 if(c->ev->ev == NULL) { 5769 log_err("could not baseset udp event"); 5770 comm_point_delete(c); 5771 return NULL; 5772 } 5773 if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) { 5774 log_err("could not add udp event"); 5775 comm_point_delete(c); 5776 return NULL; 5777 } 5778 c->event_added = 1; 5779 return c; 5780 } 5781 5782 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG) 5783 struct comm_point* 5784 comm_point_create_udp_ancil(struct comm_base *base, int fd, 5785 sldns_buffer* buffer, int pp2_enabled, 5786 comm_point_callback_type* callback, void* callback_arg, struct unbound_socket* socket) 5787 { 5788 struct comm_point* c = (struct comm_point*)calloc(1, 5789 sizeof(struct comm_point)); 5790 short evbits; 5791 if(!c) 5792 return NULL; 5793 c->ev = (struct internal_event*)calloc(1, 5794 sizeof(struct internal_event)); 5795 if(!c->ev) { 5796 free(c); 5797 return NULL; 5798 } 5799 c->ev->base = base; 5800 c->fd = fd; 5801 c->buffer = buffer; 5802 c->timeout = NULL; 5803 c->tcp_is_reading = 0; 5804 c->tcp_byte_count = 0; 5805 c->tcp_parent = NULL; 5806 c->max_tcp_count = 0; 5807 c->cur_tcp_count = 0; 5808 c->tcp_handlers = NULL; 5809 c->tcp_free = NULL; 5810 c->type = comm_udp; 5811 c->tcp_do_close = 0; 5812 c->do_not_close = 0; 5813 #ifdef USE_DNSCRYPT 5814 c->dnscrypt = 0; 5815 c->dnscrypt_buffer = buffer; 5816 #endif 5817 c->inuse = 0; 5818 c->tcp_do_toggle_rw = 0; 5819 c->tcp_check_nb_connect = 0; 5820 #ifdef USE_MSG_FASTOPEN 5821 c->tcp_do_fastopen = 0; 5822 #endif 5823 c->callback = callback; 5824 c->cb_arg = callback_arg; 5825 c->socket = socket; 5826 c->pp2_enabled = pp2_enabled; 5827 c->pp2_header_state = pp2_header_none; 5828 evbits = 
UB_EV_READ | UB_EV_PERSIST; 5829 /* ub_event stuff */ 5830 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 5831 comm_point_udp_ancil_callback, c); 5832 if(c->ev->ev == NULL) { 5833 log_err("could not baseset udp event"); 5834 comm_point_delete(c); 5835 return NULL; 5836 } 5837 if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) { 5838 log_err("could not add udp event"); 5839 comm_point_delete(c); 5840 return NULL; 5841 } 5842 c->event_added = 1; 5843 return c; 5844 } 5845 #endif 5846 5847 struct comm_point* 5848 comm_point_create_doq(struct comm_base *base, int fd, sldns_buffer* buffer, 5849 comm_point_callback_type* callback, void* callback_arg, 5850 struct unbound_socket* socket, struct doq_table* table, 5851 struct ub_randstate* rnd, const void* quic_sslctx, 5852 struct config_file* cfg) 5853 { 5854 #ifdef HAVE_NGTCP2 5855 struct comm_point* c = (struct comm_point*)calloc(1, 5856 sizeof(struct comm_point)); 5857 short evbits; 5858 if(!c) 5859 return NULL; 5860 c->ev = (struct internal_event*)calloc(1, 5861 sizeof(struct internal_event)); 5862 if(!c->ev) { 5863 free(c); 5864 return NULL; 5865 } 5866 c->ev->base = base; 5867 c->fd = fd; 5868 c->buffer = buffer; 5869 c->timeout = NULL; 5870 c->tcp_is_reading = 0; 5871 c->tcp_byte_count = 0; 5872 c->tcp_parent = NULL; 5873 c->max_tcp_count = 0; 5874 c->cur_tcp_count = 0; 5875 c->tcp_handlers = NULL; 5876 c->tcp_free = NULL; 5877 c->type = comm_doq; 5878 c->tcp_do_close = 0; 5879 c->do_not_close = 0; 5880 c->tcp_do_toggle_rw = 0; 5881 c->tcp_check_nb_connect = 0; 5882 #ifdef USE_MSG_FASTOPEN 5883 c->tcp_do_fastopen = 0; 5884 #endif 5885 #ifdef USE_DNSCRYPT 5886 c->dnscrypt = 0; 5887 c->dnscrypt_buffer = NULL; 5888 #endif 5889 c->doq_socket = doq_server_socket_create(table, rnd, quic_sslctx, c, 5890 base, cfg); 5891 if(!c->doq_socket) { 5892 log_err("could not create doq comm_point"); 5893 comm_point_delete(c); 5894 return NULL; 5895 } 5896 c->inuse = 0; 5897 c->callback = callback; 5898 c->cb_arg = 
callback_arg; 5899 c->socket = socket; 5900 c->pp2_enabled = 0; 5901 c->pp2_header_state = pp2_header_none; 5902 evbits = UB_EV_READ | UB_EV_PERSIST; 5903 /* ub_event stuff */ 5904 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 5905 comm_point_doq_callback, c); 5906 if(c->ev->ev == NULL) { 5907 log_err("could not baseset udp event"); 5908 comm_point_delete(c); 5909 return NULL; 5910 } 5911 if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) { 5912 log_err("could not add udp event"); 5913 comm_point_delete(c); 5914 return NULL; 5915 } 5916 c->event_added = 1; 5917 return c; 5918 #else 5919 /* no libngtcp2, so no QUIC support */ 5920 (void)base; 5921 (void)buffer; 5922 (void)callback; 5923 (void)callback_arg; 5924 (void)socket; 5925 (void)rnd; 5926 (void)table; 5927 (void)quic_sslctx; 5928 (void)cfg; 5929 sock_close(fd); 5930 return NULL; 5931 #endif /* HAVE_NGTCP2 */ 5932 } 5933 5934 static struct comm_point* 5935 comm_point_create_tcp_handler(struct comm_base *base, 5936 struct comm_point* parent, size_t bufsize, 5937 struct sldns_buffer* spoolbuf, comm_point_callback_type* callback, 5938 void* callback_arg, struct unbound_socket* socket) 5939 { 5940 struct comm_point* c = (struct comm_point*)calloc(1, 5941 sizeof(struct comm_point)); 5942 short evbits; 5943 if(!c) 5944 return NULL; 5945 c->ev = (struct internal_event*)calloc(1, 5946 sizeof(struct internal_event)); 5947 if(!c->ev) { 5948 free(c); 5949 return NULL; 5950 } 5951 c->ev->base = base; 5952 c->fd = -1; 5953 c->buffer = sldns_buffer_new(bufsize); 5954 if(!c->buffer) { 5955 free(c->ev); 5956 free(c); 5957 return NULL; 5958 } 5959 c->timeout = (struct timeval*)malloc(sizeof(struct timeval)); 5960 if(!c->timeout) { 5961 sldns_buffer_free(c->buffer); 5962 free(c->ev); 5963 free(c); 5964 return NULL; 5965 } 5966 c->tcp_is_reading = 0; 5967 c->tcp_byte_count = 0; 5968 c->tcp_parent = parent; 5969 c->tcp_timeout_msec = parent->tcp_timeout_msec; 5970 c->tcp_conn_limit = parent->tcp_conn_limit; 5971 
c->tcl_addr = NULL; 5972 c->tcp_keepalive = 0; 5973 c->max_tcp_count = 0; 5974 c->cur_tcp_count = 0; 5975 c->tcp_handlers = NULL; 5976 c->tcp_free = NULL; 5977 c->type = comm_tcp; 5978 c->tcp_do_close = 0; 5979 c->do_not_close = 0; 5980 c->tcp_do_toggle_rw = 1; 5981 c->tcp_check_nb_connect = 0; 5982 #ifdef USE_MSG_FASTOPEN 5983 c->tcp_do_fastopen = 0; 5984 #endif 5985 #ifdef USE_DNSCRYPT 5986 c->dnscrypt = 0; 5987 /* We don't know just yet if this is a dnscrypt channel. Allocation 5988 * will be done when handling the callback. */ 5989 c->dnscrypt_buffer = c->buffer; 5990 #endif 5991 c->repinfo.c = c; 5992 c->callback = callback; 5993 c->cb_arg = callback_arg; 5994 c->socket = socket; 5995 c->pp2_enabled = parent->pp2_enabled; 5996 c->pp2_header_state = pp2_header_none; 5997 if(spoolbuf) { 5998 c->tcp_req_info = tcp_req_info_create(spoolbuf); 5999 if(!c->tcp_req_info) { 6000 log_err("could not create tcp commpoint"); 6001 sldns_buffer_free(c->buffer); 6002 free(c->timeout); 6003 free(c->ev); 6004 free(c); 6005 return NULL; 6006 } 6007 c->tcp_req_info->cp = c; 6008 c->tcp_do_close = 1; 6009 c->tcp_do_toggle_rw = 0; 6010 } 6011 /* add to parent free list */ 6012 c->tcp_free = parent->tcp_free; 6013 parent->tcp_free = c; 6014 /* ub_event stuff */ 6015 evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT; 6016 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6017 comm_point_tcp_handle_callback, c); 6018 if(c->ev->ev == NULL) 6019 { 6020 log_err("could not basetset tcphdl event"); 6021 parent->tcp_free = c->tcp_free; 6022 tcp_req_info_delete(c->tcp_req_info); 6023 sldns_buffer_free(c->buffer); 6024 free(c->timeout); 6025 free(c->ev); 6026 free(c); 6027 return NULL; 6028 } 6029 return c; 6030 } 6031 6032 static struct comm_point* 6033 comm_point_create_http_handler(struct comm_base *base, 6034 struct comm_point* parent, size_t bufsize, int harden_large_queries, 6035 uint32_t http_max_streams, char* http_endpoint, 6036 comm_point_callback_type* callback, void* 
callback_arg, 6037 struct unbound_socket* socket) 6038 { 6039 struct comm_point* c = (struct comm_point*)calloc(1, 6040 sizeof(struct comm_point)); 6041 short evbits; 6042 if(!c) 6043 return NULL; 6044 c->ev = (struct internal_event*)calloc(1, 6045 sizeof(struct internal_event)); 6046 if(!c->ev) { 6047 free(c); 6048 return NULL; 6049 } 6050 c->ev->base = base; 6051 c->fd = -1; 6052 c->buffer = sldns_buffer_new(bufsize); 6053 if(!c->buffer) { 6054 free(c->ev); 6055 free(c); 6056 return NULL; 6057 } 6058 c->timeout = (struct timeval*)malloc(sizeof(struct timeval)); 6059 if(!c->timeout) { 6060 sldns_buffer_free(c->buffer); 6061 free(c->ev); 6062 free(c); 6063 return NULL; 6064 } 6065 c->tcp_is_reading = 0; 6066 c->tcp_byte_count = 0; 6067 c->tcp_parent = parent; 6068 c->tcp_timeout_msec = parent->tcp_timeout_msec; 6069 c->tcp_conn_limit = parent->tcp_conn_limit; 6070 c->tcl_addr = NULL; 6071 c->tcp_keepalive = 0; 6072 c->max_tcp_count = 0; 6073 c->cur_tcp_count = 0; 6074 c->tcp_handlers = NULL; 6075 c->tcp_free = NULL; 6076 c->type = comm_http; 6077 c->tcp_do_close = 1; 6078 c->do_not_close = 0; 6079 c->tcp_do_toggle_rw = 1; /* will be set to 0 after http2 upgrade */ 6080 c->tcp_check_nb_connect = 0; 6081 #ifdef USE_MSG_FASTOPEN 6082 c->tcp_do_fastopen = 0; 6083 #endif 6084 #ifdef USE_DNSCRYPT 6085 c->dnscrypt = 0; 6086 c->dnscrypt_buffer = NULL; 6087 #endif 6088 c->repinfo.c = c; 6089 c->callback = callback; 6090 c->cb_arg = callback_arg; 6091 c->socket = socket; 6092 c->pp2_enabled = 0; 6093 c->pp2_header_state = pp2_header_none; 6094 6095 c->http_min_version = http_version_2; 6096 c->http2_stream_max_qbuffer_size = bufsize; 6097 if(harden_large_queries && bufsize > 512) 6098 c->http2_stream_max_qbuffer_size = 512; 6099 c->http2_max_streams = http_max_streams; 6100 if(!(c->http_endpoint = strdup(http_endpoint))) { 6101 log_err("could not strdup http_endpoint"); 6102 sldns_buffer_free(c->buffer); 6103 free(c->timeout); 6104 free(c->ev); 6105 free(c); 6106 return 
NULL; 6107 } 6108 c->use_h2 = 0; 6109 #ifdef HAVE_NGHTTP2 6110 if(!(c->h2_session = http2_session_create(c))) { 6111 log_err("could not create http2 session"); 6112 free(c->http_endpoint); 6113 sldns_buffer_free(c->buffer); 6114 free(c->timeout); 6115 free(c->ev); 6116 free(c); 6117 return NULL; 6118 } 6119 if(!(c->h2_session->callbacks = http2_req_callbacks_create())) { 6120 log_err("could not create http2 callbacks"); 6121 http2_session_delete(c->h2_session); 6122 free(c->http_endpoint); 6123 sldns_buffer_free(c->buffer); 6124 free(c->timeout); 6125 free(c->ev); 6126 free(c); 6127 return NULL; 6128 } 6129 #endif 6130 6131 /* add to parent free list */ 6132 c->tcp_free = parent->tcp_free; 6133 parent->tcp_free = c; 6134 /* ub_event stuff */ 6135 evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT; 6136 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6137 comm_point_http_handle_callback, c); 6138 if(c->ev->ev == NULL) 6139 { 6140 log_err("could not set http handler event"); 6141 parent->tcp_free = c->tcp_free; 6142 http2_session_delete(c->h2_session); 6143 sldns_buffer_free(c->buffer); 6144 free(c->timeout); 6145 free(c->ev); 6146 free(c); 6147 return NULL; 6148 } 6149 return c; 6150 } 6151 6152 struct comm_point* 6153 comm_point_create_tcp(struct comm_base *base, int fd, int num, 6154 int idle_timeout, int harden_large_queries, 6155 uint32_t http_max_streams, char* http_endpoint, 6156 struct tcl_list* tcp_conn_limit, size_t bufsize, 6157 struct sldns_buffer* spoolbuf, enum listen_type port_type, 6158 int pp2_enabled, comm_point_callback_type* callback, 6159 void* callback_arg, struct unbound_socket* socket) 6160 { 6161 struct comm_point* c = (struct comm_point*)calloc(1, 6162 sizeof(struct comm_point)); 6163 short evbits; 6164 int i; 6165 /* first allocate the TCP accept listener */ 6166 if(!c) 6167 return NULL; 6168 c->ev = (struct internal_event*)calloc(1, 6169 sizeof(struct internal_event)); 6170 if(!c->ev) { 6171 free(c); 6172 return NULL; 6173 } 6174 
c->ev->base = base; 6175 c->fd = fd; 6176 c->buffer = NULL; 6177 c->timeout = NULL; 6178 c->tcp_is_reading = 0; 6179 c->tcp_byte_count = 0; 6180 c->tcp_timeout_msec = idle_timeout; 6181 c->tcp_conn_limit = tcp_conn_limit; 6182 c->tcl_addr = NULL; 6183 c->tcp_keepalive = 0; 6184 c->tcp_parent = NULL; 6185 c->max_tcp_count = num; 6186 c->cur_tcp_count = 0; 6187 c->tcp_handlers = (struct comm_point**)calloc((size_t)num, 6188 sizeof(struct comm_point*)); 6189 if(!c->tcp_handlers) { 6190 free(c->ev); 6191 free(c); 6192 return NULL; 6193 } 6194 c->tcp_free = NULL; 6195 c->type = comm_tcp_accept; 6196 c->tcp_do_close = 0; 6197 c->do_not_close = 0; 6198 c->tcp_do_toggle_rw = 0; 6199 c->tcp_check_nb_connect = 0; 6200 #ifdef USE_MSG_FASTOPEN 6201 c->tcp_do_fastopen = 0; 6202 #endif 6203 #ifdef USE_DNSCRYPT 6204 c->dnscrypt = 0; 6205 c->dnscrypt_buffer = NULL; 6206 #endif 6207 c->callback = NULL; 6208 c->cb_arg = NULL; 6209 c->socket = socket; 6210 c->pp2_enabled = (port_type==listen_type_http?0:pp2_enabled); 6211 c->pp2_header_state = pp2_header_none; 6212 evbits = UB_EV_READ | UB_EV_PERSIST; 6213 /* ub_event stuff */ 6214 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6215 comm_point_tcp_accept_callback, c); 6216 if(c->ev->ev == NULL) { 6217 log_err("could not baseset tcpacc event"); 6218 comm_point_delete(c); 6219 return NULL; 6220 } 6221 if (ub_event_add(c->ev->ev, c->timeout) != 0) { 6222 log_err("could not add tcpacc event"); 6223 comm_point_delete(c); 6224 return NULL; 6225 } 6226 c->event_added = 1; 6227 /* now prealloc the handlers */ 6228 for(i=0; i<num; i++) { 6229 if(port_type == listen_type_tcp || 6230 port_type == listen_type_ssl || 6231 port_type == listen_type_tcp_dnscrypt) { 6232 c->tcp_handlers[i] = comm_point_create_tcp_handler(base, 6233 c, bufsize, spoolbuf, callback, callback_arg, socket); 6234 } else if(port_type == listen_type_http) { 6235 c->tcp_handlers[i] = comm_point_create_http_handler( 6236 base, c, bufsize, harden_large_queries, 6237 
http_max_streams, http_endpoint, 6238 callback, callback_arg, socket); 6239 } 6240 else { 6241 log_err("could not create tcp handler, unknown listen " 6242 "type"); 6243 return NULL; 6244 } 6245 if(!c->tcp_handlers[i]) { 6246 comm_point_delete(c); 6247 return NULL; 6248 } 6249 } 6250 6251 return c; 6252 } 6253 6254 struct comm_point* 6255 comm_point_create_tcp_out(struct comm_base *base, size_t bufsize, 6256 comm_point_callback_type* callback, void* callback_arg) 6257 { 6258 struct comm_point* c = (struct comm_point*)calloc(1, 6259 sizeof(struct comm_point)); 6260 short evbits; 6261 if(!c) 6262 return NULL; 6263 c->ev = (struct internal_event*)calloc(1, 6264 sizeof(struct internal_event)); 6265 if(!c->ev) { 6266 free(c); 6267 return NULL; 6268 } 6269 c->ev->base = base; 6270 c->fd = -1; 6271 c->buffer = sldns_buffer_new(bufsize); 6272 if(!c->buffer) { 6273 free(c->ev); 6274 free(c); 6275 return NULL; 6276 } 6277 c->timeout = NULL; 6278 c->tcp_is_reading = 0; 6279 c->tcp_byte_count = 0; 6280 c->tcp_timeout_msec = TCP_QUERY_TIMEOUT; 6281 c->tcp_conn_limit = NULL; 6282 c->tcl_addr = NULL; 6283 c->tcp_keepalive = 0; 6284 c->tcp_parent = NULL; 6285 c->max_tcp_count = 0; 6286 c->cur_tcp_count = 0; 6287 c->tcp_handlers = NULL; 6288 c->tcp_free = NULL; 6289 c->type = comm_tcp; 6290 c->tcp_do_close = 0; 6291 c->do_not_close = 0; 6292 c->tcp_do_toggle_rw = 1; 6293 c->tcp_check_nb_connect = 1; 6294 #ifdef USE_MSG_FASTOPEN 6295 c->tcp_do_fastopen = 1; 6296 #endif 6297 #ifdef USE_DNSCRYPT 6298 c->dnscrypt = 0; 6299 c->dnscrypt_buffer = c->buffer; 6300 #endif 6301 c->repinfo.c = c; 6302 c->callback = callback; 6303 c->cb_arg = callback_arg; 6304 c->pp2_enabled = 0; 6305 c->pp2_header_state = pp2_header_none; 6306 evbits = UB_EV_PERSIST | UB_EV_WRITE; 6307 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6308 comm_point_tcp_handle_callback, c); 6309 if(c->ev->ev == NULL) 6310 { 6311 log_err("could not baseset tcpout event"); 6312 sldns_buffer_free(c->buffer); 6313 
free(c->ev); 6314 free(c); 6315 return NULL; 6316 } 6317 6318 return c; 6319 } 6320 6321 struct comm_point* 6322 comm_point_create_http_out(struct comm_base *base, size_t bufsize, 6323 comm_point_callback_type* callback, void* callback_arg, 6324 sldns_buffer* temp) 6325 { 6326 struct comm_point* c = (struct comm_point*)calloc(1, 6327 sizeof(struct comm_point)); 6328 short evbits; 6329 if(!c) 6330 return NULL; 6331 c->ev = (struct internal_event*)calloc(1, 6332 sizeof(struct internal_event)); 6333 if(!c->ev) { 6334 free(c); 6335 return NULL; 6336 } 6337 c->ev->base = base; 6338 c->fd = -1; 6339 c->buffer = sldns_buffer_new(bufsize); 6340 if(!c->buffer) { 6341 free(c->ev); 6342 free(c); 6343 return NULL; 6344 } 6345 c->timeout = NULL; 6346 c->tcp_is_reading = 0; 6347 c->tcp_byte_count = 0; 6348 c->tcp_parent = NULL; 6349 c->max_tcp_count = 0; 6350 c->cur_tcp_count = 0; 6351 c->tcp_handlers = NULL; 6352 c->tcp_free = NULL; 6353 c->type = comm_http; 6354 c->tcp_do_close = 0; 6355 c->do_not_close = 0; 6356 c->tcp_do_toggle_rw = 1; 6357 c->tcp_check_nb_connect = 1; 6358 c->http_in_headers = 1; 6359 c->http_in_chunk_headers = 0; 6360 c->http_is_chunked = 0; 6361 c->http_temp = temp; 6362 #ifdef USE_MSG_FASTOPEN 6363 c->tcp_do_fastopen = 1; 6364 #endif 6365 #ifdef USE_DNSCRYPT 6366 c->dnscrypt = 0; 6367 c->dnscrypt_buffer = c->buffer; 6368 #endif 6369 c->repinfo.c = c; 6370 c->callback = callback; 6371 c->cb_arg = callback_arg; 6372 c->pp2_enabled = 0; 6373 c->pp2_header_state = pp2_header_none; 6374 evbits = UB_EV_PERSIST | UB_EV_WRITE; 6375 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6376 comm_point_http_handle_callback, c); 6377 if(c->ev->ev == NULL) 6378 { 6379 log_err("could not baseset tcpout event"); 6380 #ifdef HAVE_SSL 6381 SSL_free(c->ssl); 6382 #endif 6383 sldns_buffer_free(c->buffer); 6384 free(c->ev); 6385 free(c); 6386 return NULL; 6387 } 6388 6389 return c; 6390 } 6391 6392 struct comm_point* 6393 comm_point_create_local(struct comm_base *base, 
int fd, size_t bufsize, 6394 comm_point_callback_type* callback, void* callback_arg) 6395 { 6396 struct comm_point* c = (struct comm_point*)calloc(1, 6397 sizeof(struct comm_point)); 6398 short evbits; 6399 if(!c) 6400 return NULL; 6401 c->ev = (struct internal_event*)calloc(1, 6402 sizeof(struct internal_event)); 6403 if(!c->ev) { 6404 free(c); 6405 return NULL; 6406 } 6407 c->ev->base = base; 6408 c->fd = fd; 6409 c->buffer = sldns_buffer_new(bufsize); 6410 if(!c->buffer) { 6411 free(c->ev); 6412 free(c); 6413 return NULL; 6414 } 6415 c->timeout = NULL; 6416 c->tcp_is_reading = 1; 6417 c->tcp_byte_count = 0; 6418 c->tcp_parent = NULL; 6419 c->max_tcp_count = 0; 6420 c->cur_tcp_count = 0; 6421 c->tcp_handlers = NULL; 6422 c->tcp_free = NULL; 6423 c->type = comm_local; 6424 c->tcp_do_close = 0; 6425 c->do_not_close = 1; 6426 c->tcp_do_toggle_rw = 0; 6427 c->tcp_check_nb_connect = 0; 6428 #ifdef USE_MSG_FASTOPEN 6429 c->tcp_do_fastopen = 0; 6430 #endif 6431 #ifdef USE_DNSCRYPT 6432 c->dnscrypt = 0; 6433 c->dnscrypt_buffer = c->buffer; 6434 #endif 6435 c->callback = callback; 6436 c->cb_arg = callback_arg; 6437 c->pp2_enabled = 0; 6438 c->pp2_header_state = pp2_header_none; 6439 /* ub_event stuff */ 6440 evbits = UB_EV_PERSIST | UB_EV_READ; 6441 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6442 comm_point_local_handle_callback, c); 6443 if(c->ev->ev == NULL) { 6444 log_err("could not baseset localhdl event"); 6445 free(c->ev); 6446 free(c); 6447 return NULL; 6448 } 6449 if (ub_event_add(c->ev->ev, c->timeout) != 0) { 6450 log_err("could not add localhdl event"); 6451 ub_event_free(c->ev->ev); 6452 free(c->ev); 6453 free(c); 6454 return NULL; 6455 } 6456 c->event_added = 1; 6457 return c; 6458 } 6459 6460 struct comm_point* 6461 comm_point_create_raw(struct comm_base* base, int fd, int writing, 6462 comm_point_callback_type* callback, void* callback_arg) 6463 { 6464 struct comm_point* c = (struct comm_point*)calloc(1, 6465 sizeof(struct comm_point)); 6466 
short evbits; 6467 if(!c) 6468 return NULL; 6469 c->ev = (struct internal_event*)calloc(1, 6470 sizeof(struct internal_event)); 6471 if(!c->ev) { 6472 free(c); 6473 return NULL; 6474 } 6475 c->ev->base = base; 6476 c->fd = fd; 6477 c->buffer = NULL; 6478 c->timeout = NULL; 6479 c->tcp_is_reading = 0; 6480 c->tcp_byte_count = 0; 6481 c->tcp_parent = NULL; 6482 c->max_tcp_count = 0; 6483 c->cur_tcp_count = 0; 6484 c->tcp_handlers = NULL; 6485 c->tcp_free = NULL; 6486 c->type = comm_raw; 6487 c->tcp_do_close = 0; 6488 c->do_not_close = 1; 6489 c->tcp_do_toggle_rw = 0; 6490 c->tcp_check_nb_connect = 0; 6491 #ifdef USE_MSG_FASTOPEN 6492 c->tcp_do_fastopen = 0; 6493 #endif 6494 #ifdef USE_DNSCRYPT 6495 c->dnscrypt = 0; 6496 c->dnscrypt_buffer = c->buffer; 6497 #endif 6498 c->callback = callback; 6499 c->cb_arg = callback_arg; 6500 c->pp2_enabled = 0; 6501 c->pp2_header_state = pp2_header_none; 6502 /* ub_event stuff */ 6503 if(writing) 6504 evbits = UB_EV_PERSIST | UB_EV_WRITE; 6505 else evbits = UB_EV_PERSIST | UB_EV_READ; 6506 c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, 6507 comm_point_raw_handle_callback, c); 6508 if(c->ev->ev == NULL) { 6509 log_err("could not baseset rawhdl event"); 6510 free(c->ev); 6511 free(c); 6512 return NULL; 6513 } 6514 if (ub_event_add(c->ev->ev, c->timeout) != 0) { 6515 log_err("could not add rawhdl event"); 6516 ub_event_free(c->ev->ev); 6517 free(c->ev); 6518 free(c); 6519 return NULL; 6520 } 6521 c->event_added = 1; 6522 return c; 6523 } 6524 6525 void 6526 comm_point_close(struct comm_point* c) 6527 { 6528 if(!c) 6529 return; 6530 if(c->fd != -1) { 6531 verbose(5, "comm_point_close of %d: event_del", c->fd); 6532 if(c->event_added) { 6533 if(ub_event_del(c->ev->ev) != 0) { 6534 log_err("could not event_del on close"); 6535 } 6536 c->event_added = 0; 6537 } 6538 } 6539 tcl_close_connection(c->tcl_addr); 6540 if(c->tcp_req_info) 6541 tcp_req_info_clear(c->tcp_req_info); 6542 if(c->h2_session) 6543 
http2_session_server_delete(c->h2_session); 6544 /* stop the comm point from reading or writing after it is closed. */ 6545 if(c->tcp_more_read_again && *c->tcp_more_read_again) 6546 *c->tcp_more_read_again = 0; 6547 if(c->tcp_more_write_again && *c->tcp_more_write_again) 6548 *c->tcp_more_write_again = 0; 6549 6550 /* close fd after removing from event lists, or epoll.. is messed up */ 6551 if(c->fd != -1 && !c->do_not_close) { 6552 #ifdef USE_WINSOCK 6553 if(c->type == comm_tcp || c->type == comm_http) { 6554 /* delete sticky events for the fd, it gets closed */ 6555 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ); 6556 ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); 6557 } 6558 #endif 6559 verbose(VERB_ALGO, "close fd %d", c->fd); 6560 sock_close(c->fd); 6561 } 6562 c->fd = -1; 6563 } 6564 6565 void 6566 comm_point_delete(struct comm_point* c) 6567 { 6568 if(!c) 6569 return; 6570 if((c->type == comm_tcp || c->type == comm_http) && c->ssl) { 6571 #ifdef HAVE_SSL 6572 SSL_shutdown(c->ssl); 6573 SSL_free(c->ssl); 6574 #endif 6575 } 6576 if(c->type == comm_http && c->http_endpoint) { 6577 free(c->http_endpoint); 6578 c->http_endpoint = NULL; 6579 } 6580 comm_point_close(c); 6581 if(c->tcp_handlers) { 6582 int i; 6583 for(i=0; i<c->max_tcp_count; i++) 6584 comm_point_delete(c->tcp_handlers[i]); 6585 free(c->tcp_handlers); 6586 } 6587 free(c->timeout); 6588 if(c->type == comm_tcp || c->type == comm_local || c->type == comm_http) { 6589 sldns_buffer_free(c->buffer); 6590 #ifdef USE_DNSCRYPT 6591 if(c->dnscrypt && c->dnscrypt_buffer != c->buffer) { 6592 sldns_buffer_free(c->dnscrypt_buffer); 6593 } 6594 #endif 6595 if(c->tcp_req_info) { 6596 tcp_req_info_delete(c->tcp_req_info); 6597 } 6598 if(c->h2_session) { 6599 http2_session_delete(c->h2_session); 6600 } 6601 } 6602 #ifdef HAVE_NGTCP2 6603 if(c->doq_socket) 6604 doq_server_socket_delete(c->doq_socket); 6605 #endif 6606 ub_event_free(c->ev->ev); 6607 free(c->ev); 6608 free(c); 6609 } 6610 6611 #ifdef USE_DNSTAP 
6612 static void 6613 send_reply_dnstap(struct dt_env* dtenv, 6614 struct sockaddr* addr, socklen_t addrlen, 6615 struct sockaddr_storage* client_addr, socklen_t client_addrlen, 6616 enum comm_point_type type, void* ssl, sldns_buffer* buffer) 6617 { 6618 log_addr(VERB_ALGO, "from local addr", (void*)addr, addrlen); 6619 log_addr(VERB_ALGO, "response to client", client_addr, client_addrlen); 6620 dt_msg_send_client_response(dtenv, client_addr, 6621 (struct sockaddr_storage*)addr, type, ssl, buffer); 6622 } 6623 #endif 6624 6625 void 6626 comm_point_send_reply(struct comm_reply *repinfo) 6627 { 6628 struct sldns_buffer* buffer; 6629 log_assert(repinfo && repinfo->c); 6630 #ifdef USE_DNSCRYPT 6631 buffer = repinfo->c->dnscrypt_buffer; 6632 if(!dnsc_handle_uncurved_request(repinfo)) { 6633 return; 6634 } 6635 #else 6636 buffer = repinfo->c->buffer; 6637 #endif 6638 if(repinfo->c->type == comm_udp) { 6639 if(repinfo->srctype) 6640 comm_point_send_udp_msg_if(repinfo->c, buffer, 6641 (struct sockaddr*)&repinfo->remote_addr, 6642 repinfo->remote_addrlen, repinfo); 6643 else 6644 comm_point_send_udp_msg(repinfo->c, buffer, 6645 (struct sockaddr*)&repinfo->remote_addr, 6646 repinfo->remote_addrlen, 0); 6647 #ifdef USE_DNSTAP 6648 /* 6649 * sending src (client)/dst (local service) addresses over 6650 * DNSTAP from udp callback 6651 */ 6652 if(repinfo->c->dtenv != NULL && repinfo->c->dtenv->log_client_response_messages) { 6653 send_reply_dnstap(repinfo->c->dtenv, 6654 repinfo->c->socket->addr, 6655 repinfo->c->socket->addrlen, 6656 &repinfo->client_addr, repinfo->client_addrlen, 6657 repinfo->c->type, repinfo->c->ssl, 6658 repinfo->c->buffer); 6659 } 6660 #endif 6661 } else { 6662 #ifdef USE_DNSTAP 6663 struct dt_env* dtenv = 6664 #ifdef HAVE_NGTCP2 6665 repinfo->c->doq_socket 6666 ?repinfo->c->dtenv: 6667 #endif 6668 repinfo->c->tcp_parent->dtenv; 6669 struct sldns_buffer* dtbuffer = repinfo->c->tcp_req_info 6670 ?repinfo->c->tcp_req_info->spool_buffer 6671 
:repinfo->c->buffer; 6672 #ifdef USE_DNSCRYPT 6673 if(repinfo->c->dnscrypt && repinfo->is_dnscrypted) 6674 dtbuffer = repinfo->c->buffer; 6675 #endif 6676 /* 6677 * sending src (client)/dst (local service) addresses over 6678 * DNSTAP from other callbacks 6679 */ 6680 if(dtenv != NULL && dtenv->log_client_response_messages) { 6681 send_reply_dnstap(dtenv, 6682 repinfo->c->socket->addr, 6683 repinfo->c->socket->addrlen, 6684 &repinfo->client_addr, repinfo->client_addrlen, 6685 repinfo->c->type, repinfo->c->ssl, 6686 dtbuffer); 6687 } 6688 #endif 6689 if(repinfo->c->tcp_req_info) { 6690 tcp_req_info_send_reply(repinfo->c->tcp_req_info); 6691 } else if(repinfo->c->use_h2) { 6692 if(!http2_submit_dns_response(repinfo->c->h2_session)) { 6693 comm_point_drop_reply(repinfo); 6694 return; 6695 } 6696 repinfo->c->h2_stream = NULL; 6697 repinfo->c->tcp_is_reading = 0; 6698 comm_point_stop_listening(repinfo->c); 6699 comm_point_start_listening(repinfo->c, -1, 6700 adjusted_tcp_timeout(repinfo->c)); 6701 return; 6702 #ifdef HAVE_NGTCP2 6703 } else if(repinfo->c->doq_socket) { 6704 doq_socket_send_reply(repinfo); 6705 #endif 6706 } else { 6707 comm_point_start_listening(repinfo->c, -1, 6708 adjusted_tcp_timeout(repinfo->c)); 6709 } 6710 } 6711 } 6712 6713 void 6714 comm_point_drop_reply(struct comm_reply* repinfo) 6715 { 6716 if(!repinfo) 6717 return; 6718 log_assert(repinfo->c); 6719 log_assert(repinfo->c->type != comm_tcp_accept); 6720 if(repinfo->c->type == comm_udp) 6721 return; 6722 if(repinfo->c->tcp_req_info) 6723 repinfo->c->tcp_req_info->is_drop = 1; 6724 if(repinfo->c->type == comm_http) { 6725 if(repinfo->c->h2_session) { 6726 repinfo->c->h2_session->is_drop = 1; 6727 if(!repinfo->c->h2_session->postpone_drop) 6728 reclaim_http_handler(repinfo->c); 6729 return; 6730 } 6731 reclaim_http_handler(repinfo->c); 6732 return; 6733 #ifdef HAVE_NGTCP2 6734 } else if(repinfo->c->doq_socket) { 6735 doq_socket_drop_reply(repinfo); 6736 return; 6737 #endif 6738 } 6739 
reclaim_tcp_handler(repinfo->c); 6740 } 6741 6742 void 6743 comm_point_stop_listening(struct comm_point* c) 6744 { 6745 verbose(VERB_ALGO, "comm point stop listening %d", c->fd); 6746 if(c->event_added) { 6747 if(ub_event_del(c->ev->ev) != 0) { 6748 log_err("event_del error to stoplisten"); 6749 } 6750 c->event_added = 0; 6751 } 6752 } 6753 6754 void 6755 comm_point_start_listening(struct comm_point* c, int newfd, int msec) 6756 { 6757 verbose(VERB_ALGO, "comm point start listening %d (%d msec)", 6758 c->fd==-1?newfd:c->fd, msec); 6759 if(c->type == comm_tcp_accept && !c->tcp_free) { 6760 /* no use to start listening no free slots. */ 6761 return; 6762 } 6763 if(c->event_added) { 6764 if(ub_event_del(c->ev->ev) != 0) { 6765 log_err("event_del error to startlisten"); 6766 } 6767 c->event_added = 0; 6768 } 6769 if(msec != -1 && msec != 0) { 6770 if(!c->timeout) { 6771 c->timeout = (struct timeval*)malloc(sizeof( 6772 struct timeval)); 6773 if(!c->timeout) { 6774 log_err("cpsl: malloc failed. No net read."); 6775 return; 6776 } 6777 } 6778 ub_event_add_bits(c->ev->ev, UB_EV_TIMEOUT); 6779 #ifndef S_SPLINT_S /* splint fails on struct timeval. 
*/ 6780 c->timeout->tv_sec = msec/1000; 6781 c->timeout->tv_usec = (msec%1000)*1000; 6782 #endif /* S_SPLINT_S */ 6783 } else { 6784 if(msec == 0 || !c->timeout) { 6785 ub_event_del_bits(c->ev->ev, UB_EV_TIMEOUT); 6786 } 6787 } 6788 if(c->type == comm_tcp || c->type == comm_http) { 6789 ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); 6790 if(c->tcp_write_and_read) { 6791 verbose(5, "startlistening %d mode rw", (newfd==-1?c->fd:newfd)); 6792 ub_event_add_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); 6793 } else if(c->tcp_is_reading) { 6794 verbose(5, "startlistening %d mode r", (newfd==-1?c->fd:newfd)); 6795 ub_event_add_bits(c->ev->ev, UB_EV_READ); 6796 } else { 6797 verbose(5, "startlistening %d mode w", (newfd==-1?c->fd:newfd)); 6798 ub_event_add_bits(c->ev->ev, UB_EV_WRITE); 6799 } 6800 } 6801 if(newfd != -1) { 6802 if(c->fd != -1 && c->fd != newfd) { 6803 verbose(5, "cpsl close of fd %d for %d", c->fd, newfd); 6804 sock_close(c->fd); 6805 } 6806 c->fd = newfd; 6807 ub_event_set_fd(c->ev->ev, c->fd); 6808 } 6809 if(ub_event_add(c->ev->ev, msec==0?NULL:c->timeout) != 0) { 6810 log_err("event_add failed. in cpsl."); 6811 return; 6812 } 6813 c->event_added = 1; 6814 } 6815 6816 void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr) 6817 { 6818 verbose(VERB_ALGO, "comm point listen_for_rw %d %d", c->fd, wr); 6819 if(c->event_added) { 6820 if(ub_event_del(c->ev->ev) != 0) { 6821 log_err("event_del error to cplf"); 6822 } 6823 c->event_added = 0; 6824 } 6825 if(!c->timeout) { 6826 ub_event_del_bits(c->ev->ev, UB_EV_TIMEOUT); 6827 } 6828 ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); 6829 if(rd) ub_event_add_bits(c->ev->ev, UB_EV_READ); 6830 if(wr) ub_event_add_bits(c->ev->ev, UB_EV_WRITE); 6831 if(ub_event_add(c->ev->ev, c->timeout) != 0) { 6832 log_err("event_add failed. 
in cplf."); 6833 return; 6834 } 6835 c->event_added = 1; 6836 } 6837 6838 size_t comm_point_get_mem(struct comm_point* c) 6839 { 6840 size_t s; 6841 if(!c) 6842 return 0; 6843 s = sizeof(*c) + sizeof(*c->ev); 6844 if(c->timeout) 6845 s += sizeof(*c->timeout); 6846 if(c->type == comm_tcp || c->type == comm_local) { 6847 s += sizeof(*c->buffer) + sldns_buffer_capacity(c->buffer); 6848 #ifdef USE_DNSCRYPT 6849 s += sizeof(*c->dnscrypt_buffer); 6850 if(c->buffer != c->dnscrypt_buffer) { 6851 s += sldns_buffer_capacity(c->dnscrypt_buffer); 6852 } 6853 #endif 6854 } 6855 if(c->type == comm_tcp_accept) { 6856 int i; 6857 for(i=0; i<c->max_tcp_count; i++) 6858 s += comm_point_get_mem(c->tcp_handlers[i]); 6859 } 6860 return s; 6861 } 6862 6863 struct comm_timer* 6864 comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg) 6865 { 6866 struct internal_timer *tm = (struct internal_timer*)calloc(1, 6867 sizeof(struct internal_timer)); 6868 if(!tm) { 6869 log_err("malloc failed"); 6870 return NULL; 6871 } 6872 tm->super.ev_timer = tm; 6873 tm->base = base; 6874 tm->super.callback = cb; 6875 tm->super.cb_arg = cb_arg; 6876 tm->ev = ub_event_new(base->eb->base, -1, UB_EV_TIMEOUT, 6877 comm_timer_callback, &tm->super); 6878 if(tm->ev == NULL) { 6879 log_err("timer_create: event_base_set failed."); 6880 free(tm); 6881 return NULL; 6882 } 6883 return &tm->super; 6884 } 6885 6886 void 6887 comm_timer_disable(struct comm_timer* timer) 6888 { 6889 if(!timer) 6890 return; 6891 ub_timer_del(timer->ev_timer->ev); 6892 timer->ev_timer->enabled = 0; 6893 } 6894 6895 void 6896 comm_timer_set(struct comm_timer* timer, struct timeval* tv) 6897 { 6898 log_assert(tv); 6899 if(timer->ev_timer->enabled) 6900 comm_timer_disable(timer); 6901 if(ub_timer_add(timer->ev_timer->ev, timer->ev_timer->base->eb->base, 6902 comm_timer_callback, timer, tv) != 0) 6903 log_err("comm_timer_set: evtimer_add failed."); 6904 timer->ev_timer->enabled = 1; 6905 } 6906 6907 void 6908 
comm_timer_delete(struct comm_timer* timer) 6909 { 6910 if(!timer) 6911 return; 6912 comm_timer_disable(timer); 6913 /* Free the sub struct timer->ev_timer derived from the super struct timer. 6914 * i.e. assert(timer == timer->ev_timer) 6915 */ 6916 ub_event_free(timer->ev_timer->ev); 6917 free(timer->ev_timer); 6918 } 6919 6920 void 6921 comm_timer_callback(int ATTR_UNUSED(fd), short event, void* arg) 6922 { 6923 struct comm_timer* tm = (struct comm_timer*)arg; 6924 if(!(event&UB_EV_TIMEOUT)) 6925 return; 6926 ub_comm_base_now(tm->ev_timer->base); 6927 tm->ev_timer->enabled = 0; 6928 fptr_ok(fptr_whitelist_comm_timer(tm->callback)); 6929 (*tm->callback)(tm->cb_arg); 6930 } 6931 6932 int 6933 comm_timer_is_set(struct comm_timer* timer) 6934 { 6935 return (int)timer->ev_timer->enabled; 6936 } 6937 6938 size_t 6939 comm_timer_get_mem(struct comm_timer* timer) 6940 { 6941 if(!timer) return 0; 6942 return sizeof(struct internal_timer); 6943 } 6944 6945 struct comm_signal* 6946 comm_signal_create(struct comm_base* base, 6947 void (*callback)(int, void*), void* cb_arg) 6948 { 6949 struct comm_signal* com = (struct comm_signal*)malloc( 6950 sizeof(struct comm_signal)); 6951 if(!com) { 6952 log_err("malloc failed"); 6953 return NULL; 6954 } 6955 com->base = base; 6956 com->callback = callback; 6957 com->cb_arg = cb_arg; 6958 com->ev_signal = NULL; 6959 return com; 6960 } 6961 6962 void 6963 comm_signal_callback(int sig, short event, void* arg) 6964 { 6965 struct comm_signal* comsig = (struct comm_signal*)arg; 6966 if(!(event & UB_EV_SIGNAL)) 6967 return; 6968 ub_comm_base_now(comsig->base); 6969 fptr_ok(fptr_whitelist_comm_signal(comsig->callback)); 6970 (*comsig->callback)(sig, comsig->cb_arg); 6971 } 6972 6973 int 6974 comm_signal_bind(struct comm_signal* comsig, int sig) 6975 { 6976 struct internal_signal* entry = (struct internal_signal*)calloc(1, 6977 sizeof(struct internal_signal)); 6978 if(!entry) { 6979 log_err("malloc failed"); 6980 return 0; 6981 } 6982 
log_assert(comsig); 6983 /* add signal event */ 6984 entry->ev = ub_signal_new(comsig->base->eb->base, sig, 6985 comm_signal_callback, comsig); 6986 if(entry->ev == NULL) { 6987 log_err("Could not create signal event"); 6988 free(entry); 6989 return 0; 6990 } 6991 if(ub_signal_add(entry->ev, NULL) != 0) { 6992 log_err("Could not add signal handler"); 6993 ub_event_free(entry->ev); 6994 free(entry); 6995 return 0; 6996 } 6997 /* link into list */ 6998 entry->next = comsig->ev_signal; 6999 comsig->ev_signal = entry; 7000 return 1; 7001 } 7002 7003 void 7004 comm_signal_delete(struct comm_signal* comsig) 7005 { 7006 struct internal_signal* p, *np; 7007 if(!comsig) 7008 return; 7009 p=comsig->ev_signal; 7010 while(p) { 7011 np = p->next; 7012 ub_signal_del(p->ev); 7013 ub_event_free(p->ev); 7014 free(p); 7015 p = np; 7016 } 7017 free(comsig); 7018 } 7019