xref: /freebsd/contrib/unbound/services/listen_dnsport.c (revision b2efd602aea8b3cbc3fb215b9611946d04fceb10)
1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #include <limits.h>
47 #ifdef USE_TCP_FASTOPEN
48 #include <netinet/tcp.h>
49 #endif
50 #include <ctype.h>
51 #include "services/listen_dnsport.h"
52 #include "services/outside_network.h"
53 #include "util/netevent.h"
54 #include "util/log.h"
55 #include "util/config_file.h"
56 #include "util/net_help.h"
57 #include "sldns/sbuffer.h"
58 #include "sldns/parseutil.h"
59 #include "sldns/wire2str.h"
60 #include "services/mesh.h"
61 #include "util/fptr_wlist.h"
62 #include "util/locks.h"
63 #include "util/timeval_func.h"
64 
65 #ifdef HAVE_NETDB_H
66 #include <netdb.h>
67 #endif
68 #include <fcntl.h>
69 
70 #ifdef HAVE_SYS_UN_H
71 #include <sys/un.h>
72 #endif
73 
74 #ifdef HAVE_SYSTEMD
75 #include <systemd/sd-daemon.h>
76 #endif
77 
78 #ifdef HAVE_IFADDRS_H
79 #include <ifaddrs.h>
80 #endif
81 #ifdef HAVE_NET_IF_H
82 #include <net/if.h>
83 #endif
84 
85 #ifdef HAVE_TIME_H
86 #include <time.h>
87 #endif
88 #include <sys/time.h>
89 
90 #ifdef HAVE_NGTCP2
91 #include <ngtcp2/ngtcp2.h>
92 #include <ngtcp2/ngtcp2_crypto.h>
93 #ifdef HAVE_NGTCP2_NGTCP2_CRYPTO_OSSL_H
94 #include <ngtcp2/ngtcp2_crypto_ossl.h>
95 #elif defined(HAVE_NGTCP2_NGTCP2_CRYPTO_QUICTLS_H)
96 #include <ngtcp2/ngtcp2_crypto_quictls.h>
97 #elif defined(HAVE_NGTCP2_NGTCP2_CRYPTO_OPENSSL_H)
98 #include <ngtcp2/ngtcp2_crypto_openssl.h>
99 #define MAKE_QUIC_METHOD 1
100 #endif
101 #endif
102 
103 #ifdef HAVE_OPENSSL_SSL_H
104 #include <openssl/ssl.h>
105 #endif
106 
107 #ifdef HAVE_LINUX_NET_TSTAMP_H
108 #include <linux/net_tstamp.h>
109 #endif
110 
111 /** number of queued TCP connections for listen() */
112 #define TCP_BACKLOG 256
113 
114 #ifndef THREADS_DISABLED
115 /** lock on the counter of stream buffer memory */
116 static lock_basic_type stream_wait_count_lock;
117 /** lock on the counter of HTTP2 query buffer memory */
118 static lock_basic_type http2_query_buffer_count_lock;
119 /** lock on the counter of HTTP2 response buffer memory */
120 static lock_basic_type http2_response_buffer_count_lock;
121 #endif
122 /** size (in bytes) of stream wait buffers */
123 static size_t stream_wait_count = 0;
124 /** is the lock initialised for stream wait buffers */
125 static int stream_wait_lock_inited = 0;
126 /** size (in bytes) of HTTP2 query buffers */
127 static size_t http2_query_buffer_count = 0;
128 /** is the lock initialised for HTTP2 query buffers */
129 static int http2_query_buffer_lock_inited = 0;
130 /** size (in bytes) of HTTP2 response buffers */
131 static size_t http2_response_buffer_count = 0;
132 /** is the lock initialised for HTTP2 response buffers */
133 static int http2_response_buffer_lock_inited = 0;
134 
135 /**
136  * Debug print of the getaddrinfo returned address.
137  * @param addr: the address returned.
138  * @param additional: additional text that describes the type of socket,
139  * 	or NULL for no text.
140  */
141 static void
verbose_print_addr(struct addrinfo * addr,const char * additional)142 verbose_print_addr(struct addrinfo *addr, const char* additional)
143 {
144 	if(verbosity >= VERB_ALGO) {
145 		char buf[100];
146 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
147 #ifdef INET6
148 		if(addr->ai_family == AF_INET6)
149 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
150 				sin6_addr;
151 #endif /* INET6 */
152 		if(inet_ntop(addr->ai_family, sinaddr, buf,
153 			(socklen_t)sizeof(buf)) == 0) {
154 			(void)strlcpy(buf, "(null)", sizeof(buf));
155 		}
156 		buf[sizeof(buf)-1] = 0;
157 		verbose(VERB_ALGO, "creating %s%s socket %s %d%s%s",
158 			addr->ai_socktype==SOCK_DGRAM?"udp":
159 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
160 			addr->ai_family==AF_INET?"4":
161 			addr->ai_family==AF_INET6?"6":
162 			"_otherfam", buf,
163 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port),
164 			(additional?" ":""), (additional?additional:""));
165 	}
166 }
167 
168 void
verbose_print_unbound_socket(struct unbound_socket * ub_sock)169 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
170 {
171 	if(verbosity >= VERB_ALGO) {
172 		char buf[256];
173 		log_info("listing of unbound_socket structure:");
174 		addr_to_str((void*)ub_sock->addr, ub_sock->addrlen, buf,
175 			sizeof(buf));
176 		log_info("%s s is: %d, fam is: %s, acl: %s", buf, ub_sock->s,
177 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
178 			ub_sock->acl?"yes":"no");
179 	}
180 }
181 
#ifdef HAVE_SYSTEMD
/**
 * Find a socket that systemd passed to this process by socket activation.
 * The first passed descriptor that matches the family, socket type and
 * listening state is returned; the address itself is not compared.
 * @param family: address family to match, e.g. AF_INET or AF_LOCAL.
 * @param socktype: socket type to match, e.g. SOCK_DGRAM or SOCK_STREAM.
 * @param listen: listening state handed to sd_is_socket(). Use the
 *	"listen" option only for stream protocols; for UDP it should be -1.
 * @param addr: address used only in the error message when no socket
 *	matches, or NULL to print path instead.
 * @param addrlen: length of addr.
 * @param path: text printed in the error message when addr is NULL.
 * @return the matching file descriptor, or -1 if systemd is not running,
 *	the LISTEN_PID or LISTEN_FDS environment variable is missing, no
 *	descriptors were passed, or none of them matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	/* only the presence of these variables is checked here */
	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	/* r becomes the number of descriptors that were passed */
	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() returns a positive value on a match, 0 on
		 * a mismatch and a negative errno on failure; a failure
		 * must not be mistaken for a match */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			/* never hand a NULL pointer to the %s conversion */
			log_err("systemd sd_listen_fds(): %s",
				path?path:"(null)");
	}
	return s;
}
#endif
241 
242 int
create_udp_sock(int family,int socktype,struct sockaddr * addr,socklen_t addrlen,int v6only,int * inuse,int * noproto,int rcv,int snd,int listen,int * reuseport,int transparent,int freebind,int use_systemd,int dscp)243 create_udp_sock(int family, int socktype, struct sockaddr* addr,
244         socklen_t addrlen, int v6only, int* inuse, int* noproto,
245 	int rcv, int snd, int listen, int* reuseport, int transparent,
246 	int freebind, int use_systemd, int dscp)
247 {
248 	int s;
249 	char* err;
250 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
251 	int on=1;
252 #endif
253 #ifdef IPV6_MTU
254 	int mtu = IPV6_MIN_MTU;
255 #endif
256 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
257 	(void)rcv;
258 #endif
259 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
260 	(void)snd;
261 #endif
262 #ifndef IPV6_V6ONLY
263 	(void)v6only;
264 #endif
265 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
266 	(void)transparent;
267 #endif
268 #if !defined(IP_FREEBIND)
269 	(void)freebind;
270 #endif
271 #ifdef HAVE_SYSTEMD
272 	int got_fd_from_systemd = 0;
273 
274 	if (!use_systemd
275 	    || (use_systemd
276 		&& (s = systemd_get_activated(family, socktype, -1, addr,
277 					      addrlen, NULL)) == -1)) {
278 #else
279 	(void)use_systemd;
280 #endif
281 	if((s = socket(family, socktype, 0)) == -1) {
282 		*inuse = 0;
283 #ifndef USE_WINSOCK
284 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
285 			*noproto = 1;
286 			return -1;
287 		}
288 #else
289 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
290 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
291 			*noproto = 1;
292 			return -1;
293 		}
294 #endif
295 		log_err("can't create socket: %s", sock_strerror(errno));
296 		*noproto = 0;
297 		return -1;
298 	}
299 #ifdef HAVE_SYSTEMD
300 	} else {
301 		got_fd_from_systemd = 1;
302 	}
303 #endif
304 	if(listen) {
305 #ifdef SO_REUSEADDR
306 		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
307 			(socklen_t)sizeof(on)) < 0) {
308 			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
309 				sock_strerror(errno));
310 #ifndef USE_WINSOCK
311 			if(errno != ENOSYS) {
312 				close(s);
313 				*noproto = 0;
314 				*inuse = 0;
315 				return -1;
316 			}
317 #else
318 			closesocket(s);
319 			*noproto = 0;
320 			*inuse = 0;
321 			return -1;
322 #endif
323 		}
324 #endif /* SO_REUSEADDR */
325 #ifdef SO_REUSEPORT
326 #  ifdef SO_REUSEPORT_LB
327 		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
328 		 * like SO_REUSEPORT on Linux.  This is what the users want
329 		 * with the config option in unbound.conf; if we actually
330 		 * need local address and port reuse they'll also need to
331 		 * have SO_REUSEPORT set for them, assume it was _LB they want.
332 		 */
333 		if (reuseport && *reuseport &&
334 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
335 			(socklen_t)sizeof(on)) < 0) {
336 #ifdef ENOPROTOOPT
337 			if(errno != ENOPROTOOPT || verbosity >= 3)
338 				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
339 					strerror(errno));
340 #endif
341 			/* this option is not essential, we can continue */
342 			*reuseport = 0;
343 		}
344 #  else /* no SO_REUSEPORT_LB */
345 
346 		/* try to set SO_REUSEPORT so that incoming
347 		 * queries are distributed evenly among the receiving threads.
348 		 * Each thread must have its own socket bound to the same port,
349 		 * with SO_REUSEPORT set on each socket.
350 		 */
351 		if (reuseport && *reuseport &&
352 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
353 			(socklen_t)sizeof(on)) < 0) {
354 #ifdef ENOPROTOOPT
355 			if(errno != ENOPROTOOPT || verbosity >= 3)
356 				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
357 					strerror(errno));
358 #endif
359 			/* this option is not essential, we can continue */
360 			*reuseport = 0;
361 		}
362 #  endif /* SO_REUSEPORT_LB */
363 #else
364 		(void)reuseport;
365 #endif /* defined(SO_REUSEPORT) */
366 #ifdef IP_TRANSPARENT
367 		if (transparent &&
368 		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
369 		    (socklen_t)sizeof(on)) < 0) {
370 			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
371 			strerror(errno));
372 		}
373 #elif defined(IP_BINDANY)
374 		if (transparent &&
375 		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
376 		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
377 		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
378 			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
379 			(family==AF_INET6?"V6":""), strerror(errno));
380 		}
381 #elif defined(SO_BINDANY)
382 		if (transparent &&
383 		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
384 		    (socklen_t)sizeof(on)) < 0) {
385 			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
386 			strerror(errno));
387 		}
388 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
389 	}
390 #ifdef IP_FREEBIND
391 	if(freebind &&
392 	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
393 	    (socklen_t)sizeof(on)) < 0) {
394 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
395 		strerror(errno));
396 	}
397 #endif /* IP_FREEBIND */
398 	if(rcv) {
399 #ifdef SO_RCVBUF
400 		int got;
401 		socklen_t slen = (socklen_t)sizeof(got);
402 #  ifdef SO_RCVBUFFORCE
403 		/* Linux specific: try to use root permission to override
404 		 * system limits on rcvbuf. The limit is stored in
405 		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
406 		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
407 			(socklen_t)sizeof(rcv)) < 0) {
408 			if(errno != EPERM) {
409 				log_err("setsockopt(..., SO_RCVBUFFORCE, "
410 					"...) failed: %s", sock_strerror(errno));
411 				sock_close(s);
412 				*noproto = 0;
413 				*inuse = 0;
414 				return -1;
415 			}
416 #  endif /* SO_RCVBUFFORCE */
417 			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
418 				(socklen_t)sizeof(rcv)) < 0) {
419 				log_err("setsockopt(..., SO_RCVBUF, "
420 					"...) failed: %s", sock_strerror(errno));
421 				sock_close(s);
422 				*noproto = 0;
423 				*inuse = 0;
424 				return -1;
425 			}
426 			/* check if we got the right thing or if system
427 			 * reduced to some system max.  Warn if so */
428 			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
429 				&slen) >= 0 && got < rcv/2) {
430 				log_warn("so-rcvbuf %u was not granted. "
431 					"Got %u. To fix: start with "
432 					"root permissions(linux) or sysctl "
433 					"bigger net.core.rmem_max(linux) or "
434 					"kern.ipc.maxsockbuf(bsd) values.",
435 					(unsigned)rcv, (unsigned)got);
436 			}
437 #  ifdef SO_RCVBUFFORCE
438 		}
439 #  endif
440 #endif /* SO_RCVBUF */
441 	}
442 	/* first do RCVBUF as the receive buffer is more important */
443 	if(snd) {
444 #ifdef SO_SNDBUF
445 		int got;
446 		socklen_t slen = (socklen_t)sizeof(got);
447 #  ifdef SO_SNDBUFFORCE
448 		/* Linux specific: try to use root permission to override
449 		 * system limits on sndbuf. The limit is stored in
450 		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
451 		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
452 			(socklen_t)sizeof(snd)) < 0) {
453 			if(errno != EPERM && errno != ENOBUFS) {
454 				log_err("setsockopt(..., SO_SNDBUFFORCE, "
455 					"...) failed: %s", sock_strerror(errno));
456 				sock_close(s);
457 				*noproto = 0;
458 				*inuse = 0;
459 				return -1;
460 			}
461 			if(errno != EPERM) {
462 				verbose(VERB_ALGO, "setsockopt(..., SO_SNDBUFFORCE, "
463 					"...) was not granted: %s", sock_strerror(errno));
464 			}
465 #  endif /* SO_SNDBUFFORCE */
466 			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
467 				(socklen_t)sizeof(snd)) < 0) {
468 				if(errno != ENOSYS && errno != ENOBUFS) {
469 					log_err("setsockopt(..., SO_SNDBUF, "
470 						"...) failed: %s", sock_strerror(errno));
471 					sock_close(s);
472 					*noproto = 0;
473 					*inuse = 0;
474 					return -1;
475 				}
476 				log_warn("setsockopt(..., SO_SNDBUF, "
477 					"...) was not granted: %s", sock_strerror(errno));
478 			}
479 			/* check if we got the right thing or if system
480 			 * reduced to some system max.  Warn if so */
481 			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
482 				&slen) >= 0 && got < snd/2) {
483 				log_warn("so-sndbuf %u was not granted. "
484 					"Got %u. To fix: start with "
485 					"root permissions(linux) or sysctl "
486 					"bigger net.core.wmem_max(linux) or "
487 					"kern.ipc.maxsockbuf(bsd) values. or "
488 					"set so-sndbuf: 0 (use system value).",
489 					(unsigned)snd, (unsigned)got);
490 			}
491 #  ifdef SO_SNDBUFFORCE
492 		}
493 #  endif
494 #endif /* SO_SNDBUF */
495 	}
496 	err = set_ip_dscp(s, family, dscp);
497 	if(err != NULL)
498 		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
499 	if(family == AF_INET6) {
500 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
501 		int omit6_set = 0;
502 		int action;
503 # endif
504 # if defined(IPV6_V6ONLY)
505 		if(v6only
506 #   ifdef HAVE_SYSTEMD
507 			/* Systemd wants to control if the socket is v6 only
508 			 * or both, with BindIPv6Only=default, ipv6-only or
509 			 * both in systemd.socket, so it is not set here. */
510 			&& !got_fd_from_systemd
511 #   endif
512 			) {
513 			int val=(v6only==2)?0:1;
514 			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
515 				(void*)&val, (socklen_t)sizeof(val)) < 0) {
516 				log_err("setsockopt(..., IPV6_V6ONLY"
517 					", ...) failed: %s", sock_strerror(errno));
518 				sock_close(s);
519 				*noproto = 0;
520 				*inuse = 0;
521 				return -1;
522 			}
523 		}
524 # endif
525 # if defined(IPV6_USE_MIN_MTU)
526 		/*
527 		 * There is no fragmentation of IPv6 datagrams
528 		 * during forwarding in the network. Therefore
529 		 * we do not send UDP datagrams larger than
530 		 * the minimum IPv6 MTU of 1280 octets. The
531 		 * EDNS0 message length can be larger if the
532 		 * network stack supports IPV6_USE_MIN_MTU.
533 		 */
534 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
535 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
536 			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
537 				"...) failed: %s", sock_strerror(errno));
538 			sock_close(s);
539 			*noproto = 0;
540 			*inuse = 0;
541 			return -1;
542 		}
543 # elif defined(IPV6_MTU)
544 #   ifndef USE_WINSOCK
545 		/*
546 		 * On Linux, to send no larger than 1280, the PMTUD is
547 		 * disabled by default for datagrams anyway, so we set
548 		 * the MTU to use.
549 		 */
550 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
551 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
552 			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
553 				sock_strerror(errno));
554 			sock_close(s);
555 			*noproto = 0;
556 			*inuse = 0;
557 			return -1;
558 		}
559 #   elif defined(IPV6_USER_MTU)
560 		/* As later versions of the mingw crosscompiler define
561 		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
562 		 * instead which is writable; IPV6_MTU is readonly there. */
563 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
564 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
565 			if (WSAGetLastError() != WSAENOPROTOOPT) {
566 				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
567 					wsa_strerror(WSAGetLastError()));
568 				sock_close(s);
569 				*noproto = 0;
570 				*inuse = 0;
571 				return -1;
572 			}
573 		}
574 #   endif /* USE_WINSOCK */
575 # endif /* IPv6 MTU */
576 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
577 #  if defined(IP_PMTUDISC_OMIT)
578 		action = IP_PMTUDISC_OMIT;
579 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
580 			&action, (socklen_t)sizeof(action)) < 0) {
581 
582 			if (errno != EINVAL) {
583 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
584 					strerror(errno));
585 				sock_close(s);
586 				*noproto = 0;
587 				*inuse = 0;
588 				return -1;
589 			}
590 		}
591 		else
592 		{
593 		    omit6_set = 1;
594 		}
595 #  endif
596 		if (omit6_set == 0) {
597 			action = IP_PMTUDISC_DONT;
598 			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
599 				&action, (socklen_t)sizeof(action)) < 0) {
600 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
601 					strerror(errno));
602 				sock_close(s);
603 				*noproto = 0;
604 				*inuse = 0;
605 				return -1;
606 			}
607 		}
608 # endif /* IPV6_MTU_DISCOVER */
609 	} else if(family == AF_INET) {
610 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
611 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
612  * PMTU information is not accepted, but fragmentation is allowed
613  * if and only if the packet size exceeds the outgoing interface MTU
614  * (and also uses the interface mtu to determine the size of the packets).
615  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
616  * FreeBSD already has same semantics without setting the option. */
617 		int omit_set = 0;
618 		int action;
619 #   if defined(IP_PMTUDISC_OMIT)
620 		action = IP_PMTUDISC_OMIT;
621 		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
622 			&action, (socklen_t)sizeof(action)) < 0) {
623 
624 			if (errno != EINVAL) {
625 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
626 					strerror(errno));
627 				sock_close(s);
628 				*noproto = 0;
629 				*inuse = 0;
630 				return -1;
631 			}
632 		}
633 		else
634 		{
635 		    omit_set = 1;
636 		}
637 #   endif
638 		if (omit_set == 0) {
639    			action = IP_PMTUDISC_DONT;
640 			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
641 				&action, (socklen_t)sizeof(action)) < 0) {
642 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
643 					strerror(errno));
644 				sock_close(s);
645 				*noproto = 0;
646 				*inuse = 0;
647 				return -1;
648 			}
649 		}
650 #  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
651 		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
652 		 * but does not work on that version, so we exclude it */
653 		/* a nonzero value disables fragmentation, according to
654 		 * docs.oracle.com for ip(4). */
655 		int off = 1;
656 		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
657 			&off, (socklen_t)sizeof(off)) < 0) {
658 			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
659 				strerror(errno));
660 			sock_close(s);
661 			*noproto = 0;
662 			*inuse = 0;
663 			return -1;
664 		}
665 #  endif /* IPv4 MTU */
666 	}
667 	if(
668 #ifdef HAVE_SYSTEMD
669 		!got_fd_from_systemd &&
670 #endif
671 		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
672 		*noproto = 0;
673 		*inuse = 0;
674 #ifndef USE_WINSOCK
675 #ifdef EADDRINUSE
676 		*inuse = (errno == EADDRINUSE);
677 		/* detect freebsd jail with no ipv6 permission */
678 		if(family==AF_INET6 && errno==EINVAL)
679 			*noproto = 1;
680 		else if(errno != EADDRINUSE &&
681 			!(errno == EACCES && verbosity < 4 && !listen)
682 #ifdef EADDRNOTAVAIL
683 			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
684 #endif
685 			) {
686 			log_err_addr("can't bind socket", strerror(errno),
687 				(struct sockaddr_storage*)addr, addrlen);
688 		}
689 #endif /* EADDRINUSE */
690 #else /* USE_WINSOCK */
691 		if(WSAGetLastError() != WSAEADDRINUSE &&
692 			WSAGetLastError() != WSAEADDRNOTAVAIL &&
693 			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
694 			log_err_addr("can't bind socket",
695 				wsa_strerror(WSAGetLastError()),
696 				(struct sockaddr_storage*)addr, addrlen);
697 		}
698 #endif /* USE_WINSOCK */
699 		sock_close(s);
700 		return -1;
701 	}
702 	if(!fd_set_nonblock(s)) {
703 		*noproto = 0;
704 		*inuse = 0;
705 		sock_close(s);
706 		return -1;
707 	}
708 	return s;
709 }
710 
711 int
create_tcp_accept_sock(struct addrinfo * addr,int v6only,int * noproto,int * reuseport,int transparent,int mss,int nodelay,int freebind,int use_systemd,int dscp,const char * additional)712 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
713 	int* reuseport, int transparent, int mss, int nodelay, int freebind,
714 	int use_systemd, int dscp, const char* additional)
715 {
716 	int s = -1;
717 	char* err;
718 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT)		\
719 	|| defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT)	\
720 	|| defined(IP_BINDANY) || defined(IP_FREEBIND)		\
721 	|| defined(SO_BINDANY) || defined(TCP_NODELAY)
722 	int on = 1;
723 #endif
724 #ifdef HAVE_SYSTEMD
725 	int got_fd_from_systemd = 0;
726 #endif
727 #ifdef USE_TCP_FASTOPEN
728 	int qlen;
729 #endif
730 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
731 	(void)transparent;
732 #endif
733 #if !defined(IP_FREEBIND)
734 	(void)freebind;
735 #endif
736 	verbose_print_addr(addr, additional);
737 	*noproto = 0;
738 #ifdef HAVE_SYSTEMD
739 	if (!use_systemd ||
740 	    (use_systemd
741 	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
742 					   addr->ai_addr, addr->ai_addrlen,
743 					   NULL)) == -1)) {
744 #else
745 	(void)use_systemd;
746 #endif
747 	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
748 #ifndef USE_WINSOCK
749 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
750 			*noproto = 1;
751 			return -1;
752 		}
753 #else
754 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
755 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
756 			*noproto = 1;
757 			return -1;
758 		}
759 #endif
760 		log_err("can't create socket: %s", sock_strerror(errno));
761 		return -1;
762 	}
763 	if(nodelay) {
764 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
765 		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
766 			(socklen_t)sizeof(on)) < 0) {
767 			#ifndef USE_WINSOCK
768 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
769 				strerror(errno));
770 			#else
771 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
772 				wsa_strerror(WSAGetLastError()));
773 			#endif
774 		}
775 #else
776 		log_warn(" setsockopt(TCP_NODELAY) unsupported");
777 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
778 	}
779 	if (mss > 0) {
780 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
781 		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
782 			(socklen_t)sizeof(mss)) < 0) {
783 			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
784 				sock_strerror(errno));
785 		} else {
786 			verbose(VERB_ALGO,
787 				" tcp socket mss set to %d", mss);
788 		}
789 #else
790 		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
791 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
792 	}
793 #ifdef HAVE_SYSTEMD
794 	} else {
795 		got_fd_from_systemd = 1;
796     }
797 #endif
798 #ifdef SO_REUSEADDR
799 	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
800 		(socklen_t)sizeof(on)) < 0) {
801 		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
802 			sock_strerror(errno));
803 		sock_close(s);
804 		return -1;
805 	}
806 #endif /* SO_REUSEADDR */
807 #ifdef IP_FREEBIND
808 	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
809 	    (socklen_t)sizeof(on)) < 0) {
810 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
811 		strerror(errno));
812 	}
813 #endif /* IP_FREEBIND */
814 #ifdef SO_REUSEPORT
815 	/* try to set SO_REUSEPORT so that incoming
816 	 * connections are distributed evenly among the receiving threads.
817 	 * Each thread must have its own socket bound to the same port,
818 	 * with SO_REUSEPORT set on each socket.
819 	 */
820 	if (reuseport && *reuseport &&
821 		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
822 		(socklen_t)sizeof(on)) < 0) {
823 #ifdef ENOPROTOOPT
824 		if(errno != ENOPROTOOPT || verbosity >= 3)
825 			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
826 				strerror(errno));
827 #endif
828 		/* this option is not essential, we can continue */
829 		*reuseport = 0;
830 	}
831 #else
832 	(void)reuseport;
833 #endif /* defined(SO_REUSEPORT) */
834 #if defined(IPV6_V6ONLY)
835 	if(addr->ai_family == AF_INET6 && v6only
836 #  ifdef HAVE_SYSTEMD
837 		/* Systemd wants to control if the socket is v6 only
838 		 * or both, with BindIPv6Only=default, ipv6-only or
839 		 * both in systemd.socket, so it is not set here. */
840 		&& !got_fd_from_systemd
841 #  endif
842 		) {
843 		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
844 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
845 			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
846 				sock_strerror(errno));
847 			sock_close(s);
848 			return -1;
849 		}
850 	}
851 #else
852 	(void)v6only;
853 #endif /* IPV6_V6ONLY */
854 #ifdef IP_TRANSPARENT
855 	if (transparent &&
856 	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
857 	    (socklen_t)sizeof(on)) < 0) {
858 		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
859 			strerror(errno));
860 	}
861 #elif defined(IP_BINDANY)
862 	if (transparent &&
863 	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
864 	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
865 	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
866 		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
867 		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
868 	}
869 #elif defined(SO_BINDANY)
870 	if (transparent &&
871 	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
872 	    sizeof(on)) < 0) {
873 		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
874 		strerror(errno));
875 	}
876 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
877 	err = set_ip_dscp(s, addr->ai_family, dscp);
878 	if(err != NULL)
879 		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
880 	if(
881 #ifdef HAVE_SYSTEMD
882 		!got_fd_from_systemd &&
883 #endif
884         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
885 #ifndef USE_WINSOCK
886 		/* detect freebsd jail with no ipv6 permission */
887 		if(addr->ai_family==AF_INET6 && errno==EINVAL)
888 			*noproto = 1;
889 		else {
890 			log_err_addr("can't bind socket", strerror(errno),
891 				(struct sockaddr_storage*)addr->ai_addr,
892 				addr->ai_addrlen);
893 		}
894 #else
895 		log_err_addr("can't bind socket",
896 			wsa_strerror(WSAGetLastError()),
897 			(struct sockaddr_storage*)addr->ai_addr,
898 			addr->ai_addrlen);
899 #endif
900 		sock_close(s);
901 		return -1;
902 	}
903 	if(!fd_set_nonblock(s)) {
904 		sock_close(s);
905 		return -1;
906 	}
907 	if(listen(s, TCP_BACKLOG) == -1) {
908 		log_err("can't listen: %s", sock_strerror(errno));
909 		sock_close(s);
910 		return -1;
911 	}
912 #ifdef USE_TCP_FASTOPEN
913 	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
914 	   against IP spoofing attacks as suggested in RFC7413 */
915 #ifdef __APPLE__
916 	/* OS X implementation only supports qlen of 1 via this call. Actual
917 	   value is configured by the net.inet.tcp.fastopen_backlog kernel param. */
918 	qlen = 1;
919 #else
920 	/* 5 is recommended on linux */
921 	qlen = 5;
922 #endif
923 	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
924 		  sizeof(qlen))) == -1 ) {
925 #ifdef ENOPROTOOPT
926 		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
927 		   disabled, except when verbosity enabled for debugging */
928 		if(errno != ENOPROTOOPT || verbosity >= 3) {
929 #endif
930 		  if(errno == EPERM) {
931 		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
932 		  } else {
933 		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
934 		  }
935 #ifdef ENOPROTOOPT
936 		}
937 #endif
938 	}
939 #endif
940 	return s;
941 }
942 
943 char*
set_ip_dscp(int socket,int addrfamily,int dscp)944 set_ip_dscp(int socket, int addrfamily, int dscp)
945 {
946 	int ds;
947 
948 	if(dscp == 0)
949 		return NULL;
950 	ds = dscp << 2;
951 	switch(addrfamily) {
952 	case AF_INET6:
953 	#ifdef IPV6_TCLASS
954 		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&ds,
955 			sizeof(ds)) < 0)
956 			return sock_strerror(errno);
957 		break;
958 	#else
959 		return "IPV6_TCLASS not defined on this system";
960 	#endif
961 	default:
962 		if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&ds, sizeof(ds)) < 0)
963 			return sock_strerror(errno);
964 		break;
965 	}
966 	return NULL;
967 }
968 
/**
 * Create a listening unix-domain stream socket at the given path.
 * With systemd support compiled in and use_systemd set, a socket handed
 * over by systemd_get_activated() is returned when one is available;
 * otherwise the socket is created here. An existing file at path is
 * unlinked before binding.
 * @param path: filesystem path of the socket.
 * @param noproto: set to 1 when local sockets are not supported by the
 *	build (no sys/un.h); not touched otherwise.
 * @param use_systemd: if true, try to fetch the socket from systemd first.
 * @return the nonblocking, listening socket, or -1 on error (logged).
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYSTEMD
	int ret;

	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1, NULL, 0, path)) != -1)
		return ret;
	else {
#endif
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#ifndef HAVE_SYSTEMD
	(void)use_systemd;
#endif

	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	(void)strlcpy(usock.sun_path, path, sizeof(usock.sun_path));

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;

#else
	(void)use_systemd;
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
#ifdef HAVE_SYSTEMD
	/* closes the else-block opened above; kept outside the HAVE_SYS_UN_H
	 * conditional so the braces balance in every configuration */
	}
#endif
}
1043 
1044 
/**
 * Create socket from getaddrinfo results.
 * Resolves ifname and port with getaddrinfo and opens a UDP socket (stype
 * SOCK_DGRAM) or a TCP accept socket (any other stype) on the first result.
 * On success a copy of the resolved address is stored in ub_sock.
 * @param stype: socket type; also written into hints->ai_socktype.
 * @param ifname: node name for getaddrinfo, may be NULL.
 * @param port: port number, passed to getaddrinfo as a decimal string.
 * @param hints: getaddrinfo hints; family and flags are set by the caller.
 * @param v6only: passed on to the socket creation routine.
 * @param noip6: cleared on entry; set to 1 when the failure means that the
 *	protocol is not available and hints ask for AF_INET6.
 * @param rcv: receive buffer size, used for UDP.
 * @param snd: send buffer size, used for UDP.
 * @param reuseport: passed on to the socket creation routine.
 * @param transparent: passed on to the socket creation routine.
 * @param tcp_mss: passed on to create_tcp_accept_sock.
 * @param nodelay: passed on to create_tcp_accept_sock.
 * @param freebind: passed on to the socket creation routine.
 * @param use_systemd: passed on to the socket creation routine.
 * @param dscp: passed on to the socket creation routine.
 * @param ub_sock: on success filled with the malloced address copy, its
 *	length, the socket, the address family from hints, and a NULL acl.
 * @param additional: extra text passed to verbose_print_addr and
 *	create_tcp_accept_sock.
 * @return the socket, or -1 on failure. On failure this function leaves no
 *	allocation of its own behind in ub_sock.
 */
static int
make_sock(int stype, const char* ifname, int port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	struct addrinfo *res = NULL;
	int r, s, inuse, noproto;
	char portbuf[32];
	snprintf(portbuf, sizeof(portbuf), "%d", port);
	hints->ai_socktype = stype;
	*noip6 = 0;
	if((r=getaddrinfo(ifname, portbuf, hints, &res)) != 0 || !res) {
#ifdef USE_WINSOCK
		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", portbuf, gai_strerror(r),
#ifdef EAI_SYSTEM
			(r==EAI_SYSTEM?(char*)strerror(errno):"")
#else
			""
#endif
		);
		return -1;
	}
	/* validate the result before it is handed to the socket routines */
	if(!res->ai_addr) {
		log_err("getaddrinfo returned no address");
		freeaddrinfo(res);
		return -1;
	}
	if(stype == SOCK_DGRAM) {
		verbose_print_addr(res, additional);
		s = create_udp_sock(res->ai_family, res->ai_socktype,
			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(s == -1 && inuse) {
			log_err("bind: address already in use");
		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	} else	{
		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp, additional);
		if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	}
	if(s == -1) {
		/* nothing was opened: there is no socket to close and no
		 * reason to copy the address */
		freeaddrinfo(res);
		return -1;
	}

	ub_sock->addr = memdup(res->ai_addr, res->ai_addrlen);
	ub_sock->addrlen = res->ai_addrlen;
	freeaddrinfo(res);
	if(!ub_sock->addr) {
		log_err("out of memory: allocate listening address");
		sock_close(s);
		return -1;
	}

	ub_sock->s = s;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return s;
}
1120 
/**
 * Make socket and first see if ifname contains port override info.
 * An interface written as "ifspec@port" is split at the first '@': the part
 * before it becomes the interface name and the number after it replaces the
 * port argument. Without '@' (or with a NULL ifname) the arguments are
 * passed to make_sock unchanged.
 * @param ifname: interface specification, optionally with "@port".
 * @param port: port to use when ifname carries no override.
 * @param noip6: set to 0 when the interface specification is rejected here;
 *	otherwise set by make_sock.
 * The remaining parameters are passed to make_sock as they are.
 * @return the socket from make_sock, or -1 on error.
 */
static int
make_sock_port(int stype, const char* ifname, int port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	/* make_sock accepts a NULL ifname, so do not hand NULL to strchr */
	const char* at = ifname ? strchr(ifname, '@') : NULL;
	char newif[128];
	size_t iflen;
	int ifport;

	if(!at) {
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock, additional);
	}

	/* override port with ifspec@port */
	iflen = (size_t)(at - ifname);
	if(iflen >= sizeof(newif)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	ifport = atoi(at+1);
	if(ifport <= 0 || ifport > 65535) {
		log_err("invalid portnumber in interface: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* iflen < sizeof(newif) was checked above, the terminator fits */
	memcpy(newif, ifname, iflen);
	newif[iflen] = 0;
	return make_sock(stype, newif, ifport, hints, v6only, noip6, rcv,
		snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock, additional);
}
1155 
1156 /**
1157  * Add port to open ports list.
1158  * @param list: list head. changed.
1159  * @param s: fd.
1160  * @param ftype: if fd is UDP.
1161  * @param pp2_enabled: if PROXYv2 is enabled for this port.
1162  * @param ub_sock: socket with address.
1163  * @return false on failure. list in unchanged then.
1164  */
1165 static int
port_insert(struct listen_port ** list,int s,enum listen_type ftype,int pp2_enabled,struct unbound_socket * ub_sock)1166 port_insert(struct listen_port** list, int s, enum listen_type ftype,
1167 	int pp2_enabled, struct unbound_socket* ub_sock)
1168 {
1169 	struct listen_port* item = (struct listen_port*)malloc(
1170 		sizeof(struct listen_port));
1171 	if(!item)
1172 		return 0;
1173 	item->next = *list;
1174 	item->fd = s;
1175 	item->ftype = ftype;
1176 	item->pp2_enabled = pp2_enabled;
1177 	item->socket = ub_sock;
1178 	*list = item;
1179 	return 1;
1180 }
1181 
/**
 * Set fd to receive software timestamps.
 * Uses SO_TIMESTAMPNS when linux/net_tstamp.h is available, SO_TIMESTAMP
 * when that option and SCM_TIMESTAMP are defined, and fails otherwise.
 * @param s: socket fd.
 * @return 1 when the option was set, 0 on failure (logged).
 */
static int
set_recvtimestamp(int s)
{
#ifdef HAVE_LINUX_NET_TSTAMP_H
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;

	if (setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags)) >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#elif defined(SO_TIMESTAMP) && defined(SCM_TIMESTAMP)
	int enable = 1;

	/* FreeBSD and also Linux. */
	if (setsockopt(s, SOL_SOCKET, SO_TIMESTAMP, (void*)&enable,
		(socklen_t)sizeof(enable)) >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMP, ...) failed: %s",
		strerror(errno));
	return 0;
#else
	log_err("packets timestamping is not supported on this platform");
	(void)s;
	return 0;
#endif
}
1209 
/**
 * Set fd to receive source address packet info.
 * For AF_INET6 this enables IPV6_RECVPKTINFO, or IPV6_PKTINFO if the former
 * is not defined. For AF_INET it enables IP_PKTINFO, or IP_RECVDSTADDR if
 * IP_PKTINFO is not defined but IP_RECVDSTADDR and IP_SENDSRCADDR are.
 * Other families are accepted without setting anything.
 * @param s: socket fd.
 * @param family: address family of the socket.
 * @return 1 on success, 0 if the option could not be set or the platform
 *	has no suitable option for the family (logged).
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* defined IPV6_RECVPKTINFO */

	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO */

	default:
		/* nothing to configure for other families */
		break;
	}
	return 1;
}
1264 
1265 /**
1266  * Helper for ports_open. Creates one interface (or NULL for default).
1267  * @param ifname: The interface ip address.
1268  * @param do_auto: use automatic interface detection.
1269  * 	If enabled, then ifname must be the wildcard name.
1270  * @param do_udp: if udp should be used.
1271  * @param do_tcp: if tcp should be used.
1272  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1273  * @param port: Port number to use.
1274  * @param list: list of open ports, appended to, changed to point to list head.
1275  * @param rcv: receive buffer size for UDP
1276  * @param snd: send buffer size for UDP
1277  * @param ssl_port: ssl service port number
1278  * @param tls_additional_port: list of additional ssl service port numbers.
1279  * @param https_port: DoH service port number
1280  * @param proxy_protocol_port: list of PROXYv2 port numbers.
1281  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1282  * 	set to false on exit if reuseport failed due to no kernel support.
1283  * @param transparent: set IP_TRANSPARENT socket option.
1284  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1285  * @param freebind: set IP_FREEBIND socket option.
1286  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1287  * @param use_systemd: if true, fetch sockets from systemd.
1288  * @param dnscrypt_port: dnscrypt service port number
1289  * @param dscp: DSCP to use.
1290  * @param quic_port: dns over quic port number.
1291  * @param http_notls_downstream: if no tls is used for https downstream.
1292  * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1293  * 	wait to discard if UDP packets have waited for long in the socket
1294  * 	buffer.
1295  * @return: returns false on error.
1296  */
1297 static int
ports_create_if(const char * ifname,int do_auto,int do_udp,int do_tcp,struct addrinfo * hints,int port,struct listen_port ** list,size_t rcv,size_t snd,int ssl_port,struct config_strlist * tls_additional_port,int https_port,struct config_strlist * proxy_protocol_port,int * reuseport,int transparent,int tcp_mss,int freebind,int http2_nodelay,int use_systemd,int dnscrypt_port,int dscp,int quic_port,int http_notls_downstream,int sock_queue_timeout)1298 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1299 	struct addrinfo *hints, int port, struct listen_port** list,
1300 	size_t rcv, size_t snd, int ssl_port,
1301 	struct config_strlist* tls_additional_port, int https_port,
1302 	struct config_strlist* proxy_protocol_port,
1303 	int* reuseport, int transparent, int tcp_mss, int freebind,
1304 	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
1305 	int quic_port, int http_notls_downstream, int sock_queue_timeout)
1306 {
1307 	int s, noip6=0;
1308 	int is_ssl = if_is_ssl(ifname, port, ssl_port, tls_additional_port);
1309 	int is_https = if_is_https(ifname, port, https_port);
1310 	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1311 	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1312 	int is_doq = if_is_quic(ifname, port, quic_port);
1313 	/* Always set TCP_NODELAY on TLS connection as it speeds up the TLS
1314 	 * handshake. DoH had already such option so we respect it.
1315 	 * Otherwise the server waits before sending more handshake data for
1316 	 * the client ACK (Nagle's algorithm), which is delayed because the
1317 	 * client waits for more data before ACKing (delayed ACK). */
1318 	int nodelay = is_https?http2_nodelay:is_ssl;
1319 	struct unbound_socket* ub_sock;
1320 	const char* add = NULL;
1321 
1322 	if(!do_udp && !do_tcp)
1323 		return 0;
1324 
1325 	if(is_pp2) {
1326 		if(is_dnscrypt) {
1327 			fatal_exit("PROXYv2 and DNSCrypt combination not "
1328 				"supported!");
1329 		} else if(is_https) {
1330 			fatal_exit("PROXYv2 and DoH combination not "
1331 				"supported!");
1332 		} else if(is_doq) {
1333 			fatal_exit("PROXYv2 and DoQ combination not "
1334 				"supported!");
1335 		}
1336 	}
1337 
1338 	/* Check if both UDP and TCP ports should be open.
1339 	 * In the case of encrypted channels, probably an unencrypted channel
1340 	 * at the same port is not desired. */
1341 	if((is_ssl || is_https) && !is_doq) do_udp = do_auto = 0;
1342 	if((is_doq) && !(is_https || is_ssl)) do_tcp = 0;
1343 
1344 	if(do_auto) {
1345 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1346 		if(!ub_sock)
1347 			return 0;
1348 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1349 			&noip6, rcv, snd, reuseport, transparent,
1350 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
1351 			(is_dnscrypt?"udpancil_dnscrypt":"udpancil"))) == -1) {
1352 			free(ub_sock->addr);
1353 			free(ub_sock);
1354 			if(noip6) {
1355 				log_warn("IPv6 protocol not available");
1356 				return 1;
1357 			}
1358 			return 0;
1359 		}
1360 		/* getting source addr packet info is highly non-portable */
1361 		if(!set_recvpktinfo(s, hints->ai_family)) {
1362 			sock_close(s);
1363 			free(ub_sock->addr);
1364 			free(ub_sock);
1365 			return 0;
1366 		}
1367 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1368 			log_warn("socket timestamping is not available");
1369 		}
1370 		if(!port_insert(list, s, is_dnscrypt
1371 			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1372 			is_pp2, ub_sock)) {
1373 			sock_close(s);
1374 			free(ub_sock->addr);
1375 			free(ub_sock);
1376 			return 0;
1377 		}
1378 	} else if(do_udp) {
1379 		enum listen_type udp_port_type;
1380 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1381 		if(!ub_sock)
1382 			return 0;
1383 		if(is_dnscrypt) {
1384 			udp_port_type = listen_type_udp_dnscrypt;
1385 			add = "dnscrypt";
1386 		} else if(is_doq) {
1387 			udp_port_type = listen_type_doq;
1388 			add = "doq";
1389 			if(if_listens_on(ifname, port, 53, NULL)) {
1390 				log_err("DNS over QUIC is strictly not "
1391 					"allowed on port 53 as per RFC 9250. "
1392 					"Port 53 is for DNS datagrams. Error "
1393 					"for interface '%s'.", ifname);
1394 				free(ub_sock->addr);
1395 				free(ub_sock);
1396 				return 0;
1397 			}
1398 		} else {
1399 			udp_port_type = listen_type_udp;
1400 			add = NULL;
1401 		}
1402 		/* regular udp socket */
1403 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1404 			&noip6, rcv, snd, reuseport, transparent,
1405 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
1406 			add)) == -1) {
1407 			free(ub_sock->addr);
1408 			free(ub_sock);
1409 			if(noip6) {
1410 				log_warn("IPv6 protocol not available");
1411 				return 1;
1412 			}
1413 			return 0;
1414 		}
1415 		if(udp_port_type == listen_type_doq) {
1416 			if(!set_recvpktinfo(s, hints->ai_family)) {
1417 				sock_close(s);
1418 				free(ub_sock->addr);
1419 				free(ub_sock);
1420 				return 0;
1421 			}
1422 		}
1423 		if(udp_port_type == listen_type_udp && sock_queue_timeout)
1424 			udp_port_type = listen_type_udpancil;
1425 		if (sock_queue_timeout) {
1426 			if(!set_recvtimestamp(s)) {
1427 				log_warn("socket timestamping is not available");
1428 			} else {
1429 				if(udp_port_type == listen_type_udp)
1430 					udp_port_type = listen_type_udpancil;
1431 			}
1432 		}
1433 		if(!port_insert(list, s, udp_port_type, is_pp2, ub_sock)) {
1434 			sock_close(s);
1435 			free(ub_sock->addr);
1436 			free(ub_sock);
1437 			return 0;
1438 		}
1439 	}
1440 	if(do_tcp) {
1441 		enum listen_type port_type;
1442 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1443 		if(!ub_sock)
1444 			return 0;
1445 		if(is_ssl) {
1446 			port_type = listen_type_ssl;
1447 			add = "tls";
1448 		} else if(is_https) {
1449 			port_type = listen_type_http;
1450 			add = "https";
1451 			if(http_notls_downstream)
1452 				add = "http";
1453 		} else if(is_dnscrypt) {
1454 			port_type = listen_type_tcp_dnscrypt;
1455 			add = "dnscrypt";
1456 		} else {
1457 			port_type = listen_type_tcp;
1458 			add = NULL;
1459 		}
1460 		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1461 			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1462 			freebind, use_systemd, dscp, ub_sock, add)) == -1) {
1463 			free(ub_sock->addr);
1464 			free(ub_sock);
1465 			if(noip6) {
1466 				/*log_warn("IPv6 protocol not available");*/
1467 				return 1;
1468 			}
1469 			return 0;
1470 		}
1471 		if(is_ssl)
1472 			verbose(VERB_ALGO, "setup TCP for SSL service");
1473 		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1474 			sock_close(s);
1475 			free(ub_sock->addr);
1476 			free(ub_sock);
1477 			return 0;
1478 		}
1479 	}
1480 	return 1;
1481 }
1482 
1483 /**
1484  * Add items to commpoint list in front.
1485  * @param c: commpoint to add.
1486  * @param front: listen struct.
1487  * @return: false on failure.
1488  */
1489 static int
listen_cp_insert(struct comm_point * c,struct listen_dnsport * front)1490 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1491 {
1492 	struct listen_list* item = (struct listen_list*)malloc(
1493 		sizeof(struct listen_list));
1494 	if(!item)
1495 		return 0;
1496 	item->com = c;
1497 	item->next = front->cps;
1498 	front->cps = item;
1499 	return 1;
1500 }
1501 
listen_setup_locks(void)1502 void listen_setup_locks(void)
1503 {
1504 	if(!stream_wait_lock_inited) {
1505 		lock_basic_init(&stream_wait_count_lock);
1506 		stream_wait_lock_inited = 1;
1507 	}
1508 	if(!http2_query_buffer_lock_inited) {
1509 		lock_basic_init(&http2_query_buffer_count_lock);
1510 		http2_query_buffer_lock_inited = 1;
1511 	}
1512 	if(!http2_response_buffer_lock_inited) {
1513 		lock_basic_init(&http2_response_buffer_count_lock);
1514 		http2_response_buffer_lock_inited = 1;
1515 	}
1516 }
1517 
listen_desetup_locks(void)1518 void listen_desetup_locks(void)
1519 {
1520 	if(stream_wait_lock_inited) {
1521 		stream_wait_lock_inited = 0;
1522 		lock_basic_destroy(&stream_wait_count_lock);
1523 	}
1524 	if(http2_query_buffer_lock_inited) {
1525 		http2_query_buffer_lock_inited = 0;
1526 		lock_basic_destroy(&http2_query_buffer_count_lock);
1527 	}
1528 	if(http2_response_buffer_lock_inited) {
1529 		http2_response_buffer_lock_inited = 0;
1530 		lock_basic_destroy(&http2_response_buffer_count_lock);
1531 	}
1532 }
1533 
1534 struct listen_dnsport*
listen_create(struct comm_base * base,struct listen_port * ports,size_t bufsize,int tcp_accept_count,int tcp_idle_timeout,int harden_large_queries,uint32_t http_max_streams,char * http_endpoint,int http_notls,struct tcl_list * tcp_conn_limit,void * dot_sslctx,void * doh_sslctx,void * quic_sslctx,struct dt_env * dtenv,struct doq_table * doq_table,struct ub_randstate * rnd,struct config_file * cfg,comm_point_callback_type * cb,void * cb_arg)1535 listen_create(struct comm_base* base, struct listen_port* ports,
1536 	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1537 	int harden_large_queries, uint32_t http_max_streams,
1538 	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
1539 	void* dot_sslctx, void* doh_sslctx, void* quic_sslctx,
1540 	struct dt_env* dtenv,
1541 	struct doq_table* doq_table,
1542 	struct ub_randstate* rnd,struct config_file* cfg,
1543 	comm_point_callback_type* cb, void *cb_arg)
1544 {
1545 	struct listen_dnsport* front = (struct listen_dnsport*)
1546 		malloc(sizeof(struct listen_dnsport));
1547 	if(!front)
1548 		return NULL;
1549 	front->cps = NULL;
1550 	front->udp_buff = sldns_buffer_new(bufsize);
1551 #ifdef USE_DNSCRYPT
1552 	front->dnscrypt_udp_buff = NULL;
1553 #endif
1554 	if(!front->udp_buff) {
1555 		free(front);
1556 		return NULL;
1557 	}
1558 
1559 	/* create comm points as needed */
1560 	while(ports) {
1561 		struct comm_point* cp = NULL;
1562 		if(ports->ftype == listen_type_udp ||
1563 		   ports->ftype == listen_type_udp_dnscrypt) {
1564 			cp = comm_point_create_udp(base, ports->fd,
1565 				front->udp_buff, ports->pp2_enabled, cb,
1566 				cb_arg, ports->socket);
1567 		} else if(ports->ftype == listen_type_doq) {
1568 #ifndef HAVE_NGTCP2
1569 			log_warn("Unbound is not compiled with "
1570 				"ngtcp2. This is required to use DNS "
1571 				"over QUIC.");
1572 #endif
1573 			cp = comm_point_create_doq(base, ports->fd,
1574 				front->udp_buff, cb, cb_arg, ports->socket,
1575 				doq_table, rnd, quic_sslctx, cfg);
1576 		} else if(ports->ftype == listen_type_tcp ||
1577 				ports->ftype == listen_type_tcp_dnscrypt) {
1578 			cp = comm_point_create_tcp(base, ports->fd,
1579 				tcp_accept_count, tcp_idle_timeout,
1580 				harden_large_queries, 0, NULL,
1581 				tcp_conn_limit, bufsize, front->udp_buff,
1582 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1583 				ports->socket);
1584 		} else if(ports->ftype == listen_type_ssl ||
1585 			ports->ftype == listen_type_http) {
1586 			cp = comm_point_create_tcp(base, ports->fd,
1587 				tcp_accept_count, tcp_idle_timeout,
1588 				harden_large_queries,
1589 				http_max_streams, http_endpoint,
1590 				tcp_conn_limit, bufsize, front->udp_buff,
1591 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1592 				ports->socket);
1593 			if(ports->ftype == listen_type_http) {
1594 				if(!doh_sslctx && !http_notls) {
1595 					log_warn("HTTPS port configured, but "
1596 						"no TLS tls-service-key or "
1597 						"tls-service-pem set");
1598 				}
1599 #ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
1600 				if(!http_notls) {
1601 					log_warn("Unbound is not compiled "
1602 						"with an OpenSSL version "
1603 						"supporting ALPN "
1604 						"(OpenSSL >= 1.0.2). This "
1605 						"is required to use "
1606 						"DNS-over-HTTPS");
1607 				}
1608 #endif
1609 #ifndef HAVE_NGHTTP2_NGHTTP2_H
1610 				log_warn("Unbound is not compiled with "
1611 					"nghttp2. This is required to use "
1612 					"DNS-over-HTTPS.");
1613 #endif
1614 			}
1615 		} else if(ports->ftype == listen_type_udpancil ||
1616 				  ports->ftype == listen_type_udpancil_dnscrypt) {
1617 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
1618 			cp = comm_point_create_udp_ancil(base, ports->fd,
1619 				front->udp_buff, ports->pp2_enabled, cb,
1620 				cb_arg, ports->socket);
1621 #else
1622 			log_warn("This system does not support UDP ancillary data.");
1623 #endif
1624 		}
1625 		if(!cp) {
1626 			log_err("can't create commpoint");
1627 			listen_delete(front);
1628 			return NULL;
1629 		}
1630 		if((http_notls && ports->ftype == listen_type_http) ||
1631 			(ports->ftype == listen_type_tcp) ||
1632 			(ports->ftype == listen_type_udp) ||
1633 			(ports->ftype == listen_type_udpancil) ||
1634 			(ports->ftype == listen_type_tcp_dnscrypt) ||
1635 			(ports->ftype == listen_type_udp_dnscrypt) ||
1636 			(ports->ftype == listen_type_udpancil_dnscrypt)) {
1637 			cp->ssl = NULL;
1638 		} else if(ports->ftype == listen_type_doq) {
1639 			cp->ssl = quic_sslctx;
1640 		} else if(ports->ftype == listen_type_http) {
1641 			cp->ssl = doh_sslctx;
1642 		} else {
1643 			cp->ssl = dot_sslctx;
1644 		}
1645 		cp->dtenv = dtenv;
1646 		cp->do_not_close = 1;
1647 #ifdef USE_DNSCRYPT
1648 		if (ports->ftype == listen_type_udp_dnscrypt ||
1649 			ports->ftype == listen_type_tcp_dnscrypt ||
1650 			ports->ftype == listen_type_udpancil_dnscrypt) {
1651 			cp->dnscrypt = 1;
1652 			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1653 			if(!cp->dnscrypt_buffer) {
1654 				log_err("can't alloc dnscrypt_buffer");
1655 				comm_point_delete(cp);
1656 				listen_delete(front);
1657 				return NULL;
1658 			}
1659 			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1660 		}
1661 #endif
1662 		if(!listen_cp_insert(cp, front)) {
1663 			log_err("malloc failed");
1664 			comm_point_delete(cp);
1665 			listen_delete(front);
1666 			return NULL;
1667 		}
1668 		ports = ports->next;
1669 	}
1670 	if(!front->cps) {
1671 		log_err("Could not open sockets to accept queries.");
1672 		listen_delete(front);
1673 		return NULL;
1674 	}
1675 
1676 	return front;
1677 }
1678 
1679 void
listen_list_delete(struct listen_list * list)1680 listen_list_delete(struct listen_list* list)
1681 {
1682 	struct listen_list *p = list, *pn;
1683 	while(p) {
1684 		pn = p->next;
1685 		comm_point_delete(p->com);
1686 		free(p);
1687 		p = pn;
1688 	}
1689 }
1690 
1691 void
listen_delete(struct listen_dnsport * front)1692 listen_delete(struct listen_dnsport* front)
1693 {
1694 	if(!front)
1695 		return;
1696 	listen_list_delete(front->cps);
1697 #ifdef USE_DNSCRYPT
1698 	if(front->dnscrypt_udp_buff &&
1699 		front->udp_buff != front->dnscrypt_udp_buff) {
1700 		sldns_buffer_free(front->dnscrypt_udp_buff);
1701 	}
1702 #endif
1703 	sldns_buffer_free(front->udp_buff);
1704 	free(front);
1705 }
1706 
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of a string to a malloced array of strings.
 * @param ip_addresses: the array; replaced by the grown array.
 * @param ip_addresses_size: number of entries; incremented on success.
 * @param str: string to copy.
 * @return false on allocation failure (logged).
 */
static int
ifa_list_append(char ***ip_addresses, int *ip_addresses_size, const char *str)
{
	void *grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	(*ip_addresses)[*ip_addresses_size] = strdup(str);
	if(!(*ip_addresses)[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Replace an interface name by the addresses configured on that interface.
 * search_ifa is an interface name, optionally followed by "@port" (split at
 * the last '@'). For every IPv4 address, and with INET6 every IPv6 address,
 * of an interface with that name the text "address[%scope][@port]" is
 * appended to the array. If nothing was appended, search_ifa itself is
 * appended unchanged.
 * @param ifas: interface list to search.
 * @param search_ifa: name to look for.
 * @param ip_addresses: malloced array of malloced strings, appended to.
 * @param ip_addresses_size: number of entries in the array, updated.
 * @return false on failure (logged); the array may have grown already.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	const int size_on_entry = *ip_addresses_size;
	/* the name and the optional @port part are the same for every
	 * interface in the list, so split search_ifa once */
	const char *at = strrchr(search_ifa, '@');
	const char *port_suffix = at ? at : "";
	const size_t name_len = at ? (size_t)(at - search_ifa)
		: strlen(search_ifa);
	struct ifaddrs *ifa;

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name has to match exactly */
		if(strlen(ifa->ifa_name) != name_len ||
			strncmp(ifa->ifa_name, search_ifa, name_len) != 0)
			continue;
		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		if(family == AF_INET) {
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			char a4[INET_ADDRSTRLEN + 1];
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, port_suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			char a6[INET6_ADDRSTRLEN + 1];
			char scope_name[IF_NAMESIZE + 1];
			scope_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* scope_name stays empty if the lookup fails */
			(void)if_indextoname(in6->sin6_scope_id, scope_name);
			if(scope_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, scope_name,
					port_suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, port_suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ifa_list_append(ip_addresses, ip_addresses_size, addr_buf))
			return 0;
	}

	/* no address found for the name: keep the name itself */
	if(*ip_addresses_size == size_on_entry) {
		if(!ifa_list_append(ip_addresses, ip_addresses_size,
			search_ifa))
			return 0;
	}
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1810 
resolve_interface_names(char ** ifs,int num_ifs,struct config_strlist * list,char *** resif,int * num_resif)1811 int resolve_interface_names(char** ifs, int num_ifs,
1812 	struct config_strlist* list, char*** resif, int* num_resif)
1813 {
1814 #ifdef HAVE_GETIFADDRS
1815 	struct ifaddrs *addrs = NULL;
1816 	if(num_ifs == 0 && list == NULL) {
1817 		*resif = NULL;
1818 		*num_resif = 0;
1819 		return 1;
1820 	}
1821 	if(getifaddrs(&addrs) == -1) {
1822 		log_err("failed to list interfaces: getifaddrs: %s",
1823 			strerror(errno));
1824 		freeifaddrs(addrs);
1825 		return 0;
1826 	}
1827 	if(ifs) {
1828 		int i;
1829 		for(i=0; i<num_ifs; i++) {
1830 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1831 				freeifaddrs(addrs);
1832 				config_del_strarray(*resif, *num_resif);
1833 				*resif = NULL;
1834 				*num_resif = 0;
1835 				return 0;
1836 			}
1837 		}
1838 	}
1839 	if(list) {
1840 		struct config_strlist* p;
1841 		for(p = list; p; p = p->next) {
1842 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1843 				freeifaddrs(addrs);
1844 				config_del_strarray(*resif, *num_resif);
1845 				*resif = NULL;
1846 				*num_resif = 0;
1847 				return 0;
1848 			}
1849 }
1850 	}
1851 	freeifaddrs(addrs);
1852 	return 1;
1853 #else
1854 	struct config_strlist* p;
1855 	if(num_ifs == 0 && list == NULL) {
1856 		*resif = NULL;
1857 		*num_resif = 0;
1858 		return 1;
1859 	}
1860 	*num_resif = num_ifs;
1861 	for(p = list; p; p = p->next) {
1862 		(*num_resif)++;
1863 	}
1864 	*resif = calloc(*num_resif, sizeof(**resif));
1865 	if(!*resif) {
1866 		log_err("out of memory");
1867 		return 0;
1868 	}
1869 	if(ifs) {
1870 		int i;
1871 		for(i=0; i<num_ifs; i++) {
1872 			(*resif)[i] = strdup(ifs[i]);
1873 			if(!((*resif)[i])) {
1874 				log_err("out of memory");
1875 				config_del_strarray(*resif, *num_resif);
1876 				*resif = NULL;
1877 				*num_resif = 0;
1878 				return 0;
1879 			}
1880 		}
1881 	}
1882 	if(list) {
1883 		int idx = num_ifs;
1884 		for(p = list; p; p = p->next) {
1885 			(*resif)[idx] = strdup(p->str);
1886 			if(!((*resif)[idx])) {
1887 				log_err("out of memory");
1888 				config_del_strarray(*resif, *num_resif);
1889 				*resif = NULL;
1890 				*num_resif = 0;
1891 				return 0;
1892 			}
1893 			idx++;
1894 		}
1895 	}
1896 	return 1;
1897 #endif /* HAVE_GETIFADDRS */
1898 }
1899 
1900 struct listen_port*
listening_ports_open(struct config_file * cfg,char ** ifs,int num_ifs,int * reuseport)1901 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1902 	int* reuseport)
1903 {
1904 	struct listen_port* list = NULL;
1905 	struct addrinfo hints;
1906 	int i, do_ip4, do_ip6;
1907 	int do_tcp, do_auto;
1908 	do_ip4 = cfg->do_ip4;
1909 	do_ip6 = cfg->do_ip6;
1910 	do_tcp = cfg->do_tcp;
1911 	do_auto = cfg->if_automatic && cfg->do_udp;
1912 	if(cfg->incoming_num_tcp == 0)
1913 		do_tcp = 0;
1914 
1915 	/* getaddrinfo */
1916 	memset(&hints, 0, sizeof(hints));
1917 	hints.ai_flags = AI_PASSIVE;
1918 	/* no name lookups on our listening ports */
1919 	if(num_ifs > 0)
1920 		hints.ai_flags |= AI_NUMERICHOST;
1921 	hints.ai_family = AF_UNSPEC;
1922 #ifndef INET6
1923 	do_ip6 = 0;
1924 #endif
1925 	if(!do_ip4 && !do_ip6) {
1926 		return NULL;
1927 	}
1928 	/* create ip4 and ip6 ports so that return addresses are nice. */
1929 	if(do_auto || num_ifs == 0) {
1930 		if(do_auto && cfg->if_automatic_ports &&
1931 			cfg->if_automatic_ports[0]!=0) {
1932 			char* now = cfg->if_automatic_ports;
1933 			while(now && *now) {
1934 				char* after;
1935 				int extraport;
1936 				while(isspace((unsigned char)*now))
1937 					now++;
1938 				if(!*now)
1939 					break;
1940 				after = now;
1941 				extraport = (int)strtol(now, &after, 10);
1942 				if(extraport < 0 || extraport > 65535) {
1943 					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1944 					listening_ports_free(list);
1945 					return NULL;
1946 				}
1947 				if(extraport == 0 && now == after) {
1948 					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1949 					listening_ports_free(list);
1950 					return NULL;
1951 				}
1952 				now = after;
1953 				if(do_ip6) {
1954 					hints.ai_family = AF_INET6;
1955 					if(!ports_create_if("::0",
1956 						do_auto, cfg->do_udp, do_tcp,
1957 						&hints, extraport, &list,
1958 						cfg->so_rcvbuf, cfg->so_sndbuf,
1959 						cfg->ssl_port, cfg->tls_additional_port,
1960 						cfg->https_port,
1961 						cfg->proxy_protocol_port,
1962 						reuseport, cfg->ip_transparent,
1963 						cfg->tcp_mss, cfg->ip_freebind,
1964 						cfg->http_nodelay, cfg->use_systemd,
1965 						cfg->dnscrypt_port, cfg->ip_dscp,
1966 						cfg->quic_port, cfg->http_notls_downstream,
1967 						cfg->sock_queue_timeout)) {
1968 						listening_ports_free(list);
1969 						return NULL;
1970 					}
1971 				}
1972 				if(do_ip4) {
1973 					hints.ai_family = AF_INET;
1974 					if(!ports_create_if("0.0.0.0",
1975 						do_auto, cfg->do_udp, do_tcp,
1976 						&hints, extraport, &list,
1977 						cfg->so_rcvbuf, cfg->so_sndbuf,
1978 						cfg->ssl_port, cfg->tls_additional_port,
1979 						cfg->https_port,
1980 						cfg->proxy_protocol_port,
1981 						reuseport, cfg->ip_transparent,
1982 						cfg->tcp_mss, cfg->ip_freebind,
1983 						cfg->http_nodelay, cfg->use_systemd,
1984 						cfg->dnscrypt_port, cfg->ip_dscp,
1985 						cfg->quic_port, cfg->http_notls_downstream,
1986 						cfg->sock_queue_timeout)) {
1987 						listening_ports_free(list);
1988 						return NULL;
1989 					}
1990 				}
1991 			}
1992 			return list;
1993 		}
1994 		if(do_ip6) {
1995 			hints.ai_family = AF_INET6;
1996 			if(!ports_create_if(do_auto?"::0":"::1",
1997 				do_auto, cfg->do_udp, do_tcp,
1998 				&hints, cfg->port, &list,
1999 				cfg->so_rcvbuf, cfg->so_sndbuf,
2000 				cfg->ssl_port, cfg->tls_additional_port,
2001 				cfg->https_port, cfg->proxy_protocol_port,
2002 				reuseport, cfg->ip_transparent,
2003 				cfg->tcp_mss, cfg->ip_freebind,
2004 				cfg->http_nodelay, cfg->use_systemd,
2005 				cfg->dnscrypt_port, cfg->ip_dscp,
2006 				cfg->quic_port, cfg->http_notls_downstream,
2007 				cfg->sock_queue_timeout)) {
2008 				listening_ports_free(list);
2009 				return NULL;
2010 			}
2011 		}
2012 		if(do_ip4) {
2013 			hints.ai_family = AF_INET;
2014 			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
2015 				do_auto, cfg->do_udp, do_tcp,
2016 				&hints, cfg->port, &list,
2017 				cfg->so_rcvbuf, cfg->so_sndbuf,
2018 				cfg->ssl_port, cfg->tls_additional_port,
2019 				cfg->https_port, cfg->proxy_protocol_port,
2020 				reuseport, cfg->ip_transparent,
2021 				cfg->tcp_mss, cfg->ip_freebind,
2022 				cfg->http_nodelay, cfg->use_systemd,
2023 				cfg->dnscrypt_port, cfg->ip_dscp,
2024 				cfg->quic_port, cfg->http_notls_downstream,
2025 				cfg->sock_queue_timeout)) {
2026 				listening_ports_free(list);
2027 				return NULL;
2028 			}
2029 		}
2030 	} else for(i = 0; i<num_ifs; i++) {
2031 		if(str_is_ip6(ifs[i])) {
2032 			if(!do_ip6)
2033 				continue;
2034 			hints.ai_family = AF_INET6;
2035 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
2036 				do_tcp, &hints, cfg->port, &list,
2037 				cfg->so_rcvbuf, cfg->so_sndbuf,
2038 				cfg->ssl_port, cfg->tls_additional_port,
2039 				cfg->https_port, cfg->proxy_protocol_port,
2040 				reuseport, cfg->ip_transparent,
2041 				cfg->tcp_mss, cfg->ip_freebind,
2042 				cfg->http_nodelay, cfg->use_systemd,
2043 				cfg->dnscrypt_port, cfg->ip_dscp,
2044 				cfg->quic_port, cfg->http_notls_downstream,
2045 				cfg->sock_queue_timeout)) {
2046 				listening_ports_free(list);
2047 				return NULL;
2048 			}
2049 		} else {
2050 			if(!do_ip4)
2051 				continue;
2052 			hints.ai_family = AF_INET;
2053 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
2054 				do_tcp, &hints, cfg->port, &list,
2055 				cfg->so_rcvbuf, cfg->so_sndbuf,
2056 				cfg->ssl_port, cfg->tls_additional_port,
2057 				cfg->https_port, cfg->proxy_protocol_port,
2058 				reuseport, cfg->ip_transparent,
2059 				cfg->tcp_mss, cfg->ip_freebind,
2060 				cfg->http_nodelay, cfg->use_systemd,
2061 				cfg->dnscrypt_port, cfg->ip_dscp,
2062 				cfg->quic_port, cfg->http_notls_downstream,
2063 				cfg->sock_queue_timeout)) {
2064 				listening_ports_free(list);
2065 				return NULL;
2066 			}
2067 		}
2068 	}
2069 
2070 	return list;
2071 }
2072 
listening_ports_free(struct listen_port * list)2073 void listening_ports_free(struct listen_port* list)
2074 {
2075 	struct listen_port* nx;
2076 	while(list) {
2077 		nx = list->next;
2078 		if(list->fd != -1) {
2079 			sock_close(list->fd);
2080 		}
2081 		/* rc_ports don't have ub_socket */
2082 		if(list->socket) {
2083 			free(list->socket->addr);
2084 			free(list->socket);
2085 		}
2086 		free(list);
2087 		list = nx;
2088 	}
2089 }
2090 
listen_get_mem(struct listen_dnsport * listen)2091 size_t listen_get_mem(struct listen_dnsport* listen)
2092 {
2093 	struct listen_list* p;
2094 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
2095 		sizeof(*listen->udp_buff) +
2096 		sldns_buffer_capacity(listen->udp_buff);
2097 #ifdef USE_DNSCRYPT
2098 	s += sizeof(*listen->dnscrypt_udp_buff);
2099 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
2100 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
2101 	}
2102 #endif
2103 	for(p = listen->cps; p; p = p->next) {
2104 		s += sizeof(*p);
2105 		s += comm_point_get_mem(p->com);
2106 	}
2107 	return s;
2108 }
2109 
listen_stop_accept(struct listen_dnsport * listen)2110 void listen_stop_accept(struct listen_dnsport* listen)
2111 {
2112 	/* do not stop the ones that have no tcp_free list
2113 	 * (they have already stopped listening) */
2114 	struct listen_list* p;
2115 	for(p=listen->cps; p; p=p->next) {
2116 		if(p->com->type == comm_tcp_accept &&
2117 			p->com->tcp_free != NULL) {
2118 			comm_point_stop_listening(p->com);
2119 		}
2120 	}
2121 }
2122 
listen_start_accept(struct listen_dnsport * listen)2123 void listen_start_accept(struct listen_dnsport* listen)
2124 {
2125 	/* do not start the ones that have no tcp_free list, it is no
2126 	 * use to listen to them because they have no free tcp handlers */
2127 	struct listen_list* p;
2128 	for(p=listen->cps; p; p=p->next) {
2129 		if(p->com->type == comm_tcp_accept &&
2130 			p->com->tcp_free != NULL) {
2131 			comm_point_start_listening(p->com, -1, -1);
2132 		}
2133 	}
2134 }
2135 
2136 struct tcp_req_info*
tcp_req_info_create(struct sldns_buffer * spoolbuf)2137 tcp_req_info_create(struct sldns_buffer* spoolbuf)
2138 {
2139 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2140 	if(!req) {
2141 		log_err("malloc failure for new stream outoforder processing structure");
2142 		return NULL;
2143 	}
2144 	memset(req, 0, sizeof(*req));
2145 	req->spool_buffer = spoolbuf;
2146 	return req;
2147 }
2148 
/**
 * Clear and free a tcp_req_info structure.
 * @param req: structure to delete, NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		/* req->cp points back at the commpoint that owns this struct
		 * and that called delete on us; the spool_buffer is the
		 * shared udp buffer.  Neither is deleted here. */
		free(req);
	}
}
2159 
tcp_req_info_clear(struct tcp_req_info * req)2160 void tcp_req_info_clear(struct tcp_req_info* req)
2161 {
2162 	struct tcp_req_open_item* open, *nopen;
2163 	struct tcp_req_done_item* item, *nitem;
2164 	if(!req) return;
2165 
2166 	/* free outstanding request mesh reply entries */
2167 	open = req->open_req_list;
2168 	while(open) {
2169 		nopen = open->next;
2170 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2171 		free(open);
2172 		open = nopen;
2173 	}
2174 	req->open_req_list = NULL;
2175 	req->num_open_req = 0;
2176 
2177 	/* free pending writable result packets */
2178 	item = req->done_req_list;
2179 	while(item) {
2180 		nitem = item->next;
2181 		lock_basic_lock(&stream_wait_count_lock);
2182 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2183 			+item->len);
2184 		lock_basic_unlock(&stream_wait_count_lock);
2185 		free(item->buf);
2186 		free(item);
2187 		item = nitem;
2188 	}
2189 	req->done_req_list = NULL;
2190 	req->num_done_req = 0;
2191 	req->read_is_closed = 0;
2192 }
2193 
2194 void
tcp_req_info_remove_mesh_state(struct tcp_req_info * req,struct mesh_state * m)2195 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2196 {
2197 	struct tcp_req_open_item* open, *prev = NULL;
2198 	if(!req || !m) return;
2199 	open = req->open_req_list;
2200 	while(open) {
2201 		if(open->mesh_state == m) {
2202 			struct tcp_req_open_item* next;
2203 			if(prev) prev->next = open->next;
2204 			else req->open_req_list = open->next;
2205 			/* caller has to manage the mesh state reply entry */
2206 			next = open->next;
2207 			free(open);
2208 			req->num_open_req --;
2209 
2210 			/* prev = prev; */
2211 			open = next;
2212 			continue;
2213 		}
2214 		prev = open;
2215 		open = open->next;
2216 	}
2217 }
2218 
2219 /** setup listening for read or write */
2220 static void
tcp_req_info_setup_listen(struct tcp_req_info * req)2221 tcp_req_info_setup_listen(struct tcp_req_info* req)
2222 {
2223 	int wr = 0;
2224 	int rd = 0;
2225 
2226 	if(req->cp->tcp_byte_count != 0) {
2227 		/* cannot change, halfway through */
2228 		return;
2229 	}
2230 
2231 	if(!req->cp->tcp_is_reading)
2232 		wr = 1;
2233 	if(!req->read_is_closed)
2234 		rd = 1;
2235 
2236 	if(wr) {
2237 		req->cp->tcp_is_reading = 0;
2238 		comm_point_stop_listening(req->cp);
2239 		comm_point_start_listening(req->cp, -1,
2240 			adjusted_tcp_timeout(req->cp));
2241 	} else if(rd) {
2242 		req->cp->tcp_is_reading = 1;
2243 		comm_point_stop_listening(req->cp);
2244 		comm_point_start_listening(req->cp, -1,
2245 			adjusted_tcp_timeout(req->cp));
2246 		/* and also read it (from SSL stack buffers), so
2247 		 * no event read event is expected since the remainder of
2248 		 * the TLS frame is sitting in the buffers. */
2249 		req->read_again = 1;
2250 	} else {
2251 		comm_point_stop_listening(req->cp);
2252 		comm_point_start_listening(req->cp, -1,
2253 			adjusted_tcp_timeout(req->cp));
2254 		comm_point_listen_for_rw(req->cp, 0, 0);
2255 	}
2256 }
2257 
2258 /** remove first item from list of pending results */
2259 static struct tcp_req_done_item*
tcp_req_info_pop_done(struct tcp_req_info * req)2260 tcp_req_info_pop_done(struct tcp_req_info* req)
2261 {
2262 	struct tcp_req_done_item* item;
2263 	log_assert(req->num_done_req > 0 && req->done_req_list);
2264 	item = req->done_req_list;
2265 	lock_basic_lock(&stream_wait_count_lock);
2266 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2267 	lock_basic_unlock(&stream_wait_count_lock);
2268 	req->done_req_list = req->done_req_list->next;
2269 	req->num_done_req --;
2270 	return item;
2271 }
2272 
2273 /** Send given buffer and setup to write */
2274 static void
tcp_req_info_start_write_buf(struct tcp_req_info * req,uint8_t * buf,size_t len)2275 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2276 	size_t len)
2277 {
2278 	sldns_buffer_clear(req->cp->buffer);
2279 	sldns_buffer_write(req->cp->buffer, buf, len);
2280 	sldns_buffer_flip(req->cp->buffer);
2281 
2282 	req->cp->tcp_is_reading = 0; /* we are now writing */
2283 }
2284 
2285 /** pick up the next result and start writing it to the channel */
2286 static void
tcp_req_pickup_next_result(struct tcp_req_info * req)2287 tcp_req_pickup_next_result(struct tcp_req_info* req)
2288 {
2289 	if(req->num_done_req > 0) {
2290 		/* unlist the done item from the list of pending results */
2291 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2292 		tcp_req_info_start_write_buf(req, item->buf, item->len);
2293 		free(item->buf);
2294 		free(item);
2295 	}
2296 }
2297 
2298 /** the read channel has closed */
2299 int
tcp_req_info_handle_read_close(struct tcp_req_info * req)2300 tcp_req_info_handle_read_close(struct tcp_req_info* req)
2301 {
2302 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2303 	/* reset byte count for (potential) partial read */
2304 	req->cp->tcp_byte_count = 0;
2305 	/* if we still have results to write, pick up next and write it */
2306 	if(req->num_done_req != 0) {
2307 		tcp_req_pickup_next_result(req);
2308 		tcp_req_info_setup_listen(req);
2309 		return 1;
2310 	}
2311 	/* if nothing to do, this closes the connection */
2312 	if(req->num_open_req == 0 && req->num_done_req == 0)
2313 		return 0;
2314 	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2315 	req->read_is_closed = 1;
2316 	tcp_req_info_setup_listen(req);
2317 	return 1;
2318 }
2319 
2320 void
tcp_req_info_handle_writedone(struct tcp_req_info * req)2321 tcp_req_info_handle_writedone(struct tcp_req_info* req)
2322 {
2323 	/* back to reading state, we finished this write event */
2324 	sldns_buffer_clear(req->cp->buffer);
2325 	if(req->num_done_req == 0 && req->read_is_closed) {
2326 		/* no more to write and nothing to read, close it */
2327 		comm_point_drop_reply(&req->cp->repinfo);
2328 		return;
2329 	}
2330 	req->cp->tcp_is_reading = 1;
2331 	/* see if another result needs writing */
2332 	tcp_req_pickup_next_result(req);
2333 
2334 	/* see if there is more to write, if not stop_listening for writing */
2335 	/* see if new requests are allowed, if so, start_listening
2336 	 * for reading */
2337 	tcp_req_info_setup_listen(req);
2338 }
2339 
2340 void
tcp_req_info_handle_readdone(struct tcp_req_info * req)2341 tcp_req_info_handle_readdone(struct tcp_req_info* req)
2342 {
2343 	struct comm_point* c = req->cp;
2344 
2345 	/* we want to read up several requests, unless there are
2346 	 * pending answers */
2347 
2348 	req->is_drop = 0;
2349 	req->is_reply = 0;
2350 	req->in_worker_handle = 1;
2351 	sldns_buffer_set_limit(req->spool_buffer, 0);
2352 	/* handle the current request */
2353 	/* this calls the worker handle request routine that could give
2354 	 * a cache response, or localdata response, or drop the reply,
2355 	 * or schedule a mesh entry for later */
2356 	fptr_ok(fptr_whitelist_comm_point(c->callback));
2357 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2358 		req->in_worker_handle = 0;
2359 		/* there is an answer, put it up.  It is already in the
2360 		 * c->buffer, just send it. */
2361 		/* since we were just reading a query, the channel is
2362 		 * clear to write to */
2363 	send_it:
2364 		c->tcp_is_reading = 0;
2365 		comm_point_stop_listening(c);
2366 		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2367 		return;
2368 	}
2369 	req->in_worker_handle = 0;
2370 	/* it should be waiting in the mesh for recursion.
2371 	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2372 	 * Then the mesh state has been cleared. */
2373 	if(req->is_drop) {
2374 		/* the reply has been dropped, stream has been closed. */
2375 		return;
2376 	}
2377 	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2378 	 * something like servfail then we pick up that reply below. */
2379 	if(req->is_reply) {
2380 		goto send_it;
2381 	}
2382 
2383 	sldns_buffer_clear(c->buffer);
2384 	/* if pending answers, pick up an answer and start sending it */
2385 	tcp_req_pickup_next_result(req);
2386 
2387 	/* if answers pending, start sending answers */
2388 	/* read more requests if we can have more requests */
2389 	tcp_req_info_setup_listen(req);
2390 }
2391 
2392 int
tcp_req_info_add_meshstate(struct tcp_req_info * req,struct mesh_area * mesh,struct mesh_state * m)2393 tcp_req_info_add_meshstate(struct tcp_req_info* req,
2394 	struct mesh_area* mesh, struct mesh_state* m)
2395 {
2396 	struct tcp_req_open_item* item;
2397 	log_assert(req && mesh && m);
2398 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2399 	if(!item) return 0;
2400 	item->next = req->open_req_list;
2401 	item->mesh = mesh;
2402 	item->mesh_state = m;
2403 	req->open_req_list = item;
2404 	req->num_open_req++;
2405 	return 1;
2406 }
2407 
2408 /** Add a result to the result list.  At the end. */
2409 static int
tcp_req_info_add_result(struct tcp_req_info * req,uint8_t * buf,size_t len)2410 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2411 {
2412 	struct tcp_req_done_item* last = NULL;
2413 	struct tcp_req_done_item* item;
2414 	size_t space;
2415 
2416 	/* see if we have space */
2417 	space = sizeof(struct tcp_req_done_item) + len;
2418 	lock_basic_lock(&stream_wait_count_lock);
2419 	if(stream_wait_count + space > stream_wait_max) {
2420 		lock_basic_unlock(&stream_wait_count_lock);
2421 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2422 		return 0;
2423 	}
2424 	stream_wait_count += space;
2425 	lock_basic_unlock(&stream_wait_count_lock);
2426 
2427 	/* find last element */
2428 	last = req->done_req_list;
2429 	while(last && last->next)
2430 		last = last->next;
2431 
2432 	/* create new element */
2433 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2434 	if(!item) {
2435 		log_err("malloc failure, for stream result list");
2436 		return 0;
2437 	}
2438 	item->next = NULL;
2439 	item->len = len;
2440 	item->buf = memdup(buf, len);
2441 	if(!item->buf) {
2442 		free(item);
2443 		log_err("malloc failure, adding reply to stream result list");
2444 		return 0;
2445 	}
2446 
2447 	/* link in */
2448 	if(last) last->next = item;
2449 	else req->done_req_list = item;
2450 	req->num_done_req++;
2451 	return 1;
2452 }
2453 
2454 void
tcp_req_info_send_reply(struct tcp_req_info * req)2455 tcp_req_info_send_reply(struct tcp_req_info* req)
2456 {
2457 	if(req->in_worker_handle) {
2458 		/* reply from mesh is in the spool_buffer */
2459 		/* copy now, so that the spool buffer is free for other tasks
2460 		 * before the callback is done */
2461 		sldns_buffer_clear(req->cp->buffer);
2462 		sldns_buffer_write(req->cp->buffer,
2463 			sldns_buffer_begin(req->spool_buffer),
2464 			sldns_buffer_limit(req->spool_buffer));
2465 		sldns_buffer_flip(req->cp->buffer);
2466 		req->is_reply = 1;
2467 		return;
2468 	}
2469 	/* now that the query has been handled, that mesh_reply entry
2470 	 * should be removed, from the tcp_req_info list,
2471 	 * the mesh state cleanup removes then with region_cleanup and
2472 	 * replies_sent true. */
2473 	/* see if we can send it straight away (we are not doing
2474 	 * anything else).  If so, copy to buffer and start */
2475 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2476 		/* buffer is free, and was ready to read new query into,
2477 		 * but we are now going to use it to send this answer */
2478 		tcp_req_info_start_write_buf(req,
2479 			sldns_buffer_begin(req->spool_buffer),
2480 			sldns_buffer_limit(req->spool_buffer));
2481 		/* switch to listen to write events */
2482 		comm_point_stop_listening(req->cp);
2483 		comm_point_start_listening(req->cp, -1,
2484 			adjusted_tcp_timeout(req->cp));
2485 		return;
2486 	}
2487 	/* queue up the answer behind the others already pending */
2488 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2489 		sldns_buffer_limit(req->spool_buffer))) {
2490 		/* drop the connection, we are out of resources */
2491 		comm_point_drop_reply(&req->cp->repinfo);
2492 	}
2493 }
2494 
tcp_req_info_get_stream_buffer_size(void)2495 size_t tcp_req_info_get_stream_buffer_size(void)
2496 {
2497 	size_t s;
2498 	if(!stream_wait_lock_inited)
2499 		return stream_wait_count;
2500 	lock_basic_lock(&stream_wait_count_lock);
2501 	s = stream_wait_count;
2502 	lock_basic_unlock(&stream_wait_count_lock);
2503 	return s;
2504 }
2505 
http2_get_query_buffer_size(void)2506 size_t http2_get_query_buffer_size(void)
2507 {
2508 	size_t s;
2509 	if(!http2_query_buffer_lock_inited)
2510 		return http2_query_buffer_count;
2511 	lock_basic_lock(&http2_query_buffer_count_lock);
2512 	s = http2_query_buffer_count;
2513 	lock_basic_unlock(&http2_query_buffer_count_lock);
2514 	return s;
2515 }
2516 
http2_get_response_buffer_size(void)2517 size_t http2_get_response_buffer_size(void)
2518 {
2519 	size_t s;
2520 	if(!http2_response_buffer_lock_inited)
2521 		return http2_response_buffer_count;
2522 	lock_basic_lock(&http2_response_buffer_count_lock);
2523 	s = http2_response_buffer_count;
2524 	lock_basic_unlock(&http2_response_buffer_count_lock);
2525 	return s;
2526 }
2527 
2528 #ifdef HAVE_NGHTTP2
2529 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
http2_submit_response_read_callback(nghttp2_session * ATTR_UNUSED (session),int32_t stream_id,uint8_t * buf,size_t length,uint32_t * data_flags,nghttp2_data_source * source,void * ATTR_UNUSED (cb_arg))2530 static ssize_t http2_submit_response_read_callback(
2531 	nghttp2_session* ATTR_UNUSED(session),
2532 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2533 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2534 {
2535 	struct http2_stream* h2_stream;
2536 	struct http2_session* h2_session = source->ptr;
2537 	size_t copylen = length;
2538 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2539 		h2_session->session, stream_id))) {
2540 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2541 			"stream");
2542 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2543 	}
2544 	if(!h2_stream->rbuffer ||
2545 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2546 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2547 			"available in rbuffer");
2548 		/* rbuffer will be free'd in frame close cb */
2549 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2550 	}
2551 
2552 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2553 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2554 	if(copylen > SSIZE_MAX)
2555 		copylen = SSIZE_MAX; /* will probably never happen */
2556 
2557 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2558 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2559 
2560 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2561 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2562 		lock_basic_lock(&http2_response_buffer_count_lock);
2563 		http2_response_buffer_count -=
2564 			sldns_buffer_capacity(h2_stream->rbuffer);
2565 		lock_basic_unlock(&http2_response_buffer_count_lock);
2566 		sldns_buffer_free(h2_stream->rbuffer);
2567 		h2_stream->rbuffer = NULL;
2568 	}
2569 
2570 	return copylen;
2571 }
2572 
2573 /**
2574  * Send RST_STREAM frame for stream.
2575  * @param h2_session: http2 session to submit frame to
2576  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2577  * @return 0 on error, 1 otherwise
2578  */
http2_submit_rst_stream(struct http2_session * h2_session,struct http2_stream * h2_stream)2579 static int http2_submit_rst_stream(struct http2_session* h2_session,
2580 		struct http2_stream* h2_stream)
2581 {
2582 	int ret = nghttp2_submit_rst_stream(h2_session->session,
2583 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2584 		NGHTTP2_INTERNAL_ERROR);
2585 	if(ret) {
2586 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2587 			"error: %s", nghttp2_strerror(ret));
2588 		return 0;
2589 	}
2590 	return 1;
2591 }
2592 
2593 /**
2594  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2595  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2596  * might be used before this will be sent out.
2597  * @param h2_session: http2 session, containing c->buffer which contains answer
2598  * @return 0 on error, 1 otherwise
2599  */
http2_submit_dns_response(struct http2_session * h2_session)2600 int http2_submit_dns_response(struct http2_session* h2_session)
2601 {
2602 	int ret;
2603 	nghttp2_data_provider data_prd;
2604 	char status[4];
2605 	nghttp2_nv headers[3];
2606 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2607 	size_t rlen;
2608 	char rlen_str[32];
2609 
2610 	if(h2_stream->rbuffer) {
2611 		log_err("http2 submit response error: rbuffer already "
2612 			"exists");
2613 		return 0;
2614 	}
2615 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2616 		log_err("http2 submit response error: c->buffer not complete");
2617 		return 0;
2618 	}
2619 
2620 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2621 		verbose(VERB_QUERY, "http2: submit response error: "
2622 			"invalid status");
2623 		return 0;
2624 	}
2625 
2626 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2627 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2628 
2629 	lock_basic_lock(&http2_response_buffer_count_lock);
2630 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2631 		lock_basic_unlock(&http2_response_buffer_count_lock);
2632 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2633 			"in https-response-buffer-size");
2634 		return http2_submit_rst_stream(h2_session, h2_stream);
2635 	}
2636 	http2_response_buffer_count += rlen;
2637 	lock_basic_unlock(&http2_response_buffer_count_lock);
2638 
2639 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2640 		lock_basic_lock(&http2_response_buffer_count_lock);
2641 		http2_response_buffer_count -= rlen;
2642 		lock_basic_unlock(&http2_response_buffer_count_lock);
2643 		log_err("http2 submit response error: malloc failure");
2644 		return 0;
2645 	}
2646 
2647 	headers[0].name = (uint8_t*)":status";
2648 	headers[0].namelen = 7;
2649 	headers[0].value = (uint8_t*)status;
2650 	headers[0].valuelen = 3;
2651 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2652 
2653 	headers[1].name = (uint8_t*)"content-type";
2654 	headers[1].namelen = 12;
2655 	headers[1].value = (uint8_t*)"application/dns-message";
2656 	headers[1].valuelen = 23;
2657 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2658 
2659 	headers[2].name = (uint8_t*)"content-length";
2660 	headers[2].namelen = 14;
2661 	headers[2].value = (uint8_t*)rlen_str;
2662 	headers[2].valuelen = strlen(rlen_str);
2663 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2664 
2665 	sldns_buffer_write(h2_stream->rbuffer,
2666 		sldns_buffer_current(h2_session->c->buffer),
2667 		sldns_buffer_remaining(h2_session->c->buffer));
2668 	sldns_buffer_flip(h2_stream->rbuffer);
2669 
2670 	data_prd.source.ptr = h2_session;
2671 	data_prd.read_callback = http2_submit_response_read_callback;
2672 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2673 		headers, 3, &data_prd);
2674 	if(ret) {
2675 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2676 			"error: %s", nghttp2_strerror(ret));
2677 		return 0;
2678 	}
2679 	return 1;
2680 }
2681 #else
http2_submit_dns_response(void * ATTR_UNUSED (v))2682 int http2_submit_dns_response(void* ATTR_UNUSED(v))
2683 {
2684 	return 0;
2685 }
2686 #endif
2687 
2688 #ifdef HAVE_NGHTTP2
2689 /** HTTP status to descriptive string */
http_status_to_str(enum http_status s)2690 static char* http_status_to_str(enum http_status s)
2691 {
2692 	switch(s) {
2693 		case HTTP_STATUS_OK:
2694 			return "OK";
2695 		case HTTP_STATUS_BAD_REQUEST:
2696 			return "Bad Request";
2697 		case HTTP_STATUS_NOT_FOUND:
2698 			return "Not Found";
2699 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2700 			return "Payload Too Large";
2701 		case HTTP_STATUS_URI_TOO_LONG:
2702 			return "URI Too Long";
2703 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2704 			return "Unsupported Media Type";
2705 		case HTTP_STATUS_NOT_IMPLEMENTED:
2706 			return "Not Implemented";
2707 	}
2708 	return "Status Unknown";
2709 }
2710 
2711 /** nghttp2 callback. Used to copy error message to nghttp2 session */
http2_submit_error_read_callback(nghttp2_session * ATTR_UNUSED (session),int32_t stream_id,uint8_t * buf,size_t length,uint32_t * data_flags,nghttp2_data_source * source,void * ATTR_UNUSED (cb_arg))2712 static ssize_t http2_submit_error_read_callback(
2713 	nghttp2_session* ATTR_UNUSED(session),
2714 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2715 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2716 {
2717 	struct http2_stream* h2_stream;
2718 	struct http2_session* h2_session = source->ptr;
2719 	char* msg;
2720 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2721 		h2_session->session, stream_id))) {
2722 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2723 			"stream");
2724 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2725 	}
2726 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2727 	msg = http_status_to_str(h2_stream->status);
2728 	if(length < strlen(msg))
2729 		return 0; /* not worth trying over multiple frames */
2730 	memcpy(buf, msg, strlen(msg));
2731 	return strlen(msg);
2732 
2733 }
2734 
2735 /**
2736  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2737  * sending out. Message body will contain descriptive string for HTTP status.
2738  * @param h2_session: http2 session to submit to
2739  * @param h2_stream: http2 stream containing HTTP status to use for error
2740  * @return 0 on error, 1 otherwise
2741  */
http2_submit_error(struct http2_session * h2_session,struct http2_stream * h2_stream)2742 static int http2_submit_error(struct http2_session* h2_session,
2743 	struct http2_stream* h2_stream)
2744 {
2745 	int ret;
2746 	char status[4];
2747 	nghttp2_data_provider data_prd;
2748 	nghttp2_nv headers[1]; /* will be copied by nghttp */
2749 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2750 		verbose(VERB_QUERY, "http2: submit error failed, "
2751 			"invalid status");
2752 		return 0;
2753 	}
2754 	headers[0].name = (uint8_t*)":status";
2755 	headers[0].namelen = 7;
2756 	headers[0].value = (uint8_t*)status;
2757 	headers[0].valuelen = 3;
2758 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2759 
2760 	data_prd.source.ptr = h2_session;
2761 	data_prd.read_callback = http2_submit_error_read_callback;
2762 
2763 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2764 		headers, 1, &data_prd);
2765 	if(ret) {
2766 		verbose(VERB_QUERY, "http2: submit error failed, "
2767 			"error: %s", nghttp2_strerror(ret));
2768 		return 0;
2769 	}
2770 	return 1;
2771 }
2772 
2773 /**
2774  * Start query handling. Query is stored in the stream, and will be free'd here.
2775  * @param h2_session: http2 session, containing comm point
2776  * @param h2_stream: stream containing buffered query
2777  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2778  * reply available (yet).
2779  */
http2_query_read_done(struct http2_session * h2_session,struct http2_stream * h2_stream)2780 static int http2_query_read_done(struct http2_session* h2_session,
2781 	struct http2_stream* h2_stream)
2782 {
2783 	log_assert(h2_stream->qbuffer);
2784 
2785 	if(h2_session->c->h2_stream) {
2786 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2787 			"buffer already assigned to stream");
2788 		return -1;
2789 	}
2790 
2791     /* the c->buffer might be used by mesh_send_reply and no be cleard
2792 	 * need to be cleared before use */
2793 	sldns_buffer_clear(h2_session->c->buffer);
2794 	if(sldns_buffer_remaining(h2_session->c->buffer) <
2795 		sldns_buffer_remaining(h2_stream->qbuffer)) {
2796 		/* qbuffer will be free'd in frame close cb */
2797 		sldns_buffer_clear(h2_session->c->buffer);
2798 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2799 			"qbuffer in c->buffer");
2800 		return -1;
2801 	}
2802 
2803 	sldns_buffer_write(h2_session->c->buffer,
2804 		sldns_buffer_current(h2_stream->qbuffer),
2805 		sldns_buffer_remaining(h2_stream->qbuffer));
2806 
2807 	lock_basic_lock(&http2_query_buffer_count_lock);
2808 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2809 	lock_basic_unlock(&http2_query_buffer_count_lock);
2810 	sldns_buffer_free(h2_stream->qbuffer);
2811 	h2_stream->qbuffer = NULL;
2812 
2813 	sldns_buffer_flip(h2_session->c->buffer);
2814 	h2_session->c->h2_stream = h2_stream;
2815 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2816 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2817 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2818 		return 1; /* answer in c->buffer */
2819 	}
2820 	sldns_buffer_clear(h2_session->c->buffer);
2821 	h2_session->c->h2_stream = NULL;
2822 	return 0; /* mesh state added, or dropped */
2823 }
2824 
2825 /** nghttp2 callback. Used to check if the received frame indicates the end of a
2826  * stream. Gather collected request data and start query handling. */
http2_req_frame_recv_cb(nghttp2_session * session,const nghttp2_frame * frame,void * cb_arg)2827 static int http2_req_frame_recv_cb(nghttp2_session* session,
2828 	const nghttp2_frame* frame, void* cb_arg)
2829 {
2830 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2831 	struct http2_stream* h2_stream;
2832 	int query_read_done;
2833 
2834 	if((frame->hd.type != NGHTTP2_DATA &&
2835 		frame->hd.type != NGHTTP2_HEADERS) ||
2836 		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
2837 			return 0;
2838 	}
2839 
2840 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2841 		session, frame->hd.stream_id)))
2842 		return 0;
2843 
2844 	if(h2_stream->invalid_endpoint) {
2845 		h2_stream->status = HTTP_STATUS_NOT_FOUND;
2846 		goto submit_http_error;
2847 	}
2848 
2849 	if(h2_stream->invalid_content_type) {
2850 		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
2851 		goto submit_http_error;
2852 	}
2853 
2854 	if(h2_stream->http_method != HTTP_METHOD_GET &&
2855 		h2_stream->http_method != HTTP_METHOD_POST) {
2856 		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
2857 		goto submit_http_error;
2858 	}
2859 
2860 	if(h2_stream->query_too_large) {
2861 		if(h2_stream->http_method == HTTP_METHOD_POST)
2862 			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
2863 		else
2864 			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
2865 		goto submit_http_error;
2866 	}
2867 
2868 	if(!h2_stream->qbuffer) {
2869 		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
2870 		goto submit_http_error;
2871 	}
2872 
2873 	if(h2_stream->status) {
2874 submit_http_error:
2875 		verbose(VERB_QUERY, "http2 request invalid, returning :status="
2876 			"%d", h2_stream->status);
2877 		if(!http2_submit_error(h2_session, h2_stream)) {
2878 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2879 		}
2880 		return 0;
2881 	}
2882 	h2_stream->status = HTTP_STATUS_OK;
2883 
2884 	sldns_buffer_flip(h2_stream->qbuffer);
2885 	h2_session->postpone_drop = 1;
2886 	query_read_done = http2_query_read_done(h2_session, h2_stream);
2887 	if(query_read_done < 0)
2888 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2889 	else if(!query_read_done) {
2890 		if(h2_session->is_drop) {
2891 			/* connection needs to be closed. Return failure to make
2892 			 * sure no other action are taken anymore on comm point.
2893 			 * failure will result in reclaiming (and closing)
2894 			 * of comm point. */
2895 			verbose(VERB_QUERY, "http2 query dropped in worker cb");
2896 			h2_session->postpone_drop = 0;
2897 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2898 		}
2899 		/* nothing to submit right now, query added to mesh. */
2900 		h2_session->postpone_drop = 0;
2901 		return 0;
2902 	}
2903 	if(!http2_submit_dns_response(h2_session)) {
2904 		sldns_buffer_clear(h2_session->c->buffer);
2905 		h2_session->c->h2_stream = NULL;
2906 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2907 	}
2908 	verbose(VERB_QUERY, "http2 query submitted to session");
2909 	sldns_buffer_clear(h2_session->c->buffer);
2910 	h2_session->c->h2_stream = NULL;
2911 	return 0;
2912 }
2913 
2914 /** nghttp2 callback. Used to detect start of new streams. */
http2_req_begin_headers_cb(nghttp2_session * session,const nghttp2_frame * frame,void * cb_arg)2915 static int http2_req_begin_headers_cb(nghttp2_session* session,
2916 	const nghttp2_frame* frame, void* cb_arg)
2917 {
2918 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2919 	struct http2_stream* h2_stream;
2920 	int ret;
2921 	if(frame->hd.type != NGHTTP2_HEADERS ||
2922 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2923 		/* only interested in request headers */
2924 		return 0;
2925 	}
2926 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2927 		log_err("malloc failure while creating http2 stream");
2928 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2929 	}
2930 	http2_session_add_stream(h2_session, h2_stream);
2931 	ret = nghttp2_session_set_stream_user_data(session,
2932 		frame->hd.stream_id, h2_stream);
2933 	if(ret) {
2934 		/* stream does not exist */
2935 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2936 			"error: %s", nghttp2_strerror(ret));
2937 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2938 	}
2939 
2940 	return 0;
2941 }
2942 
2943 /**
2944  * base64url decode, store in qbuffer
2945  * @param h2_session: http2 session
2946  * @param h2_stream: http2 stream
2947  * @param start: start of the base64 string
2948  * @param length: length of the base64 string
2949  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2950  * buffer will be NULL is unparseble.
2951  */
http2_buffer_uri_query(struct http2_session * h2_session,struct http2_stream * h2_stream,const uint8_t * start,size_t length)2952 static int http2_buffer_uri_query(struct http2_session* h2_session,
2953 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2954 {
2955 	size_t expectb64len;
2956 	int b64len;
2957 	if(h2_stream->http_method == HTTP_METHOD_POST)
2958 		return 1;
2959 	if(length == 0)
2960 		return 1;
2961 	if(h2_stream->qbuffer) {
2962 		verbose(VERB_ALGO, "http2_req_header fail, "
2963 			"qbuffer already set");
2964 		return 0;
2965 	}
2966 
2967 	/* calculate size, might be a bit bigger than the real
2968 	 * decoded buffer size */
2969 	expectb64len = sldns_b64_pton_calculate_size(length);
2970 	log_assert(expectb64len > 0);
2971 	if(expectb64len >
2972 		h2_session->c->http2_stream_max_qbuffer_size) {
2973 		h2_stream->query_too_large = 1;
2974 		return 1;
2975 	}
2976 
2977 	lock_basic_lock(&http2_query_buffer_count_lock);
2978 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2979 		lock_basic_unlock(&http2_query_buffer_count_lock);
2980 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2981 			"in http2-query-buffer-size");
2982 		return http2_submit_rst_stream(h2_session, h2_stream);
2983 	}
2984 	http2_query_buffer_count += expectb64len;
2985 	lock_basic_unlock(&http2_query_buffer_count_lock);
2986 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2987 		lock_basic_lock(&http2_query_buffer_count_lock);
2988 		http2_query_buffer_count -= expectb64len;
2989 		lock_basic_unlock(&http2_query_buffer_count_lock);
2990 		log_err("http2_req_header fail, qbuffer "
2991 			"malloc failure");
2992 		return 0;
2993 	}
2994 
2995 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2996 		char buf[65536+4];
2997 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2998 		/* copy to the scratch buffer temporarily to terminate the
2999 		 * string with a zero */
3000 		if(length+1 > sizeof(buf)) {
3001 			/* too long */
3002 			lock_basic_lock(&http2_query_buffer_count_lock);
3003 			http2_query_buffer_count -= expectb64len;
3004 			lock_basic_unlock(&http2_query_buffer_count_lock);
3005 			sldns_buffer_free(h2_stream->qbuffer);
3006 			h2_stream->qbuffer = NULL;
3007 			return 1;
3008 		}
3009 		memmove(buf, start, length);
3010 		buf[length] = 0;
3011 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
3012 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
3013 			lock_basic_lock(&http2_query_buffer_count_lock);
3014 			http2_query_buffer_count -= expectb64len;
3015 			lock_basic_unlock(&http2_query_buffer_count_lock);
3016 			sldns_buffer_free(h2_stream->qbuffer);
3017 			h2_stream->qbuffer = NULL;
3018 			return 1;
3019 		}
3020 	} else {
3021 		if(!(b64len = sldns_b64url_pton(
3022 			(char const *)start, length,
3023 			sldns_buffer_current(h2_stream->qbuffer),
3024 			expectb64len)) || b64len < 0) {
3025 			lock_basic_lock(&http2_query_buffer_count_lock);
3026 			http2_query_buffer_count -= expectb64len;
3027 			lock_basic_unlock(&http2_query_buffer_count_lock);
3028 			sldns_buffer_free(h2_stream->qbuffer);
3029 			h2_stream->qbuffer = NULL;
3030 			/* return without error, method can be an
3031 			 * unknown POST */
3032 			return 1;
3033 		}
3034 	}
3035 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
3036 	return 1;
3037 }
3038 
3039 /** nghttp2 callback. Used to parse headers from HEADER frames. */
http2_req_header_cb(nghttp2_session * session,const nghttp2_frame * frame,const uint8_t * name,size_t namelen,const uint8_t * value,size_t valuelen,uint8_t ATTR_UNUSED (flags),void * cb_arg)3040 static int http2_req_header_cb(nghttp2_session* session,
3041 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
3042 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
3043 	void* cb_arg)
3044 {
3045 	struct http2_stream* h2_stream = NULL;
3046 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3047 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
3048 	 * the HEADER */
3049 	if(frame->hd.type != NGHTTP2_HEADERS ||
3050 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
3051 		/* only interested in request headers */
3052 		return 0;
3053 	}
3054 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
3055 		frame->hd.stream_id)))
3056 		return 0;
3057 
3058 	/* earlier checks already indicate we can stop handling this query */
3059 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
3060 		h2_stream->invalid_content_type ||
3061 		h2_stream->invalid_endpoint)
3062 		return 0;
3063 
3064 
3065 	/* nghttp2 performs some sanity checks in the headers, including:
3066 	 * name and value are guaranteed to be null terminated
3067 	 * name is guaranteed to be lowercase
3068 	 * content-length value is guaranteed to contain digits
3069 	 */
3070 
3071 	if(!h2_stream->http_method && namelen == 7 &&
3072 		memcmp(":method", name, namelen) == 0) {
3073 		/* Case insensitive check on :method value to be on the safe
3074 		 * side. I failed to find text about case sensitivity in specs.
3075 		 */
3076 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
3077 			h2_stream->http_method = HTTP_METHOD_GET;
3078 		else if(valuelen == 4 &&
3079 			strcasecmp("POST", (const char*)value) == 0) {
3080 			h2_stream->http_method = HTTP_METHOD_POST;
3081 			if(h2_stream->qbuffer) {
3082 				/* POST method uses query from DATA frames */
3083 				lock_basic_lock(&http2_query_buffer_count_lock);
3084 				http2_query_buffer_count -=
3085 					sldns_buffer_capacity(h2_stream->qbuffer);
3086 				lock_basic_unlock(&http2_query_buffer_count_lock);
3087 				sldns_buffer_free(h2_stream->qbuffer);
3088 				h2_stream->qbuffer = NULL;
3089 			}
3090 		} else
3091 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
3092 		return 0;
3093 	}
3094 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
3095 		/* :path may contain DNS query, depending on method. Method might
3096 		 * not be known yet here, so check after finishing receiving
3097 		 * stream. */
3098 #define	HTTP_QUERY_PARAM "?dns="
3099 		size_t el = strlen(h2_session->c->http_endpoint);
3100 		size_t qpl = strlen(HTTP_QUERY_PARAM);
3101 
3102 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
3103 			value, el) != 0) {
3104 			h2_stream->invalid_endpoint = 1;
3105 			return 0;
3106 		}
3107 		/* larger than endpoint only allowed if it is for the query
3108 		 * parameter */
3109 		if(valuelen <= el+qpl ||
3110 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
3111 			if(valuelen != el)
3112 				h2_stream->invalid_endpoint = 1;
3113 			return 0;
3114 		}
3115 
3116 		if(!http2_buffer_uri_query(h2_session, h2_stream,
3117 			value+(el+qpl), valuelen-(el+qpl))) {
3118 			return NGHTTP2_ERR_CALLBACK_FAILURE;
3119 		}
3120 		return 0;
3121 	}
3122 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
3123 	 * and not needed when using GET. Don't enforce.
3124 	 * If set only allow lowercase "application/dns-message".
3125 	 *
3126 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
3127 	 * be able to handle "application/dns-message". Since that is the only
3128 	 * content-type supported we can ignore the accept header.
3129 	 */
3130 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
3131 		if(valuelen != 23 || memcmp("application/dns-message", value,
3132 			valuelen) != 0) {
3133 			h2_stream->invalid_content_type = 1;
3134 		}
3135 	}
3136 
3137 	/* Only interested in content-lentg for POST (on not yet known) method.
3138 	 */
3139 	if((!h2_stream->http_method ||
3140 		h2_stream->http_method == HTTP_METHOD_POST) &&
3141 		!h2_stream->content_length && namelen  == 14 &&
3142 		memcmp("content-length", name, namelen) == 0) {
3143 		if(valuelen > 5) {
3144 			h2_stream->query_too_large = 1;
3145 			return 0;
3146 		}
3147 		/* guaranteed to only contain digits and be null terminated */
3148 		h2_stream->content_length = atoi((const char*)value);
3149 		if(h2_stream->content_length >
3150 			h2_session->c->http2_stream_max_qbuffer_size) {
3151 			h2_stream->query_too_large = 1;
3152 			return 0;
3153 		}
3154 	}
3155 	return 0;
3156 }
3157 
3158 /** nghttp2 callback. Used to get data from DATA frames, which can contain
3159  * queries in POST requests. */
http2_req_data_chunk_recv_cb(nghttp2_session * ATTR_UNUSED (session),uint8_t ATTR_UNUSED (flags),int32_t stream_id,const uint8_t * data,size_t len,void * cb_arg)3160 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3161 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3162 	size_t len, void* cb_arg)
3163 {
3164 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3165 	struct http2_stream* h2_stream;
3166 	size_t qlen = 0;
3167 
3168 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3169 		h2_session->session, stream_id))) {
3170 		return 0;
3171 	}
3172 
3173 	if(h2_stream->query_too_large)
3174 		return 0;
3175 
3176 	if(!h2_stream->qbuffer) {
3177 		if(h2_stream->content_length) {
3178 			if(h2_stream->content_length < len)
3179 				/* getting more data in DATA frame than
3180 				 * advertised in content-length header. */
3181 				return NGHTTP2_ERR_CALLBACK_FAILURE;
3182 			qlen = h2_stream->content_length;
3183 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3184 			/* setting this to msg-buffer-size can result in a lot
3185 			 * of memory consumption. Most queries should fit in a
3186 			 * single DATA frame, and most POST queries will
3187 			 * contain content-length which does not impose this
3188 			 * limit. */
3189 			qlen = len;
3190 		}
3191 	}
3192 	if(!h2_stream->qbuffer && qlen) {
3193 		lock_basic_lock(&http2_query_buffer_count_lock);
3194 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3195 			lock_basic_unlock(&http2_query_buffer_count_lock);
3196 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3197 				"in http2-query-buffer-size");
3198 			return http2_submit_rst_stream(h2_session, h2_stream);
3199 		}
3200 		http2_query_buffer_count += qlen;
3201 		lock_basic_unlock(&http2_query_buffer_count_lock);
3202 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3203 			lock_basic_lock(&http2_query_buffer_count_lock);
3204 			http2_query_buffer_count -= qlen;
3205 			lock_basic_unlock(&http2_query_buffer_count_lock);
3206 		}
3207 	}
3208 
3209 	if(!h2_stream->qbuffer ||
3210 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3211 		verbose(VERB_ALGO, "http2 data_chunk_recv failed. Not enough "
3212 			"buffer space for POST query. Can happen on multi "
3213 			"frame requests without content-length header");
3214 		h2_stream->query_too_large = 1;
3215 		return 0;
3216 	}
3217 
3218 	sldns_buffer_write(h2_stream->qbuffer, data, len);
3219 
3220 	return 0;
3221 }
3222 
http2_req_stream_clear(struct http2_stream * h2_stream)3223 void http2_req_stream_clear(struct http2_stream* h2_stream)
3224 {
3225 	if(h2_stream->qbuffer) {
3226 		lock_basic_lock(&http2_query_buffer_count_lock);
3227 		http2_query_buffer_count -=
3228 			sldns_buffer_capacity(h2_stream->qbuffer);
3229 		lock_basic_unlock(&http2_query_buffer_count_lock);
3230 		sldns_buffer_free(h2_stream->qbuffer);
3231 		h2_stream->qbuffer = NULL;
3232 	}
3233 	if(h2_stream->rbuffer) {
3234 		lock_basic_lock(&http2_response_buffer_count_lock);
3235 		http2_response_buffer_count -=
3236 			sldns_buffer_capacity(h2_stream->rbuffer);
3237 		lock_basic_unlock(&http2_response_buffer_count_lock);
3238 		sldns_buffer_free(h2_stream->rbuffer);
3239 		h2_stream->rbuffer = NULL;
3240 	}
3241 }
3242 
http2_req_callbacks_create(void)3243 nghttp2_session_callbacks* http2_req_callbacks_create(void)
3244 {
3245 	nghttp2_session_callbacks *callbacks;
3246 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3247 		log_err("failed to initialize nghttp2 callback");
3248 		return NULL;
3249 	}
3250 	/* reception of header block started, used to create h2_stream */
3251 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3252 		http2_req_begin_headers_cb);
3253 	/* complete frame received, used to get data from stream if frame
3254 	 * has end stream flag, and start processing query */
3255 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3256 		http2_req_frame_recv_cb);
3257 	/* get request info from headers */
3258 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3259 		http2_req_header_cb);
3260 	/* get data from DATA frames, containing POST query */
3261 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3262 		http2_req_data_chunk_recv_cb);
3263 
3264 	/* generic HTTP2 callbacks */
3265 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3266 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3267 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3268 		http2_stream_close_cb);
3269 
3270 	return callbacks;
3271 }
3272 #endif /* HAVE_NGHTTP2 */
3273 
3274 #ifdef HAVE_NGTCP2
3275 struct doq_table*
doq_table_create(struct config_file * cfg,struct ub_randstate * rnd)3276 doq_table_create(struct config_file* cfg, struct ub_randstate* rnd)
3277 {
3278 	struct doq_table* table = calloc(1, sizeof(*table));
3279 	if(!table)
3280 		return NULL;
3281 #ifdef USE_NGTCP2_CRYPTO_OSSL
3282 	/* Initialize the ossl crypto, it is harmless to call twice,
3283 	 * and this is before use of doq connections. */
3284 	if(ngtcp2_crypto_ossl_init() != 0) {
3285 		log_err("ngtcp2_crypto_oss_init failed");
3286 		free(table);
3287 		return NULL;
3288 	}
3289 #elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_INIT)
3290 	if(ngtcp2_crypto_quictls_init() != 0) {
3291 		log_err("ngtcp2_crypto_quictls_init failed");
3292 		free(table);
3293 		return NULL;
3294 	}
3295 #endif
3296 	table->idle_timeout = ((uint64_t)cfg->tcp_idle_timeout)*
3297 		NGTCP2_MILLISECONDS;
3298 	table->sv_scidlen = 16;
3299 	table->static_secret_len = 16;
3300 	table->static_secret = malloc(table->static_secret_len);
3301 	if(!table->static_secret) {
3302 		free(table);
3303 		return NULL;
3304 	}
3305 	doq_fill_rand(rnd, table->static_secret, table->static_secret_len);
3306 	table->conn_tree = rbtree_create(doq_conn_cmp);
3307 	if(!table->conn_tree) {
3308 		free(table->static_secret);
3309 		free(table);
3310 		return NULL;
3311 	}
3312 	table->conid_tree = rbtree_create(doq_conid_cmp);
3313 	if(!table->conid_tree) {
3314 		free(table->static_secret);
3315 		free(table->conn_tree);
3316 		free(table);
3317 		return NULL;
3318 	}
3319 	table->timer_tree = rbtree_create(doq_timer_cmp);
3320 	if(!table->timer_tree) {
3321 		free(table->static_secret);
3322 		free(table->conn_tree);
3323 		free(table->conid_tree);
3324 		free(table);
3325 		return NULL;
3326 	}
3327 	lock_rw_init(&table->lock);
3328 	lock_rw_init(&table->conid_lock);
3329 	lock_basic_init(&table->size_lock);
3330 	lock_protect(&table->lock, &table->static_secret,
3331 		sizeof(table->static_secret));
3332 	lock_protect(&table->lock, &table->static_secret_len,
3333 		sizeof(table->static_secret_len));
3334 	lock_protect(&table->lock, table->static_secret,
3335 		table->static_secret_len);
3336 	lock_protect(&table->lock, &table->sv_scidlen,
3337 		sizeof(table->sv_scidlen));
3338 	lock_protect(&table->lock, &table->idle_timeout,
3339 		sizeof(table->idle_timeout));
3340 	lock_protect(&table->lock, &table->conn_tree, sizeof(table->conn_tree));
3341 	lock_protect(&table->lock, table->conn_tree, sizeof(*table->conn_tree));
3342 	lock_protect(&table->conid_lock, table->conid_tree,
3343 		sizeof(*table->conid_tree));
3344 	lock_protect(&table->lock, table->timer_tree,
3345 		sizeof(*table->timer_tree));
3346 	lock_protect(&table->size_lock, &table->current_size,
3347 		sizeof(table->current_size));
3348 	return table;
3349 }
3350 
3351 /** delete elements from the connection tree */
3352 static void
conn_tree_del(rbnode_type * node,void * arg)3353 conn_tree_del(rbnode_type* node, void* arg)
3354 {
3355 	struct doq_table* table = (struct doq_table*)arg;
3356 	struct doq_conn* conn;
3357 	if(!node)
3358 		return;
3359 	conn = (struct doq_conn*)node->key;
3360 	if(conn->timer.timer_in_list) {
3361 		/* Remove timer from list first, because finding the rbnode
3362 		 * element of the setlist of same timeouts needs tree lookup.
3363 		 * Edit the tree structure after that lookup. */
3364 		doq_timer_list_remove(conn->table, &conn->timer);
3365 	}
3366 	if(conn->timer.timer_in_tree)
3367 		doq_timer_tree_remove(conn->table, &conn->timer);
3368 	doq_table_quic_size_subtract(table, sizeof(*conn)+conn->key.dcidlen);
3369 	doq_conn_delete(conn, table);
3370 }
3371 
3372 /** delete elements from the connection id tree */
3373 static void
conid_tree_del(rbnode_type * node,void * ATTR_UNUSED (arg))3374 conid_tree_del(rbnode_type* node, void* ATTR_UNUSED(arg))
3375 {
3376 	if(!node)
3377 		return;
3378 	doq_conid_delete((struct doq_conid*)node->key);
3379 }
3380 
3381 void
doq_table_delete(struct doq_table * table)3382 doq_table_delete(struct doq_table* table)
3383 {
3384 	if(!table)
3385 		return;
3386 	lock_rw_destroy(&table->lock);
3387 	free(table->static_secret);
3388 	if(table->conn_tree) {
3389 		traverse_postorder(table->conn_tree, conn_tree_del, table);
3390 		free(table->conn_tree);
3391 	}
3392 	lock_rw_destroy(&table->conid_lock);
3393 	if(table->conid_tree) {
3394 		/* The tree should be empty, because the doq_conn_delete calls
3395 		 * above should have also removed their conid elements. */
3396 		traverse_postorder(table->conid_tree, conid_tree_del, NULL);
3397 		free(table->conid_tree);
3398 	}
3399 	lock_basic_destroy(&table->size_lock);
3400 	if(table->timer_tree) {
3401 		/* The tree should be empty, because the conn_tree_del calls
3402 		 * above should also have removed them. Also the doq_timer
3403 		 * is part of the doq_conn struct, so is already freed. */
3404 		free(table->timer_tree);
3405 	}
3406 	table->write_list_first = NULL;
3407 	table->write_list_last = NULL;
3408 	free(table);
3409 }
3410 
3411 struct doq_timer*
doq_timer_find_time(struct doq_table * table,struct timeval * tv)3412 doq_timer_find_time(struct doq_table* table, struct timeval* tv)
3413 {
3414 	struct doq_timer key;
3415 	struct rbnode_type* node;
3416 	memset(&key, 0, sizeof(key));
3417 	key.time.tv_sec = tv->tv_sec;
3418 	key.time.tv_usec = tv->tv_usec;
3419 	node = rbtree_search(table->timer_tree, &key);
3420 	if(node)
3421 		return (struct doq_timer*)node->key;
3422 	return NULL;
3423 }
3424 
3425 void
doq_timer_tree_remove(struct doq_table * table,struct doq_timer * timer)3426 doq_timer_tree_remove(struct doq_table* table, struct doq_timer* timer)
3427 {
3428 	if(!timer->timer_in_tree)
3429 		return;
3430 	rbtree_delete(table->timer_tree, timer);
3431 	timer->timer_in_tree = 0;
3432 	/* This item could have more timers in the same set. */
3433 	if(timer->setlist_first) {
3434 		struct doq_timer* rb_timer = timer->setlist_first;
3435 		/* del first element from setlist */
3436 		if(rb_timer->setlist_next)
3437 			rb_timer->setlist_next->setlist_prev = NULL;
3438 		else
3439 			timer->setlist_last = NULL;
3440 		timer->setlist_first = rb_timer->setlist_next;
3441 		rb_timer->setlist_prev = NULL;
3442 		rb_timer->setlist_next = NULL;
3443 		rb_timer->timer_in_list = 0;
3444 		/* insert it into the tree as new rb element */
3445 		memset(&rb_timer->node, 0, sizeof(rb_timer->node));
3446 		rb_timer->node.key = rb_timer;
3447 		rbtree_insert(table->timer_tree, &rb_timer->node);
3448 		rb_timer->timer_in_tree = 1;
3449 		/* the setlist, if any remainder, moves to the rb element */
3450 		rb_timer->setlist_first = timer->setlist_first;
3451 		rb_timer->setlist_last = timer->setlist_last;
3452 		timer->setlist_first = NULL;
3453 		timer->setlist_last = NULL;
3454 		rb_timer->worker_doq_socket = timer->worker_doq_socket;
3455 	}
3456 	timer->worker_doq_socket = NULL;
3457 }
3458 
3459 void
doq_timer_list_remove(struct doq_table * table,struct doq_timer * timer)3460 doq_timer_list_remove(struct doq_table* table, struct doq_timer* timer)
3461 {
3462 	struct doq_timer* rb_timer;
3463 	if(!timer->timer_in_list)
3464 		return;
3465 	/* The item in the rbtree has the list start and end. */
3466 	rb_timer = doq_timer_find_time(table, &timer->time);
3467 	if(rb_timer) {
3468 		if(timer->setlist_prev)
3469 			timer->setlist_prev->setlist_next = timer->setlist_next;
3470 		else
3471 			rb_timer->setlist_first = timer->setlist_next;
3472 		if(timer->setlist_next)
3473 			timer->setlist_next->setlist_prev = timer->setlist_prev;
3474 		else
3475 			rb_timer->setlist_last = timer->setlist_prev;
3476 		timer->setlist_prev = NULL;
3477 		timer->setlist_next = NULL;
3478 	}
3479 	timer->timer_in_list = 0;
3480 }
3481 
3482 /** doq append timer to setlist */
3483 static void
doq_timer_list_append(struct doq_timer * rb_timer,struct doq_timer * timer)3484 doq_timer_list_append(struct doq_timer* rb_timer, struct doq_timer* timer)
3485 {
3486 	log_assert(timer->timer_in_list == 0);
3487 	timer->timer_in_list = 1;
3488 	timer->setlist_next = NULL;
3489 	timer->setlist_prev = rb_timer->setlist_last;
3490 	if(rb_timer->setlist_last)
3491 		rb_timer->setlist_last->setlist_next = timer;
3492 	else
3493 		rb_timer->setlist_first = timer;
3494 	rb_timer->setlist_last = timer;
3495 }
3496 
3497 void
doq_timer_unset(struct doq_table * table,struct doq_timer * timer)3498 doq_timer_unset(struct doq_table* table, struct doq_timer* timer)
3499 {
3500 	if(timer->timer_in_list) {
3501 		/* Remove timer from list first, because finding the rbnode
3502 		 * element of the setlist of same timeouts needs tree lookup.
3503 		 * Edit the tree structure after that lookup. */
3504 		doq_timer_list_remove(table, timer);
3505 	}
3506 	if(timer->timer_in_tree)
3507 		doq_timer_tree_remove(table, timer);
3508 	timer->worker_doq_socket = NULL;
3509 }
3510 
doq_timer_set(struct doq_table * table,struct doq_timer * timer,struct doq_server_socket * worker_doq_socket,struct timeval * tv)3511 void doq_timer_set(struct doq_table* table, struct doq_timer* timer,
3512 	struct doq_server_socket* worker_doq_socket, struct timeval* tv)
3513 {
3514 	struct doq_timer* rb_timer;
3515 	if(verbosity >= VERB_ALGO && timer->conn) {
3516 		char a[256];
3517 		struct timeval rel;
3518 		addr_to_str((void*)&timer->conn->key.paddr.addr,
3519 			timer->conn->key.paddr.addrlen, a, sizeof(a));
3520 		timeval_subtract(&rel, tv, worker_doq_socket->now_tv);
3521 		verbose(VERB_ALGO, "doq %s timer set %d.%6.6d in %d.%6.6d",
3522 			a, (int)tv->tv_sec, (int)tv->tv_usec,
3523 			(int)rel.tv_sec, (int)rel.tv_usec);
3524 	}
3525 	if(timer->timer_in_tree || timer->timer_in_list) {
3526 		if(timer->time.tv_sec == tv->tv_sec &&
3527 			timer->time.tv_usec == tv->tv_usec)
3528 			return; /* already set on that time */
3529 		doq_timer_unset(table, timer);
3530 	}
3531 	timer->time.tv_sec = tv->tv_sec;
3532 	timer->time.tv_usec = tv->tv_usec;
3533 	rb_timer = doq_timer_find_time(table, tv);
3534 	if(rb_timer) {
3535 		/* There is a timeout already with this value. Timer is
3536 		 * added to the setlist. */
3537 		doq_timer_list_append(rb_timer, timer);
3538 	} else {
3539 		/* There is no timeout with this value. Make timer a new
3540 		 * tree element. */
3541 		memset(&timer->node, 0, sizeof(timer->node));
3542 		timer->node.key = timer;
3543 		rbtree_insert(table->timer_tree, &timer->node);
3544 		timer->timer_in_tree = 1;
3545 		timer->setlist_first = NULL;
3546 		timer->setlist_last = NULL;
3547 		timer->worker_doq_socket = worker_doq_socket;
3548 	}
3549 }
3550 
3551 struct doq_conn*
doq_conn_create(struct comm_point * c,struct doq_pkt_addr * paddr,const uint8_t * dcid,size_t dcidlen,uint32_t version)3552 doq_conn_create(struct comm_point* c, struct doq_pkt_addr* paddr,
3553 	const uint8_t* dcid, size_t dcidlen, uint32_t version)
3554 {
3555 	struct doq_conn* conn = calloc(1, sizeof(*conn));
3556 	if(!conn)
3557 		return NULL;
3558 	conn->node.key = conn;
3559 	conn->doq_socket = c->doq_socket;
3560 	conn->table = c->doq_socket->table;
3561 	memmove(&conn->key.paddr.addr, &paddr->addr, paddr->addrlen);
3562 	conn->key.paddr.addrlen = paddr->addrlen;
3563 	memmove(&conn->key.paddr.localaddr, &paddr->localaddr,
3564 		paddr->localaddrlen);
3565 	conn->key.paddr.localaddrlen = paddr->localaddrlen;
3566 	conn->key.paddr.ifindex = paddr->ifindex;
3567 	conn->key.dcid = memdup((void*)dcid, dcidlen);
3568 	if(!conn->key.dcid) {
3569 		free(conn);
3570 		return NULL;
3571 	}
3572 	conn->key.dcidlen = dcidlen;
3573 	conn->version = version;
3574 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
3575 	ngtcp2_ccerr_default(&conn->ccerr);
3576 #else
3577 	ngtcp2_connection_close_error_default(&conn->last_error);
3578 #endif
3579 	rbtree_init(&conn->stream_tree, &doq_stream_cmp);
3580 	conn->timer.conn = conn;
3581 	lock_basic_init(&conn->lock);
3582 	lock_protect(&conn->lock, &conn->key, sizeof(conn->key));
3583 	lock_protect(&conn->lock, &conn->doq_socket, sizeof(conn->doq_socket));
3584 	lock_protect(&conn->lock, &conn->table, sizeof(conn->table));
3585 	lock_protect(&conn->lock, &conn->is_deleted, sizeof(conn->is_deleted));
3586 	lock_protect(&conn->lock, &conn->version, sizeof(conn->version));
3587 	lock_protect(&conn->lock, &conn->conn, sizeof(conn->conn));
3588 	lock_protect(&conn->lock, &conn->conid_list, sizeof(conn->conid_list));
3589 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
3590 	lock_protect(&conn->lock, &conn->ccerr, sizeof(conn->ccerr));
3591 #else
3592 	lock_protect(&conn->lock, &conn->last_error, sizeof(conn->last_error));
3593 #endif
3594 	lock_protect(&conn->lock, &conn->tls_alert, sizeof(conn->tls_alert));
3595 	lock_protect(&conn->lock, &conn->ssl, sizeof(conn->ssl));
3596 	lock_protect(&conn->lock, &conn->close_pkt, sizeof(conn->close_pkt));
3597 	lock_protect(&conn->lock, &conn->close_pkt_len, sizeof(conn->close_pkt_len));
3598 	lock_protect(&conn->lock, &conn->close_ecn, sizeof(conn->close_ecn));
3599 	lock_protect(&conn->lock, &conn->stream_tree, sizeof(conn->stream_tree));
3600 	lock_protect(&conn->lock, &conn->stream_write_first, sizeof(conn->stream_write_first));
3601 	lock_protect(&conn->lock, &conn->stream_write_last, sizeof(conn->stream_write_last));
3602 	lock_protect(&conn->lock, &conn->write_interest, sizeof(conn->write_interest));
3603 	lock_protect(&conn->lock, &conn->on_write_list, sizeof(conn->on_write_list));
3604 	lock_protect(&conn->lock, &conn->write_prev, sizeof(conn->write_prev));
3605 	lock_protect(&conn->lock, &conn->write_next, sizeof(conn->write_next));
3606 	return conn;
3607 }
3608 
3609 /** delete stream tree node */
3610 static void
stream_tree_del(rbnode_type * node,void * arg)3611 stream_tree_del(rbnode_type* node, void* arg)
3612 {
3613 	struct doq_table* table = (struct doq_table*)arg;
3614 	struct doq_stream* stream;
3615 	if(!node)
3616 		return;
3617 	stream = (struct doq_stream*)node;
3618 	if(stream->in)
3619 		doq_table_quic_size_subtract(table, stream->inlen);
3620 	if(stream->out)
3621 		doq_table_quic_size_subtract(table, stream->outlen);
3622 	doq_table_quic_size_subtract(table, sizeof(*stream));
3623 	doq_stream_delete(stream);
3624 }
3625 
3626 void
doq_conn_delete(struct doq_conn * conn,struct doq_table * table)3627 doq_conn_delete(struct doq_conn* conn, struct doq_table* table)
3628 {
3629 	if(!conn)
3630 		return;
3631 	lock_basic_destroy(&conn->lock);
3632 	lock_rw_wrlock(&conn->table->conid_lock);
3633 	doq_conn_clear_conids(conn);
3634 	lock_rw_unlock(&conn->table->conid_lock);
3635 	/* Remove the app data from ngtcp2 before SSL_free of conn->ssl,
3636 	 * because the ngtcp2 conn is deleted. */
3637 	SSL_set_app_data(conn->ssl, NULL);
3638 	if(conn->stream_tree.count != 0) {
3639 		traverse_postorder(&conn->stream_tree, stream_tree_del, table);
3640 	}
3641 	free(conn->key.dcid);
3642 	SSL_free(conn->ssl);
3643 #ifdef USE_NGTCP2_CRYPTO_OSSL
3644 	ngtcp2_crypto_ossl_ctx_del(conn->ossl_ctx);
3645 #endif
3646 	ngtcp2_conn_del(conn->conn);
3647 	free(conn->close_pkt);
3648 	free(conn);
3649 }
3650 
3651 int
doq_conn_cmp(const void * key1,const void * key2)3652 doq_conn_cmp(const void* key1, const void* key2)
3653 {
3654 	struct doq_conn* c = (struct doq_conn*)key1;
3655 	struct doq_conn* d = (struct doq_conn*)key2;
3656 	int r;
3657 	/* Compared in the order destination address, then
3658 	 * local address, ifindex and then dcid.
3659 	 * So that for a search for findlessorequal for the destination
3660 	 * address will find connections to that address, with different
3661 	 * dcids.
3662 	 * Also a printout in sorted order prints the connections by IP
3663 	 * address of destination, and then a number of them depending on the
3664 	 * dcids. */
3665 	if(c->key.paddr.addrlen != d->key.paddr.addrlen) {
3666 		if(c->key.paddr.addrlen < d->key.paddr.addrlen)
3667 			return -1;
3668 		return 1;
3669 	}
3670 	if((r=memcmp(&c->key.paddr.addr, &d->key.paddr.addr,
3671 		c->key.paddr.addrlen))!=0)
3672 		return r;
3673 	if(c->key.paddr.localaddrlen != d->key.paddr.localaddrlen) {
3674 		if(c->key.paddr.localaddrlen < d->key.paddr.localaddrlen)
3675 			return -1;
3676 		return 1;
3677 	}
3678 	if((r=memcmp(&c->key.paddr.localaddr, &d->key.paddr.localaddr,
3679 		c->key.paddr.localaddrlen))!=0)
3680 		return r;
3681 	if(c->key.paddr.ifindex != d->key.paddr.ifindex) {
3682 		if(c->key.paddr.ifindex < d->key.paddr.ifindex)
3683 			return -1;
3684 		return 1;
3685 	}
3686 	if(c->key.dcidlen != d->key.dcidlen) {
3687 		if(c->key.dcidlen < d->key.dcidlen)
3688 			return -1;
3689 		return 1;
3690 	}
3691 	if((r=memcmp(c->key.dcid, d->key.dcid, c->key.dcidlen))!=0)
3692 		return r;
3693 	return 0;
3694 }
3695 
doq_conid_cmp(const void * key1,const void * key2)3696 int doq_conid_cmp(const void* key1, const void* key2)
3697 {
3698 	struct doq_conid* c = (struct doq_conid*)key1;
3699 	struct doq_conid* d = (struct doq_conid*)key2;
3700 	if(c->cidlen != d->cidlen) {
3701 		if(c->cidlen < d->cidlen)
3702 			return -1;
3703 		return 1;
3704 	}
3705 	return memcmp(c->cid, d->cid, c->cidlen);
3706 }
3707 
doq_timer_cmp(const void * key1,const void * key2)3708 int doq_timer_cmp(const void* key1, const void* key2)
3709 {
3710 	struct doq_timer* e = (struct doq_timer*)key1;
3711 	struct doq_timer* f = (struct doq_timer*)key2;
3712 	if(e->time.tv_sec < f->time.tv_sec)
3713 		return -1;
3714 	if(e->time.tv_sec > f->time.tv_sec)
3715 		return 1;
3716 	if(e->time.tv_usec < f->time.tv_usec)
3717 		return -1;
3718 	if(e->time.tv_usec > f->time.tv_usec)
3719 		return 1;
3720 	return 0;
3721 }
3722 
doq_stream_cmp(const void * key1,const void * key2)3723 int doq_stream_cmp(const void* key1, const void* key2)
3724 {
3725 	struct doq_stream* c = (struct doq_stream*)key1;
3726 	struct doq_stream* d = (struct doq_stream*)key2;
3727 	if(c->stream_id != d->stream_id) {
3728 		if(c->stream_id < d->stream_id)
3729 			return -1;
3730 		return 1;
3731 	}
3732 	return 0;
3733 }
3734 
3735 /** doq store a local address in repinfo */
3736 static void
doq_repinfo_store_localaddr(struct comm_reply * repinfo,struct doq_addr_storage * localaddr,socklen_t localaddrlen)3737 doq_repinfo_store_localaddr(struct comm_reply* repinfo,
3738 	struct doq_addr_storage* localaddr, socklen_t localaddrlen)
3739 {
3740 	/* use the pktinfo that we have for ancillary udp data otherwise,
3741 	 * this saves space for a sockaddr */
3742 	memset(&repinfo->pktinfo, 0, sizeof(repinfo->pktinfo));
3743 	if(addr_is_ip6((void*)localaddr, localaddrlen)) {
3744 #ifdef IPV6_PKTINFO
3745 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
3746 		memmove(&repinfo->pktinfo.v6info.ipi6_addr,
3747 			&sa6->sin6_addr, sizeof(struct in6_addr));
3748 		repinfo->doq_srcport = sa6->sin6_port;
3749 #endif
3750 		repinfo->srctype = 6;
3751 	} else {
3752 #ifdef IP_PKTINFO
3753 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3754 		memmove(&repinfo->pktinfo.v4info.ipi_addr,
3755 			&sa->sin_addr, sizeof(struct in_addr));
3756 		repinfo->doq_srcport = sa->sin_port;
3757 #elif defined(IP_RECVDSTADDR)
3758 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3759 		memmove(&repinfo->pktinfo.v4addr, &sa->sin_addr,
3760 			sizeof(struct in_addr));
3761 		repinfo->doq_srcport = sa->sin_port;
3762 #endif
3763 		repinfo->srctype = 4;
3764 	}
3765 }
3766 
3767 /** doq retrieve localaddr from repinfo */
3768 static void
doq_repinfo_retrieve_localaddr(struct comm_reply * repinfo,struct doq_addr_storage * localaddr,socklen_t * localaddrlen)3769 doq_repinfo_retrieve_localaddr(struct comm_reply* repinfo,
3770 	struct doq_addr_storage* localaddr, socklen_t* localaddrlen)
3771 {
3772 	if(repinfo->srctype == 6) {
3773 #ifdef IPV6_PKTINFO
3774 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
3775 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in6);
3776 		memset(sa6, 0, *localaddrlen);
3777 		sa6->sin6_family = AF_INET6;
3778 		memmove(&sa6->sin6_addr, &repinfo->pktinfo.v6info.ipi6_addr,
3779 			*localaddrlen);
3780 		sa6->sin6_port = repinfo->doq_srcport;
3781 #endif
3782 	} else {
3783 #ifdef IP_PKTINFO
3784 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3785 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
3786 		memset(sa, 0, *localaddrlen);
3787 		sa->sin_family = AF_INET;
3788 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4info.ipi_addr,
3789 			*localaddrlen);
3790 		sa->sin_port = repinfo->doq_srcport;
3791 #elif defined(IP_RECVDSTADDR)
3792 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3793 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
3794 		memset(sa, 0, *localaddrlen);
3795 		sa->sin_family = AF_INET;
3796 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4addr,
3797 			sizeof(struct in_addr));
3798 		sa->sin_port = repinfo->doq_srcport;
3799 #endif
3800 	}
3801 }
3802 
3803 /** doq write a connection key into repinfo, false if it does not fit */
3804 static int
doq_conn_key_store_repinfo(struct doq_conn_key * key,struct comm_reply * repinfo)3805 doq_conn_key_store_repinfo(struct doq_conn_key* key,
3806 	struct comm_reply* repinfo)
3807 {
3808 	repinfo->is_proxied = 0;
3809 	repinfo->doq_ifindex = key->paddr.ifindex;
3810 	repinfo->remote_addrlen = key->paddr.addrlen;
3811 	memmove(&repinfo->remote_addr, &key->paddr.addr,
3812 		repinfo->remote_addrlen);
3813 	repinfo->client_addrlen = key->paddr.addrlen;
3814 	memmove(&repinfo->client_addr, &key->paddr.addr,
3815 		repinfo->client_addrlen);
3816 	doq_repinfo_store_localaddr(repinfo, &key->paddr.localaddr,
3817 		key->paddr.localaddrlen);
3818 	if(key->dcidlen > sizeof(repinfo->doq_dcid))
3819 		return 0;
3820 	repinfo->doq_dcidlen = key->dcidlen;
3821 	memmove(repinfo->doq_dcid, key->dcid, key->dcidlen);
3822 	return 1;
3823 }
3824 
3825 void
doq_conn_key_from_repinfo(struct doq_conn_key * key,struct comm_reply * repinfo)3826 doq_conn_key_from_repinfo(struct doq_conn_key* key, struct comm_reply* repinfo)
3827 {
3828 	key->paddr.ifindex = repinfo->doq_ifindex;
3829 	key->paddr.addrlen = repinfo->remote_addrlen;
3830 	memmove(&key->paddr.addr, &repinfo->remote_addr,
3831 		repinfo->remote_addrlen);
3832 	doq_repinfo_retrieve_localaddr(repinfo, &key->paddr.localaddr,
3833 		&key->paddr.localaddrlen);
3834 	key->dcidlen = repinfo->doq_dcidlen;
3835 	key->dcid = repinfo->doq_dcid;
3836 }
3837 
3838 /** doq add a stream to the connection */
3839 static void
doq_conn_add_stream(struct doq_conn * conn,struct doq_stream * stream)3840 doq_conn_add_stream(struct doq_conn* conn, struct doq_stream* stream)
3841 {
3842 	(void)rbtree_insert(&conn->stream_tree, &stream->node);
3843 }
3844 
3845 /** doq delete a stream from the connection */
3846 static void
doq_conn_del_stream(struct doq_conn * conn,struct doq_stream * stream)3847 doq_conn_del_stream(struct doq_conn* conn, struct doq_stream* stream)
3848 {
3849 	(void)rbtree_delete(&conn->stream_tree, &stream->node);
3850 }
3851 
3852 /** doq create new stream */
3853 static struct doq_stream*
doq_stream_create(int64_t stream_id)3854 doq_stream_create(int64_t stream_id)
3855 {
3856 	struct doq_stream* stream = calloc(1, sizeof(*stream));
3857 	if(!stream)
3858 		return NULL;
3859 	stream->node.key = stream;
3860 	stream->stream_id = stream_id;
3861 	return stream;
3862 }
3863 
doq_stream_delete(struct doq_stream * stream)3864 void doq_stream_delete(struct doq_stream* stream)
3865 {
3866 	if(!stream)
3867 		return;
3868 	free(stream->in);
3869 	free(stream->out);
3870 	free(stream);
3871 }
3872 
3873 struct doq_stream*
doq_stream_find(struct doq_conn * conn,int64_t stream_id)3874 doq_stream_find(struct doq_conn* conn, int64_t stream_id)
3875 {
3876 	rbnode_type* node;
3877 	struct doq_stream key;
3878 	key.node.key = &key;
3879 	key.stream_id = stream_id;
3880 	node = rbtree_search(&conn->stream_tree, &key);
3881 	if(node)
3882 		return (struct doq_stream*)node->key;
3883 	return NULL;
3884 }
3885 
3886 /** doq put stream on the conn write list */
3887 static void
doq_stream_on_write_list(struct doq_conn * conn,struct doq_stream * stream)3888 doq_stream_on_write_list(struct doq_conn* conn, struct doq_stream* stream)
3889 {
3890 	if(stream->on_write_list)
3891 		return;
3892 	stream->write_prev = conn->stream_write_last;
3893 	if(conn->stream_write_last)
3894 		conn->stream_write_last->write_next = stream;
3895 	else
3896 		conn->stream_write_first = stream;
3897 	conn->stream_write_last = stream;
3898 	stream->write_next = NULL;
3899 	stream->on_write_list = 1;
3900 }
3901 
3902 /** doq remove stream from the conn write list */
3903 static void
doq_stream_off_write_list(struct doq_conn * conn,struct doq_stream * stream)3904 doq_stream_off_write_list(struct doq_conn* conn, struct doq_stream* stream)
3905 {
3906 	if(!stream->on_write_list)
3907 		return;
3908 	if(stream->write_next)
3909 		stream->write_next->write_prev = stream->write_prev;
3910 	else conn->stream_write_last = stream->write_prev;
3911 	if(stream->write_prev)
3912 		stream->write_prev->write_next = stream->write_next;
3913 	else conn->stream_write_first = stream->write_next;
3914 	stream->write_prev = NULL;
3915 	stream->write_next = NULL;
3916 	stream->on_write_list = 0;
3917 }
3918 
3919 /** doq stream remove in buffer */
3920 static void
doq_stream_remove_in_buffer(struct doq_stream * stream,struct doq_table * table)3921 doq_stream_remove_in_buffer(struct doq_stream* stream, struct doq_table* table)
3922 {
3923 	if(stream->in) {
3924 		doq_table_quic_size_subtract(table, stream->inlen);
3925 		free(stream->in);
3926 		stream->in = NULL;
3927 		stream->inlen = 0;
3928 	}
3929 }
3930 
3931 /** doq stream remove out buffer */
3932 static void
doq_stream_remove_out_buffer(struct doq_stream * stream,struct doq_table * table)3933 doq_stream_remove_out_buffer(struct doq_stream* stream,
3934 	struct doq_table* table)
3935 {
3936 	if(stream->out) {
3937 		doq_table_quic_size_subtract(table, stream->outlen);
3938 		free(stream->out);
3939 		stream->out = NULL;
3940 		stream->outlen = 0;
3941 	}
3942 }
3943 
3944 int
doq_stream_close(struct doq_conn * conn,struct doq_stream * stream,int send_shutdown)3945 doq_stream_close(struct doq_conn* conn, struct doq_stream* stream,
3946 	int send_shutdown)
3947 {
3948 	int ret;
3949 	if(stream->is_closed)
3950 		return 1;
3951 	stream->is_closed = 1;
3952 	doq_stream_off_write_list(conn, stream);
3953 	if(send_shutdown) {
3954 		verbose(VERB_ALGO, "doq: shutdown stream_id %d with app_error_code %d",
3955 			(int)stream->stream_id, (int)DOQ_APP_ERROR_CODE);
3956 		ret = ngtcp2_conn_shutdown_stream(conn->conn,
3957 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
3958 			0,
3959 #endif
3960 			stream->stream_id, DOQ_APP_ERROR_CODE);
3961 		if(ret != 0) {
3962 			log_err("doq ngtcp2_conn_shutdown_stream %d failed: %s",
3963 				(int)stream->stream_id, ngtcp2_strerror(ret));
3964 			return 0;
3965 		}
3966 		doq_conn_write_enable(conn);
3967 	}
3968 	verbose(VERB_ALGO, "doq: conn extend max streams bidi by 1");
3969 	ngtcp2_conn_extend_max_streams_bidi(conn->conn, 1);
3970 	doq_conn_write_enable(conn);
3971 	doq_stream_remove_in_buffer(stream, conn->doq_socket->table);
3972 	doq_stream_remove_out_buffer(stream, conn->doq_socket->table);
3973 	doq_table_quic_size_subtract(conn->doq_socket->table, sizeof(*stream));
3974 	doq_conn_del_stream(conn, stream);
3975 	doq_stream_delete(stream);
3976 	return 1;
3977 }
3978 
3979 /** doq stream pick up answer data from buffer */
3980 static int
doq_stream_pickup_answer(struct doq_stream * stream,struct sldns_buffer * buf)3981 doq_stream_pickup_answer(struct doq_stream* stream, struct sldns_buffer* buf)
3982 {
3983 	stream->is_answer_available = 1;
3984 	if(stream->out) {
3985 		free(stream->out);
3986 		stream->out = NULL;
3987 		stream->outlen = 0;
3988 	}
3989 	stream->nwrite = 0;
3990 	stream->outlen = sldns_buffer_limit(buf);
3991 	/* For quic the output bytes have to stay allocated and available,
3992 	 * for potential resends, until the remote end has acknowledged them.
3993 	 * This includes the tcplen start uint16_t, in outlen_wire. */
3994 	stream->outlen_wire = htons(stream->outlen);
3995 	stream->out = memdup(sldns_buffer_begin(buf), sldns_buffer_limit(buf));
3996 	if(!stream->out) {
3997 		log_err("doq could not send answer: out of memory");
3998 		return 0;
3999 	}
4000 	return 1;
4001 }
4002 
4003 int
doq_stream_send_reply(struct doq_conn * conn,struct doq_stream * stream,struct sldns_buffer * buf)4004 doq_stream_send_reply(struct doq_conn* conn, struct doq_stream* stream,
4005 	struct sldns_buffer* buf)
4006 {
4007 	if(verbosity >= VERB_ALGO) {
4008 		char* s = sldns_wire2str_pkt(sldns_buffer_begin(buf),
4009 			sldns_buffer_limit(buf));
4010 		verbose(VERB_ALGO, "doq stream %d response\n%s",
4011 			(int)stream->stream_id, (s?s:"null"));
4012 		free(s);
4013 	}
4014 	if(stream->out)
4015 		doq_table_quic_size_subtract(conn->doq_socket->table,
4016 			stream->outlen);
4017 	if(!doq_stream_pickup_answer(stream, buf))
4018 		return 0;
4019 	doq_table_quic_size_add(conn->doq_socket->table, stream->outlen);
4020 	doq_stream_on_write_list(conn, stream);
4021 	doq_conn_write_enable(conn);
4022 	return 1;
4023 }
4024 
4025 /** doq stream data length has completed, allocations can be done. False on
4026  * allocation failure. */
4027 static int
doq_stream_datalen_complete(struct doq_stream * stream,struct doq_table * table)4028 doq_stream_datalen_complete(struct doq_stream* stream, struct doq_table* table)
4029 {
4030 	if(stream->inlen > 1024*1024) {
4031 		log_err("doq stream in length too large %d",
4032 			(int)stream->inlen);
4033 		return 0;
4034 	}
4035 	stream->in = calloc(1, stream->inlen);
4036 	if(!stream->in) {
4037 		log_err("doq could not read stream, calloc failed: "
4038 			"out of memory");
4039 		return 0;
4040 	}
4041 	doq_table_quic_size_add(table, stream->inlen);
4042 	return 1;
4043 }
4044 
4045 /** doq stream data is complete, the input data has been received. */
4046 static int
doq_stream_data_complete(struct doq_conn * conn,struct doq_stream * stream)4047 doq_stream_data_complete(struct doq_conn* conn, struct doq_stream* stream)
4048 {
4049 	struct comm_point* c;
4050 	if(verbosity >= VERB_ALGO) {
4051 		char* s = sldns_wire2str_pkt(stream->in, stream->inlen);
4052 		char a[128];
4053 		addr_to_str((void*)&conn->key.paddr.addr,
4054 			conn->key.paddr.addrlen, a, sizeof(a));
4055 		verbose(VERB_ALGO, "doq %s stream %d incoming query\n%s",
4056 			a, (int)stream->stream_id, (s?s:"null"));
4057 		free(s);
4058 	}
4059 	stream->is_query_complete = 1;
4060 	c = conn->doq_socket->cp;
4061 	if(!stream->in) {
4062 		verbose(VERB_ALGO, "doq_stream_data_complete: no in buffer");
4063 		return 0;
4064 	}
4065 	if(stream->inlen > sldns_buffer_capacity(c->buffer)) {
4066 		verbose(VERB_ALGO, "doq_stream_data_complete: query too long");
4067 		return 0;
4068 	}
4069 	sldns_buffer_clear(c->buffer);
4070 	sldns_buffer_write(c->buffer, stream->in, stream->inlen);
4071 	sldns_buffer_flip(c->buffer);
4072 	c->repinfo.c = c;
4073 	if(!doq_conn_key_store_repinfo(&conn->key, &c->repinfo)) {
4074 		verbose(VERB_ALGO, "doq_stream_data_complete: connection "
4075 			"DCID too long");
4076 		return 0;
4077 	}
4078 	c->repinfo.doq_streamid = stream->stream_id;
4079 	conn->doq_socket->current_conn = conn;
4080 	fptr_ok(fptr_whitelist_comm_point(c->callback));
4081 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo)) {
4082 		conn->doq_socket->current_conn = NULL;
4083 		if(!doq_stream_send_reply(conn, stream, c->buffer)) {
4084 			verbose(VERB_ALGO, "doq: failed to send_reply");
4085 			return 0;
4086 		}
4087 		return 1;
4088 	}
4089 	conn->doq_socket->current_conn = NULL;
4090 	return 1;
4091 }
4092 
4093 /** doq receive data for a stream, more bytes of the incoming data */
4094 static int
doq_stream_recv_data(struct doq_stream * stream,const uint8_t * data,size_t datalen,int * recv_done,struct doq_table * table)4095 doq_stream_recv_data(struct doq_stream* stream, const uint8_t* data,
4096 	size_t datalen, int* recv_done, struct doq_table* table)
4097 {
4098 	int got_data = 0;
4099 	/* read the tcplength uint16_t at the start */
4100 	if(stream->nread < 2) {
4101 		uint16_t tcplen = 0;
4102 		size_t todolen = 2 - stream->nread;
4103 
4104 		if(stream->nread > 0) {
4105 			/* put in the already read byte if there is one */
4106 			tcplen = stream->inlen;
4107 		}
4108 		if(datalen < todolen)
4109 			todolen = datalen;
4110 		memmove(((uint8_t*)&tcplen)+stream->nread, data, todolen);
4111 		stream->nread += todolen;
4112 		data += todolen;
4113 		datalen -= todolen;
4114 		if(stream->nread == 2) {
4115 			/* the initial length value is completed */
4116 			stream->inlen = ntohs(tcplen);
4117 			if(!doq_stream_datalen_complete(stream, table))
4118 				return 0;
4119 		} else {
4120 			/* store for later */
4121 			stream->inlen = tcplen;
4122 			return 1;
4123 		}
4124 	}
4125 	/* if there are more data bytes */
4126 	if(datalen > 0) {
4127 		size_t to_write = datalen;
4128 		if(stream->nread-2 > stream->inlen) {
4129 			verbose(VERB_ALGO, "doq stream buffer too small");
4130 			return 0;
4131 		}
4132 		if(datalen > stream->inlen - (stream->nread-2))
4133 			to_write = stream->inlen - (stream->nread-2);
4134 		if(to_write > 0) {
4135 			if(!stream->in) {
4136 				verbose(VERB_ALGO, "doq: stream has "
4137 					"no buffer");
4138 				return 0;
4139 			}
4140 			memmove(stream->in+(stream->nread-2), data, to_write);
4141 			stream->nread += to_write;
4142 			data += to_write;
4143 			datalen -= to_write;
4144 			got_data = 1;
4145 		}
4146 	}
4147 	/* Are there extra bytes received after the end? If so, log them. */
4148 	if(datalen > 0) {
4149 		if(verbosity >= VERB_ALGO)
4150 			log_hex("doq stream has extra bytes received after end",
4151 				(void*)data, datalen);
4152 	}
4153 	/* Is the input data complete? */
4154 	if(got_data && stream->nread >= stream->inlen+2) {
4155 		if(!stream->in) {
4156 			verbose(VERB_ALGO, "doq: completed stream has "
4157 				"no buffer");
4158 			return 0;
4159 		}
4160 		*recv_done = 1;
4161 	}
4162 	return 1;
4163 }
4164 
4165 /** doq receive FIN for a stream. No more bytes are going to arrive. */
4166 static int
doq_stream_recv_fin(struct doq_conn * conn,struct doq_stream * stream,int recv_done)4167 doq_stream_recv_fin(struct doq_conn* conn, struct doq_stream* stream, int
4168 	recv_done)
4169 {
4170 	if(!stream->is_query_complete && !recv_done) {
4171 		verbose(VERB_ALGO, "doq: stream recv FIN, but is "
4172 			"not complete, have %d of %d bytes",
4173 			((int)stream->nread)-2, (int)stream->inlen);
4174 		if(!doq_stream_close(conn, stream, 1))
4175 			return 0;
4176 	}
4177 	return 1;
4178 }
4179 
/**
 * Fill a buffer with random bytes. Every byte is the low 8 bits of one
 * ub_random call.
 * @param rnd: random state to draw from.
 * @param buf: buffer to fill.
 * @param len: number of bytes to write into buf.
 */
void doq_fill_rand(struct ub_randstate* rnd, uint8_t* buf, size_t len)
{
	size_t pos = 0;
	while(pos < len) {
		buf[pos] = ub_random(rnd)&0xff;
		pos++;
	}
}
4186 
4187 /** generate new connection id, checks for duplicates.
4188  * caller must hold lock on conid tree. */
4189 static int
doq_conn_generate_new_conid(struct doq_conn * conn,uint8_t * data,size_t datalen)4190 doq_conn_generate_new_conid(struct doq_conn* conn, uint8_t* data,
4191 	size_t datalen)
4192 {
4193 	int max_try = 100;
4194 	int i;
4195 	for(i=0; i<max_try; i++) {
4196 		doq_fill_rand(conn->doq_socket->rnd, data, datalen);
4197 		if(!doq_conid_find(conn->table, data, datalen)) {
4198 			/* Found an unused connection id. */
4199 			return 1;
4200 		}
4201 	}
4202 	verbose(VERB_ALGO, "doq_conn_generate_new_conid failed: could not "
4203 		"generate random unused connection id value in %d attempts.",
4204 		max_try);
4205 	return 0;
4206 }
4207 
4208 /** ngtcp2 rand callback function */
4209 static void
doq_rand_cb(uint8_t * dest,size_t destlen,const ngtcp2_rand_ctx * rand_ctx)4210 doq_rand_cb(uint8_t* dest, size_t destlen, const ngtcp2_rand_ctx* rand_ctx)
4211 {
4212 	struct ub_randstate* rnd = (struct ub_randstate*)
4213 		rand_ctx->native_handle;
4214 	doq_fill_rand(rnd, dest, destlen);
4215 }
4216 
4217 /** ngtcp2 get_new_connection_id callback function */
4218 static int
doq_get_new_connection_id_cb(ngtcp2_conn * ATTR_UNUSED (conn),ngtcp2_cid * cid,uint8_t * token,size_t cidlen,void * user_data)4219 doq_get_new_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn), ngtcp2_cid* cid,
4220 	uint8_t* token, size_t cidlen, void* user_data)
4221 {
4222 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4223 	/* Lock the conid tree, so we can check for duplicates while
4224 	 * generating the id, and then insert it, whilst keeping the tree
4225 	 * locked against other modifications, guaranteeing uniqueness. */
4226 	lock_rw_wrlock(&doq_conn->table->conid_lock);
4227 	if(!doq_conn_generate_new_conid(doq_conn, cid->data, cidlen)) {
4228 		lock_rw_unlock(&doq_conn->table->conid_lock);
4229 		return NGTCP2_ERR_CALLBACK_FAILURE;
4230 	}
4231 	cid->datalen = cidlen;
4232 	if(ngtcp2_crypto_generate_stateless_reset_token(token,
4233 		doq_conn->doq_socket->static_secret,
4234 		doq_conn->doq_socket->static_secret_len, cid) != 0) {
4235 		lock_rw_unlock(&doq_conn->table->conid_lock);
4236 		return NGTCP2_ERR_CALLBACK_FAILURE;
4237 	}
4238 	if(!doq_conn_associate_conid(doq_conn, cid->data, cid->datalen)) {
4239 		lock_rw_unlock(&doq_conn->table->conid_lock);
4240 		return NGTCP2_ERR_CALLBACK_FAILURE;
4241 	}
4242 	lock_rw_unlock(&doq_conn->table->conid_lock);
4243 	return 0;
4244 }
4245 
4246 /** ngtcp2 remove_connection_id callback function */
4247 static int
doq_remove_connection_id_cb(ngtcp2_conn * ATTR_UNUSED (conn),const ngtcp2_cid * cid,void * user_data)4248 doq_remove_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn),
4249 	const ngtcp2_cid* cid, void* user_data)
4250 {
4251 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4252 	lock_rw_wrlock(&doq_conn->table->conid_lock);
4253 	doq_conn_dissociate_conid(doq_conn, cid->data, cid->datalen);
4254 	lock_rw_unlock(&doq_conn->table->conid_lock);
4255 	return 0;
4256 }
4257 
4258 /** doq submit a new token */
4259 static int
doq_submit_new_token(struct doq_conn * conn)4260 doq_submit_new_token(struct doq_conn* conn)
4261 {
4262 	uint8_t token[NGTCP2_CRYPTO_MAX_REGULAR_TOKENLEN];
4263 	ngtcp2_ssize tokenlen;
4264 	int ret;
4265 	const ngtcp2_path* path = ngtcp2_conn_get_path(conn->conn);
4266 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
4267 
4268 	tokenlen = ngtcp2_crypto_generate_regular_token(token,
4269 		conn->doq_socket->static_secret,
4270 		conn->doq_socket->static_secret_len, path->remote.addr,
4271 		path->remote.addrlen, ts);
4272 	if(tokenlen < 0) {
4273 		log_err("doq ngtcp2_crypto_generate_regular_token failed");
4274 		return 1;
4275 	}
4276 
4277 	verbose(VERB_ALGO, "doq submit new token");
4278 	ret = ngtcp2_conn_submit_new_token(conn->conn, token, tokenlen);
4279 	if(ret != 0) {
4280 		log_err("doq ngtcp2_conn_submit_new_token failed: %s",
4281 			ngtcp2_strerror(ret));
4282 		return 0;
4283 	}
4284 	return 1;
4285 }
4286 
4287 /** ngtcp2 handshake_completed callback function */
4288 static int
doq_handshake_completed_cb(ngtcp2_conn * ATTR_UNUSED (conn),void * user_data)4289 doq_handshake_completed_cb(ngtcp2_conn* ATTR_UNUSED(conn), void* user_data)
4290 {
4291 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4292 	verbose(VERB_ALGO, "doq handshake_completed callback");
4293 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_data_left is %d",
4294 		(int)ngtcp2_conn_get_max_data_left(doq_conn->conn));
4295 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
4296 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_local_streams_uni is %d",
4297 		(int)ngtcp2_conn_get_max_local_streams_uni(doq_conn->conn));
4298 #endif
4299 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_uni_left is %d",
4300 		(int)ngtcp2_conn_get_streams_uni_left(doq_conn->conn));
4301 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_bidi_left is %d",
4302 		(int)ngtcp2_conn_get_streams_bidi_left(doq_conn->conn));
4303 	verbose(VERB_ALGO, "negotiated cipher name is %s",
4304 		SSL_get_cipher_name(doq_conn->ssl));
4305 	if(verbosity > VERB_ALGO) {
4306 		const unsigned char* alpn = NULL;
4307 		unsigned int alpnlen = 0;
4308 		char alpnstr[128];
4309 		SSL_get0_alpn_selected(doq_conn->ssl, &alpn, &alpnlen);
4310 		if(alpnlen > sizeof(alpnstr)-1)
4311 			alpnlen = sizeof(alpnstr)-1;
4312 		memmove(alpnstr, alpn, alpnlen);
4313 		alpnstr[alpnlen]=0;
4314 		verbose(VERB_ALGO, "negotiated ALPN is '%s'", alpnstr);
4315 	}
4316 
4317 	if(!doq_submit_new_token(doq_conn))
4318 		return -1;
4319 	return 0;
4320 }
4321 
4322 /** ngtcp2 stream_open callback function */
4323 static int
doq_stream_open_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,void * user_data)4324 doq_stream_open_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
4325 	void* user_data)
4326 {
4327 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4328 	struct doq_stream* stream;
4329 	verbose(VERB_ALGO, "doq new stream %x", (int)stream_id);
4330 	if(doq_stream_find(doq_conn, stream_id)) {
4331 		verbose(VERB_ALGO, "doq: stream with this id already exists");
4332 		return 0;
4333 	}
4334 	if(stream_id != 0 && stream_id != 4 && /* allow one stream on a new connection */
4335 		!doq_table_quic_size_available(doq_conn->doq_socket->table,
4336 		doq_conn->doq_socket->cfg, sizeof(*stream)
4337 		+ 100 /* estimated query in */
4338 		+ 512 /* estimated response out */
4339 		)) {
4340 		int rv;
4341 		verbose(VERB_ALGO, "doq: no mem for new stream");
4342 		rv = ngtcp2_conn_shutdown_stream(doq_conn->conn,
4343 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
4344 			0,
4345 #endif
4346 			stream_id, NGTCP2_CONNECTION_REFUSED);
4347 		if(rv != 0) {
4348 			log_err("ngtcp2_conn_shutdown_stream failed: %s",
4349 				ngtcp2_strerror(rv));
4350 			return NGTCP2_ERR_CALLBACK_FAILURE;
4351 		}
4352 		return 0;
4353 	}
4354 	stream = doq_stream_create(stream_id);
4355 	if(!stream) {
4356 		log_err("doq: could not doq_stream_create: out of memory");
4357 		return NGTCP2_ERR_CALLBACK_FAILURE;
4358 	}
4359 	doq_table_quic_size_add(doq_conn->doq_socket->table, sizeof(*stream));
4360 	doq_conn_add_stream(doq_conn, stream);
4361 	return 0;
4362 }
4363 
4364 /** ngtcp2 recv_stream_data callback function */
4365 static int
doq_recv_stream_data_cb(ngtcp2_conn * ATTR_UNUSED (conn),uint32_t flags,int64_t stream_id,uint64_t offset,const uint8_t * data,size_t datalen,void * user_data,void * ATTR_UNUSED (stream_user_data))4366 doq_recv_stream_data_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
4367 	int64_t stream_id, uint64_t offset, const uint8_t* data,
4368 	size_t datalen, void* user_data, void* ATTR_UNUSED(stream_user_data))
4369 {
4370 	int recv_done = 0;
4371 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4372 	struct doq_stream* stream;
4373 	verbose(VERB_ALGO, "doq recv stream data stream id %d offset %d "
4374 		"datalen %d%s%s", (int)stream_id, (int)offset, (int)datalen,
4375 		((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0?" FIN":""),
4376 #ifdef NGTCP2_STREAM_DATA_FLAG_0RTT
4377 		((flags&NGTCP2_STREAM_DATA_FLAG_0RTT)!=0?" 0RTT":"")
4378 #else
4379 		((flags&NGTCP2_STREAM_DATA_FLAG_EARLY)!=0?" EARLY":"")
4380 #endif
4381 		);
4382 	stream = doq_stream_find(doq_conn, stream_id);
4383 	if(!stream) {
4384 		verbose(VERB_ALGO, "doq: received stream data for "
4385 			"unknown stream %d", (int)stream_id);
4386 		return 0;
4387 	}
4388 	if(stream->is_closed) {
4389 		verbose(VERB_ALGO, "doq: stream is closed, ignore recv data");
4390 		return 0;
4391 	}
4392 	if(datalen != 0) {
4393 		if(!doq_stream_recv_data(stream, data, datalen, &recv_done,
4394 			doq_conn->doq_socket->table))
4395 			return NGTCP2_ERR_CALLBACK_FAILURE;
4396 	}
4397 	if((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0) {
4398 		if(!doq_stream_recv_fin(doq_conn, stream, recv_done))
4399 			return NGTCP2_ERR_CALLBACK_FAILURE;
4400 	}
4401 	ngtcp2_conn_extend_max_stream_offset(doq_conn->conn, stream_id,
4402 		datalen);
4403 	ngtcp2_conn_extend_max_offset(doq_conn->conn, datalen);
4404 	if(recv_done) {
4405 		if(!doq_stream_data_complete(doq_conn, stream))
4406 			return NGTCP2_ERR_CALLBACK_FAILURE;
4407 	}
4408 	return 0;
4409 }
4410 
4411 /** ngtcp2 stream_close callback function */
4412 static int
doq_stream_close_cb(ngtcp2_conn * ATTR_UNUSED (conn),uint32_t flags,int64_t stream_id,uint64_t app_error_code,void * user_data,void * ATTR_UNUSED (stream_user_data))4413 doq_stream_close_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
4414 	int64_t stream_id, uint64_t app_error_code, void* user_data,
4415 	void* ATTR_UNUSED(stream_user_data))
4416 {
4417 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4418 	struct doq_stream* stream;
4419 	if((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)
4420 		verbose(VERB_ALGO, "doq stream close for stream id %d %sapp_error_code %d",
4421 		(int)stream_id,
4422 		(((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)?
4423 		"APP_ERROR_CODE_SET ":""),
4424 		(int)app_error_code);
4425 	else
4426 		verbose(VERB_ALGO, "doq stream close for stream id %d",
4427 			(int)stream_id);
4428 
4429 	stream = doq_stream_find(doq_conn, stream_id);
4430 	if(!stream) {
4431 		verbose(VERB_ALGO, "doq: stream close for "
4432 			"unknown stream %d", (int)stream_id);
4433 		return 0;
4434 	}
4435 	if(!doq_stream_close(doq_conn, stream, 0))
4436 		return NGTCP2_ERR_CALLBACK_FAILURE;
4437 	return 0;
4438 }
4439 
4440 /** ngtcp2 stream_reset callback function */
4441 static int
doq_stream_reset_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,uint64_t final_size,uint64_t app_error_code,void * user_data,void * ATTR_UNUSED (stream_user_data))4442 doq_stream_reset_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
4443 	uint64_t final_size, uint64_t app_error_code, void* user_data,
4444 	void* ATTR_UNUSED(stream_user_data))
4445 {
4446 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4447 	struct doq_stream* stream;
4448 	verbose(VERB_ALGO, "doq stream reset for stream id %d final_size %d "
4449 		"app_error_code %d", (int)stream_id, (int)final_size,
4450 		(int)app_error_code);
4451 
4452 	stream = doq_stream_find(doq_conn, stream_id);
4453 	if(!stream) {
4454 		verbose(VERB_ALGO, "doq: stream reset for "
4455 			"unknown stream %d", (int)stream_id);
4456 		return 0;
4457 	}
4458 	if(!doq_stream_close(doq_conn, stream, 0))
4459 		return NGTCP2_ERR_CALLBACK_FAILURE;
4460 	return 0;
4461 }
4462 
4463 /** ngtcp2 acked_stream_data_offset callback function */
4464 static int
doq_acked_stream_data_offset_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,uint64_t offset,uint64_t datalen,void * user_data,void * ATTR_UNUSED (stream_user_data))4465 doq_acked_stream_data_offset_cb(ngtcp2_conn* ATTR_UNUSED(conn),
4466 	int64_t stream_id, uint64_t offset, uint64_t datalen, void* user_data,
4467 	void* ATTR_UNUSED(stream_user_data))
4468 {
4469 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4470 	struct doq_stream* stream;
4471 	verbose(VERB_ALGO, "doq stream acked data for stream id %d offset %d "
4472 		"datalen %d", (int)stream_id, (int)offset, (int)datalen);
4473 
4474 	stream = doq_stream_find(doq_conn, stream_id);
4475 	if(!stream) {
4476 		verbose(VERB_ALGO, "doq: stream acked data for "
4477 			"unknown stream %d", (int)stream_id);
4478 		return 0;
4479 	}
4480 	/* Acked the data from [offset .. offset+datalen). */
4481 	if(stream->is_closed)
4482 		return 0;
4483 	if(offset+datalen >= stream->outlen) {
4484 		doq_stream_remove_in_buffer(stream,
4485 			doq_conn->doq_socket->table);
4486 		doq_stream_remove_out_buffer(stream,
4487 			doq_conn->doq_socket->table);
4488 	}
4489 	return 0;
4490 }
4491 
4492 /** ngtc2p log_printf callback function */
4493 static void
doq_log_printf_cb(void * ATTR_UNUSED (user_data),const char * fmt,...)4494 doq_log_printf_cb(void* ATTR_UNUSED(user_data), const char* fmt, ...)
4495 {
4496 	char buf[1024];
4497 	va_list ap;
4498 	va_start(ap, fmt);
4499 	vsnprintf(buf, sizeof(buf), fmt, ap);
4500 	verbose(VERB_ALGO, "libngtcp2: %s", buf);
4501 	va_end(ap);
4502 }
4503 
4504 #ifdef MAKE_QUIC_METHOD
4505 /** the doq application tx key callback, false on failure */
4506 static int
doq_application_tx_key_cb(struct doq_conn * conn)4507 doq_application_tx_key_cb(struct doq_conn* conn)
4508 {
4509 	verbose(VERB_ALGO, "doq application tx key cb");
4510 	/* The server does not want to open streams to the client,
4511 	 * the client instead initiates by opening bidi streams. */
4512 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_data_left is %d",
4513 		(int)ngtcp2_conn_get_max_data_left(conn->conn));
4514 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
4515 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_local_streams_uni is %d",
4516 		(int)ngtcp2_conn_get_max_local_streams_uni(conn->conn));
4517 #endif
4518 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_uni_left is %d",
4519 		(int)ngtcp2_conn_get_streams_uni_left(conn->conn));
4520 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_bidi_left is %d",
4521 		(int)ngtcp2_conn_get_streams_bidi_left(conn->conn));
4522 	return 1;
4523 }
4524 
4525 /** quic_method set_encryption_secrets function */
4526 static int
doq_set_encryption_secrets(SSL * ssl,OSSL_ENCRYPTION_LEVEL ossl_level,const uint8_t * read_secret,const uint8_t * write_secret,size_t secret_len)4527 doq_set_encryption_secrets(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
4528 	const uint8_t *read_secret, const uint8_t *write_secret,
4529 	size_t secret_len)
4530 {
4531 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4532 #ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
4533 	ngtcp2_encryption_level
4534 #else
4535 	ngtcp2_crypto_level
4536 #endif
4537 		level =
4538 #ifdef USE_NGTCP2_CRYPTO_OSSL
4539 		ngtcp2_crypto_ossl_from_ossl_encryption_level(ossl_level);
4540 #elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL)
4541 		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
4542 #else
4543 		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
4544 #endif
4545 
4546 	if(read_secret) {
4547 		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_rx_key for level %d ossl %d", (int)level, (int)ossl_level);
4548 		if(ngtcp2_crypto_derive_and_install_rx_key(doq_conn->conn,
4549 			NULL, NULL, NULL, level, read_secret, secret_len)
4550 			!= 0) {
4551 			log_err("ngtcp2_crypto_derive_and_install_rx_key "
4552 				"failed");
4553 			return 0;
4554 		}
4555 	}
4556 
4557 	if(write_secret) {
4558 		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_tx_key for level %d ossl %d", (int)level, (int)ossl_level);
4559 		if(ngtcp2_crypto_derive_and_install_tx_key(doq_conn->conn,
4560 			NULL, NULL, NULL, level, write_secret, secret_len)
4561 			!= 0) {
4562 			log_err("ngtcp2_crypto_derive_and_install_tx_key "
4563 				"failed");
4564 			return 0;
4565 		}
4566 		if(level == NGTCP2_CRYPTO_LEVEL_APPLICATION) {
4567 			if(!doq_application_tx_key_cb(doq_conn))
4568 				return 0;
4569 		}
4570 	}
4571 	return 1;
4572 }
4573 
4574 /** quic_method add_handshake_data function */
4575 static int
doq_add_handshake_data(SSL * ssl,OSSL_ENCRYPTION_LEVEL ossl_level,const uint8_t * data,size_t len)4576 doq_add_handshake_data(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
4577 	const uint8_t *data, size_t len)
4578 {
4579 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4580 #ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
4581 	ngtcp2_encryption_level
4582 #else
4583 	ngtcp2_crypto_level
4584 #endif
4585 		level =
4586 #ifdef USE_NGTCP2_CRYPTO_OSSL
4587 		ngtcp2_crypto_ossl_from_ossl_encryption_level(ossl_level);
4588 #elif defined(HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL)
4589 		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
4590 #else
4591 		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
4592 #endif
4593 	int rv;
4594 
4595 	verbose(VERB_ALGO, "doq_add_handshake_data: "
4596 		"ngtcp2_con_submit_crypto_data level %d", (int)level);
4597 	rv = ngtcp2_conn_submit_crypto_data(doq_conn->conn, level, data, len);
4598 	if(rv != 0) {
4599 		log_err("ngtcp2_conn_submit_crypto_data failed: %s",
4600 			ngtcp2_strerror(rv));
4601 		ngtcp2_conn_set_tls_error(doq_conn->conn, rv);
4602 		return 0;
4603 	}
4604 	return 1;
4605 }
4606 
4607 /** quic_method flush_flight function */
4608 static int
doq_flush_flight(SSL * ATTR_UNUSED (ssl))4609 doq_flush_flight(SSL* ATTR_UNUSED(ssl))
4610 {
4611 	return 1;
4612 }
4613 
4614 /** quic_method send_alert function */
4615 static int
doq_send_alert(SSL * ssl,enum ssl_encryption_level_t ATTR_UNUSED (level),uint8_t alert)4616 doq_send_alert(SSL *ssl, enum ssl_encryption_level_t ATTR_UNUSED(level),
4617 	uint8_t alert)
4618 {
4619 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4620 	doq_conn->tls_alert = alert;
4621 	return 1;
4622 }
4623 #endif /* MAKE_QUIC_METHOD */
4624 
4625 /** ALPN select callback for the doq SSL context */
4626 static int
doq_alpn_select_cb(SSL * ATTR_UNUSED (ssl),const unsigned char ** out,unsigned char * outlen,const unsigned char * in,unsigned int inlen,void * ATTR_UNUSED (arg))4627 doq_alpn_select_cb(SSL* ATTR_UNUSED(ssl), const unsigned char** out,
4628 	unsigned char* outlen, const unsigned char* in, unsigned int inlen,
4629 	void* ATTR_UNUSED(arg))
4630 {
4631 	/* select "doq" */
4632 	int ret = SSL_select_next_proto((void*)out, outlen,
4633 		(const unsigned char*)"\x03""doq", 4, in, inlen);
4634 	if(ret == OPENSSL_NPN_NEGOTIATED)
4635 		return SSL_TLSEXT_ERR_OK;
4636 	verbose(VERB_ALGO, "doq alpn_select_cb: ALPN from client does "
4637 		"not have 'doq'");
4638 	return SSL_TLSEXT_ERR_ALERT_FATAL;
4639 }
4640 
/**
 * Create the SSL_CTX for the DNS-over-QUIC server.
 * The context is limited to TLS 1.3, loads the certificate chain from pem
 * and the private key from key, and installs the doq ALPN select callback
 * when SSL_CTX_set_alpn_select_cb is available. If verifypem is nonempty
 * it is loaded as verify location and client CA list and a client
 * certificate is required.
 * @param key: file name of the private key, must be nonempty.
 * @param pem: file name of the certificate chain, must be nonempty.
 * @param verifypem: file name with certificates to verify clients, or
 *	NULL or "" for no client verification.
 * @return the SSL_CTX as a void pointer, or NULL on failure. Also NULL
 *	when compiled without HAVE_NGTCP2.
 */
void* quic_sslctx_create(char* key, char* pem, char* verifypem)
{
#ifdef HAVE_NGTCP2
	char* sid_ctx = "unbound server";
#if defined(MAKE_QUIC_METHOD) && \
	!defined(HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT)
	/* The quic_method needs to remain valid during the SSL_CTX
	 * lifetime. This function has no doq server socket that could own
	 * an allocated copy, so the table has static storage. Every call
	 * stores the same function pointers in it. */
	static SSL_QUIC_METHOD quic_method;
#endif
	SSL_CTX* ctx = SSL_CTX_new(TLS_server_method());
	if(!ctx) {
		log_crypto_err("Could not SSL_CTX_new");
		return NULL;
	}
	if(!key || key[0] == 0) {
		log_err("doq: error, no tls-service-key file specified");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_err("doq: error, no tls-service-pem file specified");
		SSL_CTX_free(ctx);
		return NULL;
	}
	SSL_CTX_set_options(ctx,
		(SSL_OP_ALL & ~SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) |
		SSL_OP_SINGLE_ECDH_USE |
		SSL_OP_CIPHER_SERVER_PREFERENCE |
		SSL_OP_NO_ANTI_REPLAY);
	SSL_CTX_set_mode(ctx, SSL_MODE_RELEASE_BUFFERS);
	/* Both the minimum and the maximum version are TLS 1.3. */
	SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION);
	SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION);
#ifdef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
	SSL_CTX_set_alpn_select_cb(ctx, doq_alpn_select_cb, NULL);
#endif
	SSL_CTX_set_default_verify_paths(ctx);
	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
		log_err("doq: error for cert file: %s", pem);
		log_crypto_err("doq: error in "
			"SSL_CTX_use_certificate_chain_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
		log_err("doq: error for private key file: %s", key);
		log_crypto_err("doq: error in SSL_CTX_use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_err("doq: error for key file: %s", key);
		log_crypto_err("doq: error in SSL_CTX_check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	SSL_CTX_set_session_id_context(ctx, (void*)sid_ctx, strlen(sid_ctx));
	if(verifypem && verifypem[0]) {
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_err("doq: error for verify pem file: %s",
				verifypem);
			log_crypto_err("doq: error in "
				"SSL_CTX_load_verify_locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(
			verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER|
			SSL_VERIFY_CLIENT_ONCE|
			SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL);
	}

	SSL_CTX_set_max_early_data(ctx, 0xffffffff);
#ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
	if(ngtcp2_crypto_quictls_configure_server_context(ctx) != 0) {
		log_err("ngtcp2_crypto_quictls_configure_server_context failed");
		SSL_CTX_free(ctx);
		return NULL;
	}
#elif defined(MAKE_QUIC_METHOD)
	quic_method.set_encryption_secrets = doq_set_encryption_secrets;
	quic_method.add_handshake_data = doq_add_handshake_data;
	quic_method.flush_flight = doq_flush_flight;
	quic_method.send_alert = doq_send_alert;
	SSL_CTX_set_quic_method(ctx, &quic_method);
#endif
	return ctx;
#else /* HAVE_NGTCP2 */
	(void)key; (void)pem; (void)verifypem;
	return NULL;
#endif /* HAVE_NGTCP2 */
}
4741 
4742 /** Get the ngtcp2_conn from ssl userdata of type ngtcp2_conn_ref */
doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref * conn_ref)4743 static ngtcp2_conn* doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref* conn_ref)
4744 {
4745 	struct doq_conn* conn = (struct doq_conn*)conn_ref->user_data;
4746 	return conn->conn;
4747 }
4748 
4749 /** create new SSL session for server connection */
4750 static SSL*
doq_ssl_server_setup(SSL_CTX * ctx,struct doq_conn * conn)4751 doq_ssl_server_setup(SSL_CTX* ctx, struct doq_conn* conn)
4752 {
4753 #ifdef USE_NGTCP2_CRYPTO_OSSL
4754 	int ret;
4755 #endif
4756 	SSL* ssl = SSL_new(ctx);
4757 	if(!ssl) {
4758 		log_crypto_err("doq: SSL_new failed");
4759 		return NULL;
4760 	}
4761 #ifdef USE_NGTCP2_CRYPTO_OSSL
4762 	if((ret=ngtcp2_crypto_ossl_ctx_new(&conn->ossl_ctx, NULL)) != 0) {
4763 		log_err("doq: ngtcp2_crypto_ossl_ctx_new failed: %s",
4764 			ngtcp2_strerror(ret));
4765 		SSL_free(ssl);
4766 		return NULL;
4767 	}
4768 	ngtcp2_crypto_ossl_ctx_set_ssl(conn->ossl_ctx, ssl);
4769 	if(ngtcp2_crypto_ossl_configure_server_session(ssl) != 0) {
4770 		log_err("doq: ngtcp2_crypto_ossl_configure_server_session failed");
4771 		SSL_free(ssl);
4772 		return NULL;
4773 	}
4774 #endif
4775 #if defined(USE_NGTCP2_CRYPTO_OSSL) || defined(HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT)
4776 	conn->conn_ref.get_conn = &doq_conn_ref_get_conn;
4777 	conn->conn_ref.user_data = conn;
4778 	SSL_set_app_data(ssl, &conn->conn_ref);
4779 #else
4780 	SSL_set_app_data(ssl, conn);
4781 #endif
4782 	SSL_set_accept_state(ssl);
4783 #ifdef USE_NGTCP2_CRYPTO_OSSL
4784 	SSL_set_quic_tls_early_data_enabled(ssl, 1);
4785 #else
4786 	SSL_set_quic_early_data_enabled(ssl, 1);
4787 #endif
4788 	return ssl;
4789 }
4790 
4791 int
doq_conn_setup(struct doq_conn * conn,uint8_t * scid,size_t scidlen,uint8_t * ocid,size_t ocidlen,const uint8_t * token,size_t tokenlen)4792 doq_conn_setup(struct doq_conn* conn, uint8_t* scid, size_t scidlen,
4793 	uint8_t* ocid, size_t ocidlen, const uint8_t* token, size_t tokenlen)
4794 {
4795 	int rv;
4796 	struct ngtcp2_cid dcid, sv_scid, scid_cid;
4797 	struct ngtcp2_path path;
4798 	struct ngtcp2_callbacks callbacks;
4799 	struct ngtcp2_settings settings;
4800 	struct ngtcp2_transport_params params;
4801 	memset(&dcid, 0, sizeof(dcid));
4802 	memset(&sv_scid, 0, sizeof(sv_scid));
4803 	memset(&scid_cid, 0, sizeof(scid_cid));
4804 	memset(&path, 0, sizeof(path));
4805 	memset(&callbacks, 0, sizeof(callbacks));
4806 	memset(&settings, 0, sizeof(settings));
4807 	memset(&params, 0, sizeof(params));
4808 
4809 	ngtcp2_cid_init(&scid_cid, scid, scidlen);
4810 	ngtcp2_cid_init(&dcid, conn->key.dcid, conn->key.dcidlen);
4811 
4812 	path.remote.addr = (struct sockaddr*)&conn->key.paddr.addr;
4813 	path.remote.addrlen = conn->key.paddr.addrlen;
4814 	path.local.addr = (struct sockaddr*)&conn->key.paddr.localaddr;
4815 	path.local.addrlen = conn->key.paddr.localaddrlen;
4816 
4817 	callbacks.recv_client_initial = ngtcp2_crypto_recv_client_initial_cb;
4818 	callbacks.recv_crypto_data = ngtcp2_crypto_recv_crypto_data_cb;
4819 	callbacks.encrypt = ngtcp2_crypto_encrypt_cb;
4820 	callbacks.decrypt = ngtcp2_crypto_decrypt_cb;
4821 	callbacks.hp_mask = ngtcp2_crypto_hp_mask;
4822 	callbacks.update_key = ngtcp2_crypto_update_key_cb;
4823 	callbacks.delete_crypto_aead_ctx =
4824 		ngtcp2_crypto_delete_crypto_aead_ctx_cb;
4825 	callbacks.delete_crypto_cipher_ctx =
4826 		ngtcp2_crypto_delete_crypto_cipher_ctx_cb;
4827 	callbacks.get_path_challenge_data =
4828 		ngtcp2_crypto_get_path_challenge_data_cb;
4829 	callbacks.version_negotiation = ngtcp2_crypto_version_negotiation_cb;
4830 	callbacks.rand = doq_rand_cb;
4831 	callbacks.get_new_connection_id = doq_get_new_connection_id_cb;
4832 	callbacks.remove_connection_id = doq_remove_connection_id_cb;
4833 	callbacks.handshake_completed = doq_handshake_completed_cb;
4834 	callbacks.stream_open = doq_stream_open_cb;
4835 	callbacks.stream_close = doq_stream_close_cb;
4836 	callbacks.stream_reset = doq_stream_reset_cb;
4837 	callbacks.acked_stream_data_offset = doq_acked_stream_data_offset_cb;
4838 	callbacks.recv_stream_data = doq_recv_stream_data_cb;
4839 
4840 	ngtcp2_settings_default(&settings);
4841 	if(verbosity >= VERB_ALGO) {
4842 		settings.log_printf = doq_log_printf_cb;
4843 	}
4844 	settings.rand_ctx.native_handle = conn->doq_socket->rnd;
4845 	settings.initial_ts = doq_get_timestamp_nanosec();
4846 	settings.max_stream_window = 6*1024*1024;
4847 	settings.max_window = 6*1024*1024;
4848 #ifdef HAVE_STRUCT_NGTCP2_SETTINGS_TOKENLEN
4849 	settings.token = (void*)token;
4850 	settings.tokenlen = tokenlen;
4851 #else
4852 	settings.token.base = (void*)token;
4853 	settings.token.len = tokenlen;
4854 #endif
4855 
4856 	ngtcp2_transport_params_default(&params);
4857 	params.max_idle_timeout = conn->doq_socket->idle_timeout;
4858 	params.active_connection_id_limit = 7;
4859 	params.initial_max_stream_data_bidi_local = 256*1024;
4860 	params.initial_max_stream_data_bidi_remote = 256*1024;
4861 	params.initial_max_data = 1024*1024;
4862 	/* DoQ uses bidi streams, so we allow 0 uni streams. */
4863 	params.initial_max_streams_uni = 0;
4864 	/* Initial max on number of bidi streams the remote end can open.
4865 	 * That is the number of queries it can make, at first. */
4866 	params.initial_max_streams_bidi = 10;
4867 	if(ocid) {
4868 		ngtcp2_cid_init(&params.original_dcid, ocid, ocidlen);
4869 		ngtcp2_cid_init(&params.retry_scid, conn->key.dcid,
4870 			conn->key.dcidlen);
4871 		params.retry_scid_present = 1;
4872 	} else {
4873 		ngtcp2_cid_init(&params.original_dcid, conn->key.dcid,
4874 			conn->key.dcidlen);
4875 	}
4876 #ifdef HAVE_STRUCT_NGTCP2_TRANSPORT_PARAMS_ORIGINAL_DCID_PRESENT
4877 	params.original_dcid_present = 1;
4878 #endif
4879 	doq_fill_rand(conn->doq_socket->rnd, params.stateless_reset_token,
4880 		sizeof(params.stateless_reset_token));
4881 	sv_scid.datalen = conn->doq_socket->sv_scidlen;
4882 	lock_rw_wrlock(&conn->table->conid_lock);
4883 	if(!doq_conn_generate_new_conid(conn, sv_scid.data, sv_scid.datalen)) {
4884 		lock_rw_unlock(&conn->table->conid_lock);
4885 		return 0;
4886 	}
4887 
4888 	rv = ngtcp2_conn_server_new(&conn->conn, &scid_cid, &sv_scid, &path,
4889 		conn->version, &callbacks, &settings, &params, NULL, conn);
4890 	if(rv != 0) {
4891 		lock_rw_unlock(&conn->table->conid_lock);
4892 		log_err("ngtcp2_conn_server_new failed: %s",
4893 			ngtcp2_strerror(rv));
4894 		return 0;
4895 	}
4896 	if(!doq_conn_setup_conids(conn)) {
4897 		lock_rw_unlock(&conn->table->conid_lock);
4898 		log_err("doq_conn_setup_conids failed: out of memory");
4899 		return 0;
4900 	}
4901 	lock_rw_unlock(&conn->table->conid_lock);
4902 	conn->ssl = doq_ssl_server_setup((SSL_CTX*)conn->doq_socket->ctx,
4903 		conn);
4904 	if(!conn->ssl) {
4905 		log_err("doq_ssl_server_setup failed");
4906 		return 0;
4907 	}
4908 #ifdef USE_NGTCP2_CRYPTO_OSSL
4909 	ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ossl_ctx);
4910 #else
4911 	ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ssl);
4912 #endif
4913 	doq_conn_write_enable(conn);
4914 	return 1;
4915 }
4916 
4917 struct doq_conid*
doq_conid_find(struct doq_table * table,const uint8_t * data,size_t datalen)4918 doq_conid_find(struct doq_table* table, const uint8_t* data, size_t datalen)
4919 {
4920 	struct rbnode_type* node;
4921 	struct doq_conid key;
4922 	key.node.key = &key;
4923 	key.cid = (void*)data;
4924 	key.cidlen = datalen;
4925 	node = rbtree_search(table->conid_tree, &key);
4926 	if(node)
4927 		return (struct doq_conid*)node->key;
4928 	return NULL;
4929 }
4930 
4931 /** insert conid in the conid list */
4932 static void
doq_conid_list_insert(struct doq_conn * conn,struct doq_conid * conid)4933 doq_conid_list_insert(struct doq_conn* conn, struct doq_conid* conid)
4934 {
4935 	conid->prev = NULL;
4936 	conid->next = conn->conid_list;
4937 	if(conn->conid_list)
4938 		conn->conid_list->prev = conid;
4939 	conn->conid_list = conid;
4940 }
4941 
4942 /** remove conid from the conid list */
4943 static void
doq_conid_list_remove(struct doq_conn * conn,struct doq_conid * conid)4944 doq_conid_list_remove(struct doq_conn* conn, struct doq_conid* conid)
4945 {
4946 	if(conid->prev)
4947 		conid->prev->next = conid->next;
4948 	else	conn->conid_list = conid->next;
4949 	if(conid->next)
4950 		conid->next->prev = conid->prev;
4951 }
4952 
4953 /** create a doq_conid */
4954 static struct doq_conid*
doq_conid_create(uint8_t * data,size_t datalen,struct doq_conn_key * key)4955 doq_conid_create(uint8_t* data, size_t datalen, struct doq_conn_key* key)
4956 {
4957 	struct doq_conid* conid;
4958 	conid = calloc(1, sizeof(*conid));
4959 	if(!conid)
4960 		return NULL;
4961 	conid->cid = memdup(data, datalen);
4962 	if(!conid->cid) {
4963 		free(conid);
4964 		return NULL;
4965 	}
4966 	conid->cidlen = datalen;
4967 	conid->node.key = conid;
4968 	conid->key = *key;
4969 	conid->key.dcid = memdup(key->dcid, key->dcidlen);
4970 	if(!conid->key.dcid) {
4971 		free(conid->cid);
4972 		free(conid);
4973 		return NULL;
4974 	}
4975 	return conid;
4976 }
4977 
4978 void
doq_conid_delete(struct doq_conid * conid)4979 doq_conid_delete(struct doq_conid* conid)
4980 {
4981 	if(!conid)
4982 		return;
4983 	free(conid->key.dcid);
4984 	free(conid->cid);
4985 	free(conid);
4986 }
4987 
4988 /** return true if the conid is for the conn. */
4989 static int
conid_is_for_conn(struct doq_conn * conn,struct doq_conid * conid)4990 conid_is_for_conn(struct doq_conn* conn, struct doq_conid* conid)
4991 {
4992 	if(conid->key.dcidlen == conn->key.dcidlen &&
4993 		memcmp(conid->key.dcid, conn->key.dcid, conid->key.dcidlen)==0
4994 		&& conid->key.paddr.addrlen == conn->key.paddr.addrlen &&
4995 		memcmp(&conid->key.paddr.addr, &conn->key.paddr.addr,
4996 			conid->key.paddr.addrlen) == 0 &&
4997 		conid->key.paddr.localaddrlen == conn->key.paddr.localaddrlen &&
4998 		memcmp(&conid->key.paddr.localaddr, &conn->key.paddr.localaddr,
4999 			conid->key.paddr.localaddrlen) == 0 &&
5000 		conid->key.paddr.ifindex == conn->key.paddr.ifindex)
5001 		return 1;
5002 	return 0;
5003 }
5004 
5005 int
doq_conn_associate_conid(struct doq_conn * conn,uint8_t * data,size_t datalen)5006 doq_conn_associate_conid(struct doq_conn* conn, uint8_t* data, size_t datalen)
5007 {
5008 	struct doq_conid* conid;
5009 	conid = doq_conid_find(conn->table, data, datalen);
5010 	if(conid && !conid_is_for_conn(conn, conid)) {
5011 		verbose(VERB_ALGO, "doq connection id already exists for "
5012 			"another doq_conn. Ignoring second connection id.");
5013 		/* Already exists to another conn, ignore it.
5014 		 * This works, in that the conid is listed in the doq_conn
5015 		 * conid_list element, and removed from there. So our conid
5016 		 * tree and list are fine, when created and removed.
5017 		 * The tree now does not have the lookup element pointing
5018 		 * to this connection. */
5019 		return 1;
5020 	}
5021 	if(conid)
5022 		return 1; /* already inserted */
5023 	conid = doq_conid_create(data, datalen, &conn->key);
5024 	if(!conid)
5025 		return 0;
5026 	doq_conid_list_insert(conn, conid);
5027 	(void)rbtree_insert(conn->table->conid_tree, &conid->node);
5028 	return 1;
5029 }
5030 
5031 void
doq_conn_dissociate_conid(struct doq_conn * conn,const uint8_t * data,size_t datalen)5032 doq_conn_dissociate_conid(struct doq_conn* conn, const uint8_t* data,
5033 	size_t datalen)
5034 {
5035 	struct doq_conid* conid;
5036 	conid = doq_conid_find(conn->table, data, datalen);
5037 	if(conid && !conid_is_for_conn(conn, conid))
5038 		return;
5039 	if(conid) {
5040 		(void)rbtree_delete(conn->table->conid_tree,
5041 			conid->node.key);
5042 		doq_conid_list_remove(conn, conid);
5043 		doq_conid_delete(conid);
5044 	}
5045 }
5046 
5047 /** associate the scid array and also the dcid.
5048  * caller must hold the locks on conn and doq_table.conid_lock. */
5049 static int
doq_conn_setup_id_array_and_dcid(struct doq_conn * conn,struct ngtcp2_cid * scids,size_t num_scid)5050 doq_conn_setup_id_array_and_dcid(struct doq_conn* conn,
5051 	struct ngtcp2_cid* scids, size_t num_scid)
5052 {
5053 	size_t i;
5054 	for(i=0; i<num_scid; i++) {
5055 		if(!doq_conn_associate_conid(conn, scids[i].data,
5056 			scids[i].datalen))
5057 			return 0;
5058 	}
5059 	if(!doq_conn_associate_conid(conn, conn->key.dcid, conn->key.dcidlen))
5060 		return 0;
5061 	return 1;
5062 }
5063 
5064 int
doq_conn_setup_conids(struct doq_conn * conn)5065 doq_conn_setup_conids(struct doq_conn* conn)
5066 {
5067 	size_t num_scid =
5068 #ifndef HAVE_NGTCP2_CONN_GET_NUM_SCID
5069 		ngtcp2_conn_get_scid(conn->conn, NULL);
5070 #else
5071 		ngtcp2_conn_get_num_scid(conn->conn);
5072 #endif
5073 	if(num_scid <= 4) {
5074 		struct ngtcp2_cid ids[4];
5075 		/* Usually there are not that many scids when just accepted,
5076 		 * like only 2. */
5077 		ngtcp2_conn_get_scid(conn->conn, ids);
5078 		return doq_conn_setup_id_array_and_dcid(conn, ids, num_scid);
5079 	} else {
5080 		struct ngtcp2_cid *scids = calloc(num_scid,
5081 			sizeof(struct ngtcp2_cid));
5082 		if(!scids)
5083 			return 0;
5084 		ngtcp2_conn_get_scid(conn->conn, scids);
5085 		if(!doq_conn_setup_id_array_and_dcid(conn, scids, num_scid)) {
5086 			free(scids);
5087 			return 0;
5088 		}
5089 		free(scids);
5090 	}
5091 	return 1;
5092 }
5093 
5094 void
doq_conn_clear_conids(struct doq_conn * conn)5095 doq_conn_clear_conids(struct doq_conn* conn)
5096 {
5097 	struct doq_conid* p, *next;
5098 	if(!conn)
5099 		return;
5100 	p = conn->conid_list;
5101 	while(p) {
5102 		next = p->next;
5103 		(void)rbtree_delete(conn->table->conid_tree, p->node.key);
5104 		doq_conid_delete(p);
5105 		p = next;
5106 	}
5107 	conn->conid_list = NULL;
5108 }
5109 
doq_get_timestamp_nanosec(void)5110 ngtcp2_tstamp doq_get_timestamp_nanosec(void)
5111 {
5112 #ifdef CLOCK_REALTIME
5113 	struct timespec tp;
5114 	memset(&tp, 0, sizeof(tp));
5115 	/* Get a nanosecond time, that can be compared with the event base. */
5116 	if(clock_gettime(CLOCK_REALTIME, &tp) == -1) {
5117 		log_err("clock_gettime failed: %s", strerror(errno));
5118 	}
5119 	return ((uint64_t)tp.tv_sec)*((uint64_t)1000000000) +
5120 		((uint64_t)tp.tv_nsec);
5121 #else
5122 	struct timeval tv;
5123 	if(gettimeofday(&tv, NULL) < 0) {
5124 		log_err("gettimeofday failed: %s", strerror(errno));
5125 	}
5126 	return ((uint64_t)tv.tv_sec)*((uint64_t)1000000000) +
5127 		((uint64_t)tv.tv_usec)*((uint64_t)1000);
5128 #endif /* CLOCK_REALTIME */
5129 }
5130 
5131 /** doq start the closing period for the connection. */
5132 static int
doq_conn_start_closing_period(struct comm_point * c,struct doq_conn * conn)5133 doq_conn_start_closing_period(struct comm_point* c, struct doq_conn* conn)
5134 {
5135 	struct ngtcp2_path_storage ps;
5136 	struct ngtcp2_pkt_info pi;
5137 	ngtcp2_ssize ret;
5138 	if(!conn)
5139 		return 1;
5140 	if(
5141 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
5142 		ngtcp2_conn_in_closing_period(conn->conn)
5143 #else
5144 		ngtcp2_conn_is_in_closing_period(conn->conn)
5145 #endif
5146 		)
5147 		return 1;
5148 	if(
5149 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
5150 		ngtcp2_conn_in_draining_period(conn->conn)
5151 #else
5152 		ngtcp2_conn_is_in_draining_period(conn->conn)
5153 #endif
5154 		) {
5155 		doq_conn_write_disable(conn);
5156 		return 1;
5157 	}
5158 	ngtcp2_path_storage_zero(&ps);
5159 	sldns_buffer_clear(c->doq_socket->pkt_buf);
5160 	/* the call to ngtcp2_conn_write_connection_close causes the
5161 	 * conn to be closed. It is now in the closing period. */
5162 	ret = ngtcp2_conn_write_connection_close(conn->conn, &ps.path,
5163 		&pi, sldns_buffer_begin(c->doq_socket->pkt_buf),
5164 		sldns_buffer_remaining(c->doq_socket->pkt_buf),
5165 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5166 		&conn->ccerr
5167 #else
5168 		&conn->last_error
5169 #endif
5170 		, doq_get_timestamp_nanosec());
5171 	if(ret < 0) {
5172 		log_err("doq ngtcp2_conn_write_connection_close failed: %s",
5173 			ngtcp2_strerror(ret));
5174 		return 0;
5175 	}
5176 	if(ret == 0) {
5177 		return 0;
5178 	}
5179 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
5180 	sldns_buffer_flip(c->doq_socket->pkt_buf);
5181 
5182 	/* The close packet is allocated, because it may have to be repeated.
5183 	 * When incoming packets have this connection dcid. */
5184 	conn->close_pkt = memdup(sldns_buffer_begin(c->doq_socket->pkt_buf),
5185 		sldns_buffer_limit(c->doq_socket->pkt_buf));
5186 	if(!conn->close_pkt) {
5187 		log_err("doq: could not allocate close packet: out of memory");
5188 		return 0;
5189 	}
5190 	conn->close_pkt_len = sldns_buffer_limit(c->doq_socket->pkt_buf);
5191 	conn->close_ecn = pi.ecn;
5192 	return 1;
5193 }
5194 
5195 /** doq send the close packet for the connection, perhaps again. */
5196 int
doq_conn_send_close(struct comm_point * c,struct doq_conn * conn)5197 doq_conn_send_close(struct comm_point* c, struct doq_conn* conn)
5198 {
5199 	if(!conn)
5200 		return 0;
5201 	if(!conn->close_pkt)
5202 		return 0;
5203 	if(conn->close_pkt_len > sldns_buffer_capacity(c->doq_socket->pkt_buf))
5204 		return 0;
5205 	sldns_buffer_clear(c->doq_socket->pkt_buf);
5206 	sldns_buffer_write(c->doq_socket->pkt_buf, conn->close_pkt, conn->close_pkt_len);
5207 	sldns_buffer_flip(c->doq_socket->pkt_buf);
5208 	verbose(VERB_ALGO, "doq send connection close");
5209 	doq_send_pkt(c, &conn->key.paddr, conn->close_ecn);
5210 	doq_conn_write_disable(conn);
5211 	return 1;
5212 }
5213 
5214 /** doq close the connection on error. If it returns a failure, it
5215  * does not wait to send a close, and the connection can be dropped. */
5216 static int
doq_conn_close_error(struct comm_point * c,struct doq_conn * conn)5217 doq_conn_close_error(struct comm_point* c, struct doq_conn* conn)
5218 {
5219 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5220 	if(conn->ccerr.type == NGTCP2_CCERR_TYPE_IDLE_CLOSE)
5221 		return 0;
5222 #else
5223 	if(conn->last_error.type ==
5224 		NGTCP2_CONNECTION_CLOSE_ERROR_CODE_TYPE_TRANSPORT_IDLE_CLOSE)
5225 		return 0;
5226 #endif
5227 	if(!doq_conn_start_closing_period(c, conn))
5228 		return 0;
5229 	if(
5230 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
5231 		ngtcp2_conn_in_draining_period(conn->conn)
5232 #else
5233 		ngtcp2_conn_is_in_draining_period(conn->conn)
5234 #endif
5235 		) {
5236 		doq_conn_write_disable(conn);
5237 		return 1;
5238 	}
5239 	doq_conn_write_enable(conn);
5240 	if(!doq_conn_send_close(c, conn))
5241 		return 0;
5242 	return 1;
5243 }
5244 
5245 int
doq_conn_recv(struct comm_point * c,struct doq_pkt_addr * paddr,struct doq_conn * conn,struct ngtcp2_pkt_info * pi,int * err_retry,int * err_drop)5246 doq_conn_recv(struct comm_point* c, struct doq_pkt_addr* paddr,
5247 	struct doq_conn* conn, struct ngtcp2_pkt_info* pi, int* err_retry,
5248 	int* err_drop)
5249 {
5250 	int ret;
5251 	ngtcp2_tstamp ts;
5252 	struct ngtcp2_path path;
5253 	memset(&path, 0, sizeof(path));
5254 	path.remote.addr = (struct sockaddr*)&paddr->addr;
5255 	path.remote.addrlen = paddr->addrlen;
5256 	path.local.addr = (struct sockaddr*)&paddr->localaddr;
5257 	path.local.addrlen = paddr->localaddrlen;
5258 	ts = doq_get_timestamp_nanosec();
5259 
5260 	ret = ngtcp2_conn_read_pkt(conn->conn, &path, pi,
5261 		sldns_buffer_begin(c->doq_socket->pkt_buf),
5262 		sldns_buffer_limit(c->doq_socket->pkt_buf), ts);
5263 	if(ret != 0) {
5264 		if(err_retry)
5265 			*err_retry = 0;
5266 		if(err_drop)
5267 			*err_drop = 0;
5268 		if(ret == NGTCP2_ERR_DRAINING) {
5269 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5270 				ngtcp2_strerror(ret));
5271 			doq_conn_write_disable(conn);
5272 			return 0;
5273 		} else if(ret == NGTCP2_ERR_DROP_CONN) {
5274 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5275 				ngtcp2_strerror(ret));
5276 			if(err_drop)
5277 				*err_drop = 1;
5278 			return 0;
5279 		} else if(ret == NGTCP2_ERR_RETRY) {
5280 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5281 				ngtcp2_strerror(ret));
5282 			if(err_retry)
5283 				*err_retry = 1;
5284 			if(err_drop)
5285 				*err_drop = 1;
5286 			return 0;
5287 		} else if(ret == NGTCP2_ERR_CRYPTO) {
5288 			if(
5289 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5290 				!conn->ccerr.error_code
5291 #else
5292 				!conn->last_error.error_code
5293 #endif
5294 				) {
5295 				/* in picotls the tls alert may need to be
5296 				 * copied, but this is with openssl. And there
5297 				 * is conn->tls_alert. */
5298 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5299 				ngtcp2_ccerr_set_tls_alert(&conn->ccerr,
5300 					conn->tls_alert, NULL, 0);
5301 #else
5302 				ngtcp2_connection_close_error_set_transport_error_tls_alert(
5303 					&conn->last_error, conn->tls_alert,
5304 					NULL, 0);
5305 #endif
5306 			}
5307 		} else {
5308 			if(
5309 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5310 				!conn->ccerr.error_code
5311 #else
5312 				!conn->last_error.error_code
5313 #endif
5314 				) {
5315 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5316 				ngtcp2_ccerr_set_liberr(&conn->ccerr, ret,
5317 					NULL, 0);
5318 #else
5319 				ngtcp2_connection_close_error_set_transport_error_liberr(
5320 					&conn->last_error, ret, NULL, 0);
5321 #endif
5322 			}
5323 		}
5324 		log_err("ngtcp2_conn_read_pkt failed: %s",
5325 			ngtcp2_strerror(ret));
5326 		if(!doq_conn_close_error(c, conn)) {
5327 			if(err_drop)
5328 				*err_drop = 1;
5329 		}
5330 		return 0;
5331 	}
5332 	doq_conn_write_enable(conn);
5333 	return 1;
5334 }
5335 
/** doq the stream has been written completely. The stream is taken off
 * the write list of the connection. */
static void
doq_stream_write_is_done(struct doq_conn* conn, struct doq_stream* stream)
{
	/* The output buffer is kept, because it may be needed when the
	 * data has to be sent again. Only the list membership changes. */
	doq_stream_off_write_list(conn, stream);
}
5343 
5344 int
doq_conn_write_streams(struct comm_point * c,struct doq_conn * conn,int * err_drop)5345 doq_conn_write_streams(struct comm_point* c, struct doq_conn* conn,
5346 	int* err_drop)
5347 {
5348 	struct doq_stream* stream = conn->stream_write_first;
5349 	ngtcp2_path_storage ps;
5350 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
5351 	size_t num_packets = 0, max_packets = 65535;
5352 	ngtcp2_path_storage_zero(&ps);
5353 
5354 	for(;;) {
5355 		int64_t stream_id;
5356 		uint32_t flags = 0;
5357 		ngtcp2_pkt_info pi;
5358 		ngtcp2_vec datav[2];
5359 		size_t datav_count = 0;
5360 		ngtcp2_ssize ret, ndatalen = 0;
5361 		int fin;
5362 
5363 		if(stream) {
5364 			/* data to send */
5365 			verbose(VERB_ALGO, "doq: doq_conn write stream %d",
5366 				(int)stream->stream_id);
5367 			stream_id = stream->stream_id;
5368 			fin = 1;
5369 			if(stream->nwrite < 2) {
5370 				datav[0].base = ((uint8_t*)&stream->
5371 					outlen_wire) + stream->nwrite;
5372 				datav[0].len = 2 - stream->nwrite;
5373 				datav[1].base = stream->out;
5374 				datav[1].len = stream->outlen;
5375 				datav_count = 2;
5376 			} else {
5377 				datav[0].base = stream->out +
5378 					(stream->nwrite-2);
5379 				datav[0].len = stream->outlen -
5380 					(stream->nwrite-2);
5381 				datav_count = 1;
5382 			}
5383 		} else {
5384 			/* no data to send */
5385 			verbose(VERB_ALGO, "doq: doq_conn write stream -1");
5386 			stream_id = -1;
5387 			fin = 0;
5388 			datav[0].base = NULL;
5389 			datav[0].len = 0;
5390 			datav_count = 1;
5391 		}
5392 
5393 		/* if more streams, set it to write more */
5394 		if(stream && stream->write_next)
5395 			flags |= NGTCP2_WRITE_STREAM_FLAG_MORE;
5396 		if(fin)
5397 			flags |= NGTCP2_WRITE_STREAM_FLAG_FIN;
5398 
5399 		sldns_buffer_clear(c->doq_socket->pkt_buf);
5400 		ret = ngtcp2_conn_writev_stream(conn->conn, &ps.path, &pi,
5401 			sldns_buffer_begin(c->doq_socket->pkt_buf),
5402 			sldns_buffer_remaining(c->doq_socket->pkt_buf),
5403 			&ndatalen, flags, stream_id, datav, datav_count, ts);
5404 		if(ret < 0) {
5405 			if(ret == NGTCP2_ERR_WRITE_MORE) {
5406 				verbose(VERB_ALGO, "doq: write more, ndatalen %d", (int)ndatalen);
5407 				if(stream) {
5408 					if(ndatalen >= 0)
5409 						stream->nwrite += ndatalen;
5410 					if(stream->nwrite >= stream->outlen+2)
5411 						doq_stream_write_is_done(
5412 							conn, stream);
5413 					stream = stream->write_next;
5414 				}
5415 				continue;
5416 			} else if(ret == NGTCP2_ERR_STREAM_DATA_BLOCKED) {
5417 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_DATA_BLOCKED");
5418 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5419 				ngtcp2_ccerr_set_application_error(
5420 					&conn->ccerr, -1, NULL, 0);
5421 #else
5422 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
5423 #endif
5424 				if(err_drop)
5425 					*err_drop = 0;
5426 				if(!doq_conn_close_error(c, conn)) {
5427 					if(err_drop)
5428 						*err_drop = 1;
5429 				}
5430 				return 0;
5431 			} else if(ret == NGTCP2_ERR_STREAM_SHUT_WR) {
5432 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_SHUT_WR");
5433 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5434 				ngtcp2_ccerr_set_application_error(
5435 					&conn->ccerr, -1, NULL, 0);
5436 #else
5437 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
5438 #endif
5439 				if(err_drop)
5440 					*err_drop = 0;
5441 				if(!doq_conn_close_error(c, conn)) {
5442 					if(err_drop)
5443 						*err_drop = 1;
5444 				}
5445 				return 0;
5446 			}
5447 
5448 			log_err("doq: ngtcp2_conn_writev_stream failed: %s",
5449 				ngtcp2_strerror(ret));
5450 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5451 			ngtcp2_ccerr_set_liberr(&conn->ccerr, ret, NULL, 0);
5452 #else
5453 			ngtcp2_connection_close_error_set_transport_error_liberr(
5454 				&conn->last_error, ret, NULL, 0);
5455 #endif
5456 			if(err_drop)
5457 				*err_drop = 0;
5458 			if(!doq_conn_close_error(c, conn)) {
5459 				if(err_drop)
5460 					*err_drop = 1;
5461 			}
5462 			return 0;
5463 		}
5464 		verbose(VERB_ALGO, "doq: writev_stream pkt size %d ndatawritten %d",
5465 			(int)ret, (int)ndatalen);
5466 
5467 		if(ndatalen >= 0 && stream) {
5468 			stream->nwrite += ndatalen;
5469 			if(stream->nwrite >= stream->outlen+2)
5470 				doq_stream_write_is_done(conn, stream);
5471 		}
5472 		if(ret == 0) {
5473 			/* congestion limited */
5474 			doq_conn_write_disable(conn);
5475 			ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
5476 			return 1;
5477 		}
5478 		sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
5479 		sldns_buffer_flip(c->doq_socket->pkt_buf);
5480 		doq_send_pkt(c, &conn->key.paddr, pi.ecn);
5481 
5482 		if(c->doq_socket->have_blocked_pkt)
5483 			break;
5484 		if(++num_packets == max_packets)
5485 			break;
5486 		if(stream)
5487 			stream = stream->write_next;
5488 	}
5489 	ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
5490 	return 1;
5491 }
5492 
5493 void
doq_conn_write_enable(struct doq_conn * conn)5494 doq_conn_write_enable(struct doq_conn* conn)
5495 {
5496 	conn->write_interest = 1;
5497 }
5498 
5499 void
doq_conn_write_disable(struct doq_conn * conn)5500 doq_conn_write_disable(struct doq_conn* conn)
5501 {
5502 	conn->write_interest = 0;
5503 }
5504 
5505 /** doq append the connection to the write list */
5506 static void
doq_conn_write_list_append(struct doq_table * table,struct doq_conn * conn)5507 doq_conn_write_list_append(struct doq_table* table, struct doq_conn* conn)
5508 {
5509 	if(conn->on_write_list)
5510 		return;
5511 	conn->write_prev = table->write_list_last;
5512 	if(table->write_list_last)
5513 		table->write_list_last->write_next = conn;
5514 	else table->write_list_first = conn;
5515 	conn->write_next = NULL;
5516 	table->write_list_last = conn;
5517 	conn->on_write_list = 1;
5518 }
5519 
5520 void
doq_conn_write_list_remove(struct doq_table * table,struct doq_conn * conn)5521 doq_conn_write_list_remove(struct doq_table* table, struct doq_conn* conn)
5522 {
5523 	if(!conn->on_write_list)
5524 		return;
5525 	if(conn->write_next)
5526 		conn->write_next->write_prev = conn->write_prev;
5527 	else table->write_list_last = conn->write_prev;
5528 	if(conn->write_prev)
5529 		conn->write_prev->write_next = conn->write_next;
5530 	else table->write_list_first = conn->write_next;
5531 	conn->write_prev = NULL;
5532 	conn->write_next = NULL;
5533 	conn->on_write_list = 0;
5534 }
5535 
5536 void
doq_conn_set_write_list(struct doq_table * table,struct doq_conn * conn)5537 doq_conn_set_write_list(struct doq_table* table, struct doq_conn* conn)
5538 {
5539 	if(conn->write_interest && conn->on_write_list)
5540 		return;
5541 	if(!conn->write_interest && !conn->on_write_list)
5542 		return;
5543 	if(conn->write_interest)
5544 		doq_conn_write_list_append(table, conn);
5545 	else doq_conn_write_list_remove(table, conn);
5546 }
5547 
5548 struct doq_conn*
doq_table_pop_first(struct doq_table * table)5549 doq_table_pop_first(struct doq_table* table)
5550 {
5551 	struct doq_conn* conn = table->write_list_first;
5552 	if(!conn)
5553 		return NULL;
5554 	lock_basic_lock(&conn->lock);
5555 	table->write_list_first = conn->write_next;
5556 	if(conn->write_next)
5557 		conn->write_next->write_prev = NULL;
5558 	else table->write_list_last = NULL;
5559 	conn->write_next = NULL;
5560 	conn->write_prev = NULL;
5561 	conn->on_write_list = 0;
5562 	return conn;
5563 }
5564 
5565 int
doq_conn_check_timer(struct doq_conn * conn,struct timeval * tv)5566 doq_conn_check_timer(struct doq_conn* conn, struct timeval* tv)
5567 {
5568 	ngtcp2_tstamp expiry = ngtcp2_conn_get_expiry(conn->conn);
5569 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
5570 	ngtcp2_tstamp t;
5571 
5572 	if(expiry <= now) {
5573 		/* The timer has already expired, add with zero timeout.
5574 		 * This should call the callback straight away. Calling it
5575 		 * from the event callbacks is cleaner than calling it here,
5576 		 * because then it is always called with the same locks and
5577 		 * so on. This routine only has the conn.lock. */
5578 		t = now;
5579 	} else {
5580 		t = expiry;
5581 	}
5582 
5583 	/* convert to timeval */
5584 	memset(tv, 0, sizeof(*tv));
5585 	tv->tv_sec = t / NGTCP2_SECONDS;
5586 	tv->tv_usec = (t / NGTCP2_MICROSECONDS)%1000000;
5587 
5588 	/* If we already have a timer, is it the right value? */
5589 	if(conn->timer.timer_in_tree || conn->timer.timer_in_list) {
5590 		if(conn->timer.time.tv_sec == tv->tv_sec &&
5591 			conn->timer.time.tv_usec == tv->tv_usec)
5592 			return 0;
5593 	}
5594 	return 1;
5595 }
5596 
5597 /* doq print connection log */
5598 static void
doq_conn_log_line(struct doq_conn * conn,char * s)5599 doq_conn_log_line(struct doq_conn* conn, char* s)
5600 {
5601 	char remotestr[256], localstr[256];
5602 	addr_to_str((void*)&conn->key.paddr.addr, conn->key.paddr.addrlen,
5603 		remotestr, sizeof(remotestr));
5604 	addr_to_str((void*)&conn->key.paddr.localaddr,
5605 		conn->key.paddr.localaddrlen, localstr, sizeof(localstr));
5606 	log_info("doq conn %s %s %s", remotestr, localstr, s);
5607 }
5608 
5609 int
doq_conn_handle_timeout(struct doq_conn * conn)5610 doq_conn_handle_timeout(struct doq_conn* conn)
5611 {
5612 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
5613 	int rv;
5614 
5615 	if(verbosity >= VERB_ALGO)
5616 		doq_conn_log_line(conn, "timeout");
5617 
5618 	rv = ngtcp2_conn_handle_expiry(conn->conn, now);
5619 	if(rv != 0) {
5620 		verbose(VERB_ALGO, "ngtcp2_conn_handle_expiry failed: %s",
5621 			ngtcp2_strerror(rv));
5622 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5623 		ngtcp2_ccerr_set_liberr(&conn->ccerr, rv, NULL, 0);
5624 #else
5625 		ngtcp2_connection_close_error_set_transport_error_liberr(
5626 			&conn->last_error, rv, NULL, 0);
5627 #endif
5628 		if(!doq_conn_close_error(conn->doq_socket->cp, conn)) {
5629 			/* failed, return for deletion */
5630 			return 0;
5631 		}
5632 		return 1;
5633 	}
5634 	doq_conn_write_enable(conn);
5635 	if(!doq_conn_write_streams(conn->doq_socket->cp, conn, NULL)) {
5636 		/* failed, return for deletion. */
5637 		return 0;
5638 	}
5639 	return 1;
5640 }
5641 
5642 void
doq_table_quic_size_add(struct doq_table * table,size_t add)5643 doq_table_quic_size_add(struct doq_table* table, size_t add)
5644 {
5645 	lock_basic_lock(&table->size_lock);
5646 	table->current_size += add;
5647 	lock_basic_unlock(&table->size_lock);
5648 }
5649 
5650 void
doq_table_quic_size_subtract(struct doq_table * table,size_t subtract)5651 doq_table_quic_size_subtract(struct doq_table* table, size_t subtract)
5652 {
5653 	lock_basic_lock(&table->size_lock);
5654 	if(table->current_size < subtract)
5655 		table->current_size = 0;
5656 	else	table->current_size -= subtract;
5657 	lock_basic_unlock(&table->size_lock);
5658 }
5659 
5660 int
doq_table_quic_size_available(struct doq_table * table,struct config_file * cfg,size_t mem)5661 doq_table_quic_size_available(struct doq_table* table,
5662 	struct config_file* cfg, size_t mem)
5663 {
5664 	size_t cur;
5665 	lock_basic_lock(&table->size_lock);
5666 	cur = table->current_size;
5667 	lock_basic_unlock(&table->size_lock);
5668 
5669 	if(cur + mem > cfg->quic_size)
5670 		return 0;
5671 	return 1;
5672 }
5673 
doq_table_quic_size_get(struct doq_table * table)5674 size_t doq_table_quic_size_get(struct doq_table* table)
5675 {
5676 	size_t sz;
5677 	if(!table)
5678 		return 0;
5679 	lock_basic_lock(&table->size_lock);
5680 	sz = table->current_size;
5681 	lock_basic_unlock(&table->size_lock);
5682 	return sz;
5683 }
5684 #endif /* HAVE_NGTCP2 */
5685