xref: /freebsd/contrib/unbound/services/listen_dnsport.c (revision 46d2f61818f594174cafe31ee338c6e083fa1876)
1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #include <limits.h>
47 #ifdef USE_TCP_FASTOPEN
48 #include <netinet/tcp.h>
49 #endif
50 #include <ctype.h>
51 #include "services/listen_dnsport.h"
52 #include "services/outside_network.h"
53 #include "util/netevent.h"
54 #include "util/log.h"
55 #include "util/config_file.h"
56 #include "util/net_help.h"
57 #include "sldns/sbuffer.h"
58 #include "sldns/parseutil.h"
59 #include "sldns/wire2str.h"
60 #include "services/mesh.h"
61 #include "util/fptr_wlist.h"
62 #include "util/locks.h"
63 #include "util/timeval_func.h"
64 
65 #ifdef HAVE_NETDB_H
66 #include <netdb.h>
67 #endif
68 #include <fcntl.h>
69 
70 #ifdef HAVE_SYS_UN_H
71 #include <sys/un.h>
72 #endif
73 
74 #ifdef HAVE_SYSTEMD
75 #include <systemd/sd-daemon.h>
76 #endif
77 
78 #ifdef HAVE_IFADDRS_H
79 #include <ifaddrs.h>
80 #endif
81 #ifdef HAVE_NET_IF_H
82 #include <net/if.h>
83 #endif
84 
85 #ifdef HAVE_TIME_H
86 #include <time.h>
87 #endif
88 #include <sys/time.h>
89 
90 #ifdef HAVE_NGTCP2
91 #include <ngtcp2/ngtcp2.h>
92 #include <ngtcp2/ngtcp2_crypto.h>
93 #ifdef HAVE_NGTCP2_NGTCP2_CRYPTO_QUICTLS_H
94 #include <ngtcp2/ngtcp2_crypto_quictls.h>
95 #else
96 #include <ngtcp2/ngtcp2_crypto_openssl.h>
97 #endif
98 #endif
99 
100 #ifdef HAVE_OPENSSL_SSL_H
101 #include <openssl/ssl.h>
102 #endif
103 
104 #ifdef HAVE_LINUX_NET_TSTAMP_H
105 #include <linux/net_tstamp.h>
106 #endif
107 
/** number of queued TCP connections for listen(); used as the backlog
 * argument for the stream accept sockets created in this file */
#define TCP_BACKLOG 256

#ifndef THREADS_DISABLED
/* the locks below only exist when threads are compiled in */
/** lock on the counter of stream buffer memory */
static lock_basic_type stream_wait_count_lock;
/** lock on the counter of HTTP2 query buffer memory */
static lock_basic_type http2_query_buffer_count_lock;
/** lock on the counter of HTTP2 response buffer memory */
static lock_basic_type http2_response_buffer_count_lock;
#endif
/** size (in bytes) of stream wait buffers */
static size_t stream_wait_count = 0;
/** is the lock initialised for stream wait buffers, starts out as 0 */
static int stream_wait_lock_inited = 0;
/** size (in bytes) of HTTP2 query buffers */
static size_t http2_query_buffer_count = 0;
/** is the lock initialised for HTTP2 query buffers, starts out as 0 */
static int http2_query_buffer_lock_inited = 0;
/** size (in bytes) of HTTP2 response buffers */
static size_t http2_response_buffer_count = 0;
/** is the lock initialised for HTTP2 response buffers, starts out as 0 */
static int http2_response_buffer_lock_inited = 0;
131 
132 /**
133  * Debug print of the getaddrinfo returned address.
134  * @param addr: the address returned.
135  * @param additional: additional text that describes the type of socket,
136  * 	or NULL for no text.
137  */
138 static void
verbose_print_addr(struct addrinfo * addr,const char * additional)139 verbose_print_addr(struct addrinfo *addr, const char* additional)
140 {
141 	if(verbosity >= VERB_ALGO) {
142 		char buf[100];
143 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
144 #ifdef INET6
145 		if(addr->ai_family == AF_INET6)
146 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
147 				sin6_addr;
148 #endif /* INET6 */
149 		if(inet_ntop(addr->ai_family, sinaddr, buf,
150 			(socklen_t)sizeof(buf)) == 0) {
151 			(void)strlcpy(buf, "(null)", sizeof(buf));
152 		}
153 		buf[sizeof(buf)-1] = 0;
154 		verbose(VERB_ALGO, "creating %s%s socket %s %d%s%s",
155 			addr->ai_socktype==SOCK_DGRAM?"udp":
156 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
157 			addr->ai_family==AF_INET?"4":
158 			addr->ai_family==AF_INET6?"6":
159 			"_otherfam", buf,
160 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port),
161 			(additional?" ":""), (additional?additional:""));
162 	}
163 }
164 
165 void
verbose_print_unbound_socket(struct unbound_socket * ub_sock)166 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
167 {
168 	if(verbosity >= VERB_ALGO) {
169 		char buf[256];
170 		log_info("listing of unbound_socket structure:");
171 		addr_to_str((void*)ub_sock->addr, ub_sock->addrlen, buf,
172 			sizeof(buf));
173 		log_info("%s s is: %d, fam is: %s, acl: %s", buf, ub_sock->s,
174 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
175 			ub_sock->acl?"yes":"no");
176 	}
177 }
178 
179 #ifdef HAVE_SYSTEMD
180 static int
systemd_get_activated(int family,int socktype,int listen,struct sockaddr * addr,socklen_t addrlen,const char * path)181 systemd_get_activated(int family, int socktype, int listen,
182 		      struct sockaddr *addr, socklen_t addrlen,
183 		      const char *path)
184 {
185 	int i = 0;
186 	int r = 0;
187 	int s = -1;
188 	const char* listen_pid, *listen_fds;
189 
190 	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */
191 
192 	if((r = sd_booted()) < 1) {
193 		if(r == 0)
194 			log_warn("systemd is not running");
195 		else
196 			log_err("systemd sd_booted(): %s", strerror(-r));
197 		return -1;
198 	}
199 
200 	listen_pid = getenv("LISTEN_PID");
201 	listen_fds = getenv("LISTEN_FDS");
202 
203 	if (!listen_pid) {
204 		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
205 		return -1;
206 	}
207 
208 	if (!listen_fds) {
209 		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
210 		return -1;
211 	}
212 
213 	if((r = sd_listen_fds(0)) < 1) {
214 		if(r == 0)
215 			log_warn("systemd: did not return socket, check unit configuration");
216 		else
217 			log_err("systemd sd_listen_fds(): %s", strerror(-r));
218 		return -1;
219 	}
220 
221 	for(i = 0; i < r; i++) {
222 		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype, listen)) {
223 			s = SD_LISTEN_FDS_START + i;
224 			break;
225 		}
226 	}
227 	if (s == -1) {
228 		if (addr)
229 			log_err_addr("systemd sd_listen_fds()",
230 				     "no such socket",
231 				     (struct sockaddr_storage *)addr, addrlen);
232 		else
233 			log_err("systemd sd_listen_fds(): %s", path);
234 	}
235 	return s;
236 }
237 #endif
238 
239 int
create_udp_sock(int family,int socktype,struct sockaddr * addr,socklen_t addrlen,int v6only,int * inuse,int * noproto,int rcv,int snd,int listen,int * reuseport,int transparent,int freebind,int use_systemd,int dscp)240 create_udp_sock(int family, int socktype, struct sockaddr* addr,
241         socklen_t addrlen, int v6only, int* inuse, int* noproto,
242 	int rcv, int snd, int listen, int* reuseport, int transparent,
243 	int freebind, int use_systemd, int dscp)
244 {
245 	int s;
246 	char* err;
247 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
248 	int on=1;
249 #endif
250 #ifdef IPV6_MTU
251 	int mtu = IPV6_MIN_MTU;
252 #endif
253 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
254 	(void)rcv;
255 #endif
256 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
257 	(void)snd;
258 #endif
259 #ifndef IPV6_V6ONLY
260 	(void)v6only;
261 #endif
262 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
263 	(void)transparent;
264 #endif
265 #if !defined(IP_FREEBIND)
266 	(void)freebind;
267 #endif
268 #ifdef HAVE_SYSTEMD
269 	int got_fd_from_systemd = 0;
270 
271 	if (!use_systemd
272 	    || (use_systemd
273 		&& (s = systemd_get_activated(family, socktype, -1, addr,
274 					      addrlen, NULL)) == -1)) {
275 #else
276 	(void)use_systemd;
277 #endif
278 	if((s = socket(family, socktype, 0)) == -1) {
279 		*inuse = 0;
280 #ifndef USE_WINSOCK
281 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
282 			*noproto = 1;
283 			return -1;
284 		}
285 #else
286 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
287 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
288 			*noproto = 1;
289 			return -1;
290 		}
291 #endif
292 		log_err("can't create socket: %s", sock_strerror(errno));
293 		*noproto = 0;
294 		return -1;
295 	}
296 #ifdef HAVE_SYSTEMD
297 	} else {
298 		got_fd_from_systemd = 1;
299 	}
300 #endif
301 	if(listen) {
302 #ifdef SO_REUSEADDR
303 		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
304 			(socklen_t)sizeof(on)) < 0) {
305 			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
306 				sock_strerror(errno));
307 #ifndef USE_WINSOCK
308 			if(errno != ENOSYS) {
309 				close(s);
310 				*noproto = 0;
311 				*inuse = 0;
312 				return -1;
313 			}
314 #else
315 			closesocket(s);
316 			*noproto = 0;
317 			*inuse = 0;
318 			return -1;
319 #endif
320 		}
321 #endif /* SO_REUSEADDR */
322 #ifdef SO_REUSEPORT
323 #  ifdef SO_REUSEPORT_LB
324 		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
325 		 * like SO_REUSEPORT on Linux.  This is what the users want
326 		 * with the config option in unbound.conf; if we actually
327 		 * need local address and port reuse they'll also need to
328 		 * have SO_REUSEPORT set for them, assume it was _LB they want.
329 		 */
330 		if (reuseport && *reuseport &&
331 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
332 			(socklen_t)sizeof(on)) < 0) {
333 #ifdef ENOPROTOOPT
334 			if(errno != ENOPROTOOPT || verbosity >= 3)
335 				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
336 					strerror(errno));
337 #endif
338 			/* this option is not essential, we can continue */
339 			*reuseport = 0;
340 		}
341 #  else /* no SO_REUSEPORT_LB */
342 
343 		/* try to set SO_REUSEPORT so that incoming
344 		 * queries are distributed evenly among the receiving threads.
345 		 * Each thread must have its own socket bound to the same port,
346 		 * with SO_REUSEPORT set on each socket.
347 		 */
348 		if (reuseport && *reuseport &&
349 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
350 			(socklen_t)sizeof(on)) < 0) {
351 #ifdef ENOPROTOOPT
352 			if(errno != ENOPROTOOPT || verbosity >= 3)
353 				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
354 					strerror(errno));
355 #endif
356 			/* this option is not essential, we can continue */
357 			*reuseport = 0;
358 		}
359 #  endif /* SO_REUSEPORT_LB */
360 #else
361 		(void)reuseport;
362 #endif /* defined(SO_REUSEPORT) */
363 #ifdef IP_TRANSPARENT
364 		if (transparent &&
365 		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
366 		    (socklen_t)sizeof(on)) < 0) {
367 			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
368 			strerror(errno));
369 		}
370 #elif defined(IP_BINDANY)
371 		if (transparent &&
372 		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
373 		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
374 		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
375 			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
376 			(family==AF_INET6?"V6":""), strerror(errno));
377 		}
378 #elif defined(SO_BINDANY)
379 		if (transparent &&
380 		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
381 		    (socklen_t)sizeof(on)) < 0) {
382 			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
383 			strerror(errno));
384 		}
385 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
386 	}
387 #ifdef IP_FREEBIND
388 	if(freebind &&
389 	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
390 	    (socklen_t)sizeof(on)) < 0) {
391 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
392 		strerror(errno));
393 	}
394 #endif /* IP_FREEBIND */
395 	if(rcv) {
396 #ifdef SO_RCVBUF
397 		int got;
398 		socklen_t slen = (socklen_t)sizeof(got);
399 #  ifdef SO_RCVBUFFORCE
400 		/* Linux specific: try to use root permission to override
401 		 * system limits on rcvbuf. The limit is stored in
402 		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
403 		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
404 			(socklen_t)sizeof(rcv)) < 0) {
405 			if(errno != EPERM) {
406 				log_err("setsockopt(..., SO_RCVBUFFORCE, "
407 					"...) failed: %s", sock_strerror(errno));
408 				sock_close(s);
409 				*noproto = 0;
410 				*inuse = 0;
411 				return -1;
412 			}
413 #  endif /* SO_RCVBUFFORCE */
414 			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
415 				(socklen_t)sizeof(rcv)) < 0) {
416 				log_err("setsockopt(..., SO_RCVBUF, "
417 					"...) failed: %s", sock_strerror(errno));
418 				sock_close(s);
419 				*noproto = 0;
420 				*inuse = 0;
421 				return -1;
422 			}
423 			/* check if we got the right thing or if system
424 			 * reduced to some system max.  Warn if so */
425 			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
426 				&slen) >= 0 && got < rcv/2) {
427 				log_warn("so-rcvbuf %u was not granted. "
428 					"Got %u. To fix: start with "
429 					"root permissions(linux) or sysctl "
430 					"bigger net.core.rmem_max(linux) or "
431 					"kern.ipc.maxsockbuf(bsd) values.",
432 					(unsigned)rcv, (unsigned)got);
433 			}
434 #  ifdef SO_RCVBUFFORCE
435 		}
436 #  endif
437 #endif /* SO_RCVBUF */
438 	}
439 	/* first do RCVBUF as the receive buffer is more important */
440 	if(snd) {
441 #ifdef SO_SNDBUF
442 		int got;
443 		socklen_t slen = (socklen_t)sizeof(got);
444 #  ifdef SO_SNDBUFFORCE
445 		/* Linux specific: try to use root permission to override
446 		 * system limits on sndbuf. The limit is stored in
447 		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
448 		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
449 			(socklen_t)sizeof(snd)) < 0) {
450 			if(errno != EPERM) {
451 				log_err("setsockopt(..., SO_SNDBUFFORCE, "
452 					"...) failed: %s", sock_strerror(errno));
453 				sock_close(s);
454 				*noproto = 0;
455 				*inuse = 0;
456 				return -1;
457 			}
458 #  endif /* SO_SNDBUFFORCE */
459 			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
460 				(socklen_t)sizeof(snd)) < 0) {
461 				log_err("setsockopt(..., SO_SNDBUF, "
462 					"...) failed: %s", sock_strerror(errno));
463 				sock_close(s);
464 				*noproto = 0;
465 				*inuse = 0;
466 				return -1;
467 			}
468 			/* check if we got the right thing or if system
469 			 * reduced to some system max.  Warn if so */
470 			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
471 				&slen) >= 0 && got < snd/2) {
472 				log_warn("so-sndbuf %u was not granted. "
473 					"Got %u. To fix: start with "
474 					"root permissions(linux) or sysctl "
475 					"bigger net.core.wmem_max(linux) or "
476 					"kern.ipc.maxsockbuf(bsd) values.",
477 					(unsigned)snd, (unsigned)got);
478 			}
479 #  ifdef SO_SNDBUFFORCE
480 		}
481 #  endif
482 #endif /* SO_SNDBUF */
483 	}
484 	err = set_ip_dscp(s, family, dscp);
485 	if(err != NULL)
486 		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
487 	if(family == AF_INET6) {
488 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
489 		int omit6_set = 0;
490 		int action;
491 # endif
492 # if defined(IPV6_V6ONLY)
493 		if(v6only
494 #   ifdef HAVE_SYSTEMD
495 			/* Systemd wants to control if the socket is v6 only
496 			 * or both, with BindIPv6Only=default, ipv6-only or
497 			 * both in systemd.socket, so it is not set here. */
498 			&& !got_fd_from_systemd
499 #   endif
500 			) {
501 			int val=(v6only==2)?0:1;
502 			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
503 				(void*)&val, (socklen_t)sizeof(val)) < 0) {
504 				log_err("setsockopt(..., IPV6_V6ONLY"
505 					", ...) failed: %s", sock_strerror(errno));
506 				sock_close(s);
507 				*noproto = 0;
508 				*inuse = 0;
509 				return -1;
510 			}
511 		}
512 # endif
513 # if defined(IPV6_USE_MIN_MTU)
514 		/*
515 		 * There is no fragmentation of IPv6 datagrams
516 		 * during forwarding in the network. Therefore
517 		 * we do not send UDP datagrams larger than
518 		 * the minimum IPv6 MTU of 1280 octets. The
519 		 * EDNS0 message length can be larger if the
520 		 * network stack supports IPV6_USE_MIN_MTU.
521 		 */
522 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
523 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
524 			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
525 				"...) failed: %s", sock_strerror(errno));
526 			sock_close(s);
527 			*noproto = 0;
528 			*inuse = 0;
529 			return -1;
530 		}
531 # elif defined(IPV6_MTU)
532 #   ifndef USE_WINSOCK
533 		/*
534 		 * On Linux, to send no larger than 1280, the PMTUD is
535 		 * disabled by default for datagrams anyway, so we set
536 		 * the MTU to use.
537 		 */
538 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
539 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
540 			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
541 				sock_strerror(errno));
542 			sock_close(s);
543 			*noproto = 0;
544 			*inuse = 0;
545 			return -1;
546 		}
547 #   elif defined(IPV6_USER_MTU)
548 		/* As later versions of the mingw crosscompiler define
549 		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
550 		 * instead which is writable; IPV6_MTU is readonly there. */
551 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
552 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
553 			if (WSAGetLastError() != WSAENOPROTOOPT) {
554 				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
555 					wsa_strerror(WSAGetLastError()));
556 				sock_close(s);
557 				*noproto = 0;
558 				*inuse = 0;
559 				return -1;
560 			}
561 		}
562 #   endif /* USE_WINSOCK */
563 # endif /* IPv6 MTU */
564 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
565 #  if defined(IP_PMTUDISC_OMIT)
566 		action = IP_PMTUDISC_OMIT;
567 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
568 			&action, (socklen_t)sizeof(action)) < 0) {
569 
570 			if (errno != EINVAL) {
571 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
572 					strerror(errno));
573 				sock_close(s);
574 				*noproto = 0;
575 				*inuse = 0;
576 				return -1;
577 			}
578 		}
579 		else
580 		{
581 		    omit6_set = 1;
582 		}
583 #  endif
584 		if (omit6_set == 0) {
585 			action = IP_PMTUDISC_DONT;
586 			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
587 				&action, (socklen_t)sizeof(action)) < 0) {
588 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
589 					strerror(errno));
590 				sock_close(s);
591 				*noproto = 0;
592 				*inuse = 0;
593 				return -1;
594 			}
595 		}
596 # endif /* IPV6_MTU_DISCOVER */
597 	} else if(family == AF_INET) {
598 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
599 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
600  * PMTU information is not accepted, but fragmentation is allowed
601  * if and only if the packet size exceeds the outgoing interface MTU
602  * (and also uses the interface mtu to determine the size of the packets).
603  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
604  * FreeBSD already has same semantics without setting the option. */
605 		int omit_set = 0;
606 		int action;
607 #   if defined(IP_PMTUDISC_OMIT)
608 		action = IP_PMTUDISC_OMIT;
609 		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
610 			&action, (socklen_t)sizeof(action)) < 0) {
611 
612 			if (errno != EINVAL) {
613 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
614 					strerror(errno));
615 				sock_close(s);
616 				*noproto = 0;
617 				*inuse = 0;
618 				return -1;
619 			}
620 		}
621 		else
622 		{
623 		    omit_set = 1;
624 		}
625 #   endif
626 		if (omit_set == 0) {
627    			action = IP_PMTUDISC_DONT;
628 			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
629 				&action, (socklen_t)sizeof(action)) < 0) {
630 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
631 					strerror(errno));
632 				sock_close(s);
633 				*noproto = 0;
634 				*inuse = 0;
635 				return -1;
636 			}
637 		}
638 #  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
639 		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
640 		 * but does not work on that version, so we exclude it */
641 		/* a nonzero value disables fragmentation, according to
642 		 * docs.oracle.com for ip(4). */
643 		int off = 1;
644 		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
645 			&off, (socklen_t)sizeof(off)) < 0) {
646 			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
647 				strerror(errno));
648 			sock_close(s);
649 			*noproto = 0;
650 			*inuse = 0;
651 			return -1;
652 		}
653 #  endif /* IPv4 MTU */
654 	}
655 	if(
656 #ifdef HAVE_SYSTEMD
657 		!got_fd_from_systemd &&
658 #endif
659 		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
660 		*noproto = 0;
661 		*inuse = 0;
662 #ifndef USE_WINSOCK
663 #ifdef EADDRINUSE
664 		*inuse = (errno == EADDRINUSE);
665 		/* detect freebsd jail with no ipv6 permission */
666 		if(family==AF_INET6 && errno==EINVAL)
667 			*noproto = 1;
668 		else if(errno != EADDRINUSE &&
669 			!(errno == EACCES && verbosity < 4 && !listen)
670 #ifdef EADDRNOTAVAIL
671 			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
672 #endif
673 			) {
674 			log_err_addr("can't bind socket", strerror(errno),
675 				(struct sockaddr_storage*)addr, addrlen);
676 		}
677 #endif /* EADDRINUSE */
678 #else /* USE_WINSOCK */
679 		if(WSAGetLastError() != WSAEADDRINUSE &&
680 			WSAGetLastError() != WSAEADDRNOTAVAIL &&
681 			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
682 			log_err_addr("can't bind socket",
683 				wsa_strerror(WSAGetLastError()),
684 				(struct sockaddr_storage*)addr, addrlen);
685 		}
686 #endif /* USE_WINSOCK */
687 		sock_close(s);
688 		return -1;
689 	}
690 	if(!fd_set_nonblock(s)) {
691 		*noproto = 0;
692 		*inuse = 0;
693 		sock_close(s);
694 		return -1;
695 	}
696 	return s;
697 }
698 
699 int
create_tcp_accept_sock(struct addrinfo * addr,int v6only,int * noproto,int * reuseport,int transparent,int mss,int nodelay,int freebind,int use_systemd,int dscp,const char * additional)700 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
701 	int* reuseport, int transparent, int mss, int nodelay, int freebind,
702 	int use_systemd, int dscp, const char* additional)
703 {
704 	int s = -1;
705 	char* err;
706 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
707 	int on = 1;
708 #endif
709 #ifdef HAVE_SYSTEMD
710 	int got_fd_from_systemd = 0;
711 #endif
712 #ifdef USE_TCP_FASTOPEN
713 	int qlen;
714 #endif
715 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
716 	(void)transparent;
717 #endif
718 #if !defined(IP_FREEBIND)
719 	(void)freebind;
720 #endif
721 	verbose_print_addr(addr, additional);
722 	*noproto = 0;
723 #ifdef HAVE_SYSTEMD
724 	if (!use_systemd ||
725 	    (use_systemd
726 	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
727 					   addr->ai_addr, addr->ai_addrlen,
728 					   NULL)) == -1)) {
729 #else
730 	(void)use_systemd;
731 #endif
732 	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
733 #ifndef USE_WINSOCK
734 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
735 			*noproto = 1;
736 			return -1;
737 		}
738 #else
739 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
740 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
741 			*noproto = 1;
742 			return -1;
743 		}
744 #endif
745 		log_err("can't create socket: %s", sock_strerror(errno));
746 		return -1;
747 	}
748 	if(nodelay) {
749 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
750 		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
751 			(socklen_t)sizeof(on)) < 0) {
752 			#ifndef USE_WINSOCK
753 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
754 				strerror(errno));
755 			#else
756 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
757 				wsa_strerror(WSAGetLastError()));
758 			#endif
759 		}
760 #else
761 		log_warn(" setsockopt(TCP_NODELAY) unsupported");
762 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
763 	}
764 	if (mss > 0) {
765 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
766 		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
767 			(socklen_t)sizeof(mss)) < 0) {
768 			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
769 				sock_strerror(errno));
770 		} else {
771 			verbose(VERB_ALGO,
772 				" tcp socket mss set to %d", mss);
773 		}
774 #else
775 		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
776 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
777 	}
778 #ifdef HAVE_SYSTEMD
779 	} else {
780 		got_fd_from_systemd = 1;
781     }
782 #endif
783 #ifdef SO_REUSEADDR
784 	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
785 		(socklen_t)sizeof(on)) < 0) {
786 		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
787 			sock_strerror(errno));
788 		sock_close(s);
789 		return -1;
790 	}
791 #endif /* SO_REUSEADDR */
792 #ifdef IP_FREEBIND
793 	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
794 	    (socklen_t)sizeof(on)) < 0) {
795 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
796 		strerror(errno));
797 	}
798 #endif /* IP_FREEBIND */
799 #ifdef SO_REUSEPORT
800 	/* try to set SO_REUSEPORT so that incoming
801 	 * connections are distributed evenly among the receiving threads.
802 	 * Each thread must have its own socket bound to the same port,
803 	 * with SO_REUSEPORT set on each socket.
804 	 */
805 	if (reuseport && *reuseport &&
806 		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
807 		(socklen_t)sizeof(on)) < 0) {
808 #ifdef ENOPROTOOPT
809 		if(errno != ENOPROTOOPT || verbosity >= 3)
810 			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
811 				strerror(errno));
812 #endif
813 		/* this option is not essential, we can continue */
814 		*reuseport = 0;
815 	}
816 #else
817 	(void)reuseport;
818 #endif /* defined(SO_REUSEPORT) */
819 #if defined(IPV6_V6ONLY)
820 	if(addr->ai_family == AF_INET6 && v6only
821 #  ifdef HAVE_SYSTEMD
822 		/* Systemd wants to control if the socket is v6 only
823 		 * or both, with BindIPv6Only=default, ipv6-only or
824 		 * both in systemd.socket, so it is not set here. */
825 		&& !got_fd_from_systemd
826 #  endif
827 		) {
828 		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
829 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
830 			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
831 				sock_strerror(errno));
832 			sock_close(s);
833 			return -1;
834 		}
835 	}
836 #else
837 	(void)v6only;
838 #endif /* IPV6_V6ONLY */
839 #ifdef IP_TRANSPARENT
840 	if (transparent &&
841 	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
842 	    (socklen_t)sizeof(on)) < 0) {
843 		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
844 			strerror(errno));
845 	}
846 #elif defined(IP_BINDANY)
847 	if (transparent &&
848 	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
849 	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
850 	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
851 		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
852 		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
853 	}
854 #elif defined(SO_BINDANY)
855 	if (transparent &&
856 	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
857 	    sizeof(on)) < 0) {
858 		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
859 		strerror(errno));
860 	}
861 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
862 	err = set_ip_dscp(s, addr->ai_family, dscp);
863 	if(err != NULL)
864 		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
865 	if(
866 #ifdef HAVE_SYSTEMD
867 		!got_fd_from_systemd &&
868 #endif
869         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
870 #ifndef USE_WINSOCK
871 		/* detect freebsd jail with no ipv6 permission */
872 		if(addr->ai_family==AF_INET6 && errno==EINVAL)
873 			*noproto = 1;
874 		else {
875 			log_err_addr("can't bind socket", strerror(errno),
876 				(struct sockaddr_storage*)addr->ai_addr,
877 				addr->ai_addrlen);
878 		}
879 #else
880 		log_err_addr("can't bind socket",
881 			wsa_strerror(WSAGetLastError()),
882 			(struct sockaddr_storage*)addr->ai_addr,
883 			addr->ai_addrlen);
884 #endif
885 		sock_close(s);
886 		return -1;
887 	}
888 	if(!fd_set_nonblock(s)) {
889 		sock_close(s);
890 		return -1;
891 	}
892 	if(listen(s, TCP_BACKLOG) == -1) {
893 		log_err("can't listen: %s", sock_strerror(errno));
894 		sock_close(s);
895 		return -1;
896 	}
897 #ifdef USE_TCP_FASTOPEN
898 	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
899 	   against IP spoofing attacks as suggested in RFC7413 */
900 #ifdef __APPLE__
901 	/* OS X implementation only supports qlen of 1 via this call. Actual
902 	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
903 	qlen = 1;
904 #else
905 	/* 5 is recommended on linux */
906 	qlen = 5;
907 #endif
908 	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
909 		  sizeof(qlen))) == -1 ) {
910 #ifdef ENOPROTOOPT
911 		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
912 		   disabled, except when verbosity enabled for debugging */
913 		if(errno != ENOPROTOOPT || verbosity >= 3) {
914 #endif
915 		  if(errno == EPERM) {
916 		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
917 		  } else {
918 		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
919 		  }
920 #ifdef ENOPROTOOPT
921 		}
922 #endif
923 	}
924 #endif
925 	return s;
926 }
927 
928 char*
set_ip_dscp(int socket,int addrfamily,int dscp)929 set_ip_dscp(int socket, int addrfamily, int dscp)
930 {
931 	int ds;
932 
933 	if(dscp == 0)
934 		return NULL;
935 	ds = dscp << 2;
936 	switch(addrfamily) {
937 	case AF_INET6:
938 	#ifdef IPV6_TCLASS
939 		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&ds,
940 			sizeof(ds)) < 0)
941 			return sock_strerror(errno);
942 		break;
943 	#else
944 		return "IPV6_TCLASS not defined on this system";
945 	#endif
946 	default:
947 		if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&ds, sizeof(ds)) < 0)
948 			return sock_strerror(errno);
949 		break;
950 	}
951 	return NULL;
952 }
953 
/**
 * Create a listening AF_LOCAL stream socket bound to a filesystem path.
 * When built with systemd support and use_systemd is true, a socket from
 * systemd_get_activated is returned if that call does not return -1.
 * Otherwise any existing file at path is unlinked and a new non-blocking
 * socket is created, bound and put into listen mode.
 * @param path: filesystem path of the socket.
 * @param noproto: set to 1 only when the build has no local socket support
 *	(no sys/un.h); not touched otherwise.
 * @param use_systemd: if true, try to fetch the socket from systemd first.
 * @return the socket fd, or -1 on failure (the reason is logged).
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif

#ifdef HAVE_SYSTEMD
	/* The systemd attempt is a self-contained block so that the braces
	 * balance in every combination of HAVE_SYSTEMD and HAVE_SYS_UN_H. */
	if(use_systemd) {
		int ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1,
			NULL, 0, path);
		if(ret != -1)
			return ret;
	}
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	(void)strlcpy(usock.sun_path, path, sizeof(usock.sun_path));

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	/* remove a stale socket file; a missing file is not an error */
	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
1028 
1029 
/**
 * Create socket from getaddrinfo results.
 *
 * Looks up ifname and port with getaddrinfo and opens a socket on the first
 * result: a UDP socket when stype is SOCK_DGRAM, a TCP accept socket
 * otherwise. On success the resolved address is copied into ub_sock.
 * @param stype: socket type; stored into hints->ai_socktype.
 * @param ifname: interface address, NULL is logged as "default".
 * @param port: port number as a string.
 * @param hints: getaddrinfo hints; ai_socktype is overwritten here.
 * @param v6only: passed on to the socket creation routine.
 * @param noip6: set to 1 when socket creation failed with noproto for an
 *	AF_INET6 lookup (or, on winsock, when the IPv6 lookup reports
 *	EAI_NONAME); set to 0 otherwise.
 * @param rcv: receive buffer size, UDP only.
 * @param snd: send buffer size, UDP only.
 * @param reuseport: passed on to the socket creation routine.
 * @param transparent: passed on to the socket creation routine.
 * @param tcp_mss: passed on to TCP socket creation.
 * @param nodelay: passed on to TCP socket creation.
 * @param freebind: passed on to the socket creation routine.
 * @param use_systemd: passed on to the socket creation routine.
 * @param dscp: passed on to the socket creation routine.
 * @param ub_sock: filled with the address copy, fd and family on success.
 *	On failure this function leaves no allocation of its own in it.
 * @param additional: extra text for the verbose address printout.
 * @return the socket fd, or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	struct addrinfo *res = NULL;
	int r, s, inuse, noproto;
	hints->ai_socktype = stype;
	*noip6 = 0;
	if((r=getaddrinfo(ifname, port, hints, &res)) != 0 || !res) {
#ifdef USE_WINSOCK
		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", port, gai_strerror(r),
#ifdef EAI_SYSTEM
			(r==EAI_SYSTEM?(char*)strerror(errno):"")
#else
			""
#endif
		);
		return -1;
	}
	/* validate the address before it is handed to socket creation */
	if(!res->ai_addr) {
		log_err("getaddrinfo returned no address");
		freeaddrinfo(res);
		return -1;
	}
	if(stype == SOCK_DGRAM) {
		verbose_print_addr(res, additional);
		s = create_udp_sock(res->ai_family, res->ai_socktype,
			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(s == -1 && inuse) {
			log_err("bind: address already in use");
		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	} else	{
		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp, additional);
		if(s == -1 && noproto && hints->ai_family == AF_INET6){
			*noip6 = 1;
		}
	}
	if(s == -1) {
		/* nothing was opened, so there is no fd to close and no
		 * reason to copy the address */
		freeaddrinfo(res);
		return -1;
	}

	ub_sock->addr = memdup(res->ai_addr, res->ai_addrlen);
	ub_sock->addrlen = res->ai_addrlen;
	if(!ub_sock->addr) {
		log_err("out of memory: allocate listening address");
		freeaddrinfo(res);
		sock_close(s);
		return -1;
	}
	freeaddrinfo(res);

	ub_sock->s = s;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return s;
}
1103 
/**
 * Make a socket, honouring an optional "ifspec@port" port override that is
 * embedded in ifname. Without an '@' the call goes straight to make_sock
 * with the given port. All other arguments are handed to make_sock as is.
 * @return the socket fd, or -1 on failure; when the interface or port part
 *	is too long for the local buffers, *noip6 is set to 0.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock,
	const char* additional)
{
	const char* at = strchr(ifname, '@');
	char portpart[16];
	char ifpart[128];
	size_t iflen, portlen;

	if(!at) {
		/* plain interface name, use the port as passed in */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock, additional);
	}

	/* override port with ifspec@port */
	iflen = (size_t)(at - ifname);
	if(iflen >= sizeof(ifpart)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	portlen = strlen(at+1);
	if(portlen >= sizeof(portpart)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* both lengths were checked against the buffers above */
	memcpy(ifpart, ifname, iflen);
	ifpart[iflen] = 0;
	memcpy(portpart, at+1, portlen);
	portpart[portlen] = 0;
	return make_sock(stype, ifpart, portpart, hints, v6only, noip6, rcv,
		snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock, additional);
}
1139 
1140 /**
1141  * Add port to open ports list.
1142  * @param list: list head. changed.
1143  * @param s: fd.
1144  * @param ftype: if fd is UDP.
1145  * @param pp2_enabled: if PROXYv2 is enabled for this port.
1146  * @param ub_sock: socket with address.
1147  * @return false on failure. list in unchanged then.
1148  */
1149 static int
port_insert(struct listen_port ** list,int s,enum listen_type ftype,int pp2_enabled,struct unbound_socket * ub_sock)1150 port_insert(struct listen_port** list, int s, enum listen_type ftype,
1151 	int pp2_enabled, struct unbound_socket* ub_sock)
1152 {
1153 	struct listen_port* item = (struct listen_port*)malloc(
1154 		sizeof(struct listen_port));
1155 	if(!item)
1156 		return 0;
1157 	item->next = *list;
1158 	item->fd = s;
1159 	item->ftype = ftype;
1160 	item->pp2_enabled = pp2_enabled;
1161 	item->socket = ub_sock;
1162 	*list = item;
1163 	return 1;
1164 }
1165 
/**
 * Set fd to receive software timestamps.
 * @param s: socket fd.
 * @return true on success, false (with a logged error) when the option
 *	cannot be set or the platform has no linux/net_tstamp.h.
 */
static int
set_recvtimestamp(int s)
{
#ifndef HAVE_LINUX_NET_TSTAMP_H
	log_err("packets timestamping is not supported on this platform");
	(void)s;
	return 0;
#else
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int r = setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags));
	if(r >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#endif
}
1184 
/**
 * Set fd to receive source address packet info.
 * Which socket option is used depends on what the platform headers define;
 * if none is available for the requested family an error is logged.
 * @param s: socket fd.
 * @param family: AF_INET6 or AF_INET; other families are left alone.
 * @return true on success (or unhandled family), false on failure.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* defined IPV6_RECVPKTINFO */

	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO */

	default:
		/* nothing to set for other families */
		break;
	}
	return 1;
}
1239 
1240 /** see if interface is ssl, its port number == the ssl port number */
1241 static int
if_is_ssl(const char * ifname,const char * port,int ssl_port,struct config_strlist * tls_additional_port)1242 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1243 	struct config_strlist* tls_additional_port)
1244 {
1245 	struct config_strlist* s;
1246 	char* p = strchr(ifname, '@');
1247 	if(!p && atoi(port) == ssl_port)
1248 		return 1;
1249 	if(p && atoi(p+1) == ssl_port)
1250 		return 1;
1251 	for(s = tls_additional_port; s; s = s->next) {
1252 		if(p && atoi(p+1) == atoi(s->str))
1253 			return 1;
1254 		if(!p && atoi(port) == atoi(s->str))
1255 			return 1;
1256 	}
1257 	return 0;
1258 }
1259 
1260 /**
1261  * Helper for ports_open. Creates one interface (or NULL for default).
1262  * @param ifname: The interface ip address.
1263  * @param do_auto: use automatic interface detection.
1264  * 	If enabled, then ifname must be the wildcard name.
1265  * @param do_udp: if udp should be used.
1266  * @param do_tcp: if tcp should be used.
1267  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1268  * @param port: Port number to use (as string).
1269  * @param list: list of open ports, appended to, changed to point to list head.
1270  * @param rcv: receive buffer size for UDP
1271  * @param snd: send buffer size for UDP
1272  * @param ssl_port: ssl service port number
1273  * @param tls_additional_port: list of additional ssl service port numbers.
1274  * @param https_port: DoH service port number
1275  * @param proxy_protocol_port: list of PROXYv2 port numbers.
1276  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1277  * 	set to false on exit if reuseport failed due to no kernel support.
1278  * @param transparent: set IP_TRANSPARENT socket option.
1279  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1280  * @param freebind: set IP_FREEBIND socket option.
1281  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1282  * @param use_systemd: if true, fetch sockets from systemd.
1283  * @param dnscrypt_port: dnscrypt service port number
1284  * @param dscp: DSCP to use.
1285  * @param quic_port: dns over quic port number.
1286  * @param http_notls_downstream: if no tls is used for https downstream.
1287  * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1288  * 	wait to discard if UDP packets have waited for long in the socket
1289  * 	buffer.
1290  * @return: returns false on error.
1291  */
static int
ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
	struct addrinfo *hints, const char* port, struct listen_port** list,
	size_t rcv, size_t snd, int ssl_port,
	struct config_strlist* tls_additional_port, int https_port,
	struct config_strlist* proxy_protocol_port,
	int* reuseport, int transparent, int tcp_mss, int freebind,
	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
	int quic_port, int http_notls_downstream, int sock_queue_timeout)
{
	int s, noip6=0;
	/* classify the interface by comparing its port with the service
	 * port numbers */
	int is_https = if_is_https(ifname, port, https_port);
	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
	int nodelay = is_https && http2_nodelay;
	struct unbound_socket* ub_sock;
	int is_doq = if_is_quic(ifname, port, quic_port);
	const char* add = NULL;

	/* neither transport requested: reported as an error */
	if(!do_udp && !do_tcp)
		return 0;

	/* PROXYv2 cannot be combined with DNSCrypt, DoH or DoQ */
	if(is_pp2) {
		if(is_dnscrypt) {
			fatal_exit("PROXYv2 and DNSCrypt combination not "
				"supported!");
		} else if(is_https) {
			fatal_exit("PROXYv2 and DoH combination not "
				"supported!");
		} else if(is_doq) {
			fatal_exit("PROXYv2 and DoQ combination not "
				"supported!");
		}
	}

	if(do_auto) {
		/* automatic interface: a UDP socket that is inserted with one
		 * of the udpancil listen types */
		ub_sock = calloc(1, sizeof(struct unbound_socket));
		if(!ub_sock)
			return 0;
		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
			&noip6, rcv, snd, reuseport, transparent,
			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
			(is_dnscrypt?"udpancil_dnscrypt":"udpancil"))) == -1) {
			free(ub_sock->addr);
			free(ub_sock);
			/* missing IPv6 support is only a warning, the
			 * function still reports success */
			if(noip6) {
				log_warn("IPv6 protocol not available");
				return 1;
			}
			return 0;
		}
		/* getting source addr packet info is highly non-portable */
		if(!set_recvpktinfo(s, hints->ai_family)) {
			sock_close(s);
			free(ub_sock->addr);
			free(ub_sock);
			return 0;
		}
		/* failing to enable timestamps is not fatal */
		if (sock_queue_timeout && !set_recvtimestamp(s)) {
			log_warn("socket timestamping is not available");
		}
		/* on success the list item keeps the ub_sock pointer */
		if(!port_insert(list, s, is_dnscrypt
			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
			is_pp2, ub_sock)) {
			sock_close(s);
			free(ub_sock->addr);
			free(ub_sock);
			return 0;
		}
	} else if(do_udp) {
		enum listen_type udp_port_type;
		ub_sock = calloc(1, sizeof(struct unbound_socket));
		if(!ub_sock)
			return 0;
		/* pick the listen type and the label passed to
		 * make_sock_port */
		if(is_dnscrypt) {
			udp_port_type = listen_type_udp_dnscrypt;
			add = "dnscrypt";
		} else if(is_doq) {
			udp_port_type = listen_type_doq;
			add = "doq";
			/* refuse DoQ when the port in effect (after '@' in
			 * ifname, else port) is 53 */
			if(((strchr(ifname, '@') &&
				atoi(strchr(ifname, '@')+1) == 53) ||
				(!strchr(ifname, '@') && atoi(port) == 53))) {
				log_err("DNS over QUIC is not allowed on "
					"port 53. Port 53 is for DNS "
					"datagrams. Error for "
					"interface '%s'.", ifname);
				/* addr is still NULL from calloc here */
				free(ub_sock->addr);
				free(ub_sock);
				return 0;
			}
		} else {
			udp_port_type = listen_type_udp;
			add = NULL;
		}
		/* regular udp socket */
		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
			&noip6, rcv, snd, reuseport, transparent,
			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock,
			add)) == -1) {
			free(ub_sock->addr);
			free(ub_sock);
			if(noip6) {
				log_warn("IPv6 protocol not available");
				return 1;
			}
			return 0;
		}
		/* DoQ sockets also get source address packet info enabled */
		if(udp_port_type == listen_type_doq) {
			if(!set_recvpktinfo(s, hints->ai_family)) {
				sock_close(s);
				free(ub_sock->addr);
				free(ub_sock);
				return 0;
			}
		}
		/* with a queue timeout configured, a plain udp port is
		 * switched to the udpancil listen type */
		if(udp_port_type == listen_type_udp && sock_queue_timeout)
			udp_port_type = listen_type_udpancil;
		if (sock_queue_timeout) {
			if(!set_recvtimestamp(s)) {
				log_warn("socket timestamping is not available");
			} else {
				/* NOTE(review): listen_type_udp was already
				 * replaced just above whenever
				 * sock_queue_timeout is set, so this test
				 * cannot be true here */
				if(udp_port_type == listen_type_udp)
					udp_port_type = listen_type_udpancil;
			}
		}
		if(!port_insert(list, s, udp_port_type, is_pp2, ub_sock)) {
			sock_close(s);
			free(ub_sock->addr);
			free(ub_sock);
			return 0;
		}
	}
	/* the TCP socket is opened in addition to any UDP socket above */
	if(do_tcp) {
		int is_ssl = if_is_ssl(ifname, port, ssl_port,
			tls_additional_port);
		enum listen_type port_type;
		ub_sock = calloc(1, sizeof(struct unbound_socket));
		if(!ub_sock)
			return 0;
		/* ssl takes precedence over https, then dnscrypt */
		if(is_ssl) {
			port_type = listen_type_ssl;
			add = "tls";
		} else if(is_https) {
			port_type = listen_type_http;
			add = "https";
			if(http_notls_downstream)
				add = "http";
		} else if(is_dnscrypt) {
			port_type = listen_type_tcp_dnscrypt;
			add = "dnscrypt";
		} else {
			port_type = listen_type_tcp;
			add = NULL;
		}
		/* rcv and snd buffer sizes are passed as 0 for TCP */
		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock, add)) == -1) {
			free(ub_sock->addr);
			free(ub_sock);
			if(noip6) {
				/*log_warn("IPv6 protocol not available");*/
				return 1;
			}
			return 0;
		}
		if(is_ssl)
			verbose(VERB_ALGO, "setup TCP for SSL service");
		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
			sock_close(s);
			free(ub_sock->addr);
			free(ub_sock);
			return 0;
		}
	}
	return 1;
}
1469 
1470 /**
1471  * Add items to commpoint list in front.
1472  * @param c: commpoint to add.
1473  * @param front: listen struct.
1474  * @return: false on failure.
1475  */
1476 static int
listen_cp_insert(struct comm_point * c,struct listen_dnsport * front)1477 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1478 {
1479 	struct listen_list* item = (struct listen_list*)malloc(
1480 		sizeof(struct listen_list));
1481 	if(!item)
1482 		return 0;
1483 	item->com = c;
1484 	item->next = front->cps;
1485 	front->cps = item;
1486 	return 1;
1487 }
1488 
listen_setup_locks(void)1489 void listen_setup_locks(void)
1490 {
1491 	if(!stream_wait_lock_inited) {
1492 		lock_basic_init(&stream_wait_count_lock);
1493 		stream_wait_lock_inited = 1;
1494 	}
1495 	if(!http2_query_buffer_lock_inited) {
1496 		lock_basic_init(&http2_query_buffer_count_lock);
1497 		http2_query_buffer_lock_inited = 1;
1498 	}
1499 	if(!http2_response_buffer_lock_inited) {
1500 		lock_basic_init(&http2_response_buffer_count_lock);
1501 		http2_response_buffer_lock_inited = 1;
1502 	}
1503 }
1504 
listen_desetup_locks(void)1505 void listen_desetup_locks(void)
1506 {
1507 	if(stream_wait_lock_inited) {
1508 		stream_wait_lock_inited = 0;
1509 		lock_basic_destroy(&stream_wait_count_lock);
1510 	}
1511 	if(http2_query_buffer_lock_inited) {
1512 		http2_query_buffer_lock_inited = 0;
1513 		lock_basic_destroy(&http2_query_buffer_count_lock);
1514 	}
1515 	if(http2_response_buffer_lock_inited) {
1516 		http2_response_buffer_lock_inited = 0;
1517 		lock_basic_destroy(&http2_response_buffer_count_lock);
1518 	}
1519 }
1520 
/*
 * Create the listening structure: one comm point for every entry in the
 * ports list, all using a single udp buffer of bufsize bytes.
 * The kind of comm point depends on the ftype of the port entry. Returns
 * the new structure, or NULL on allocation failure, when a comm point
 * cannot be created, or when the ports list yields no comm points at all.
 * On failure everything created so far is released with listen_delete.
 */
struct listen_dnsport*
listen_create(struct comm_base* base, struct listen_port* ports,
	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
	int harden_large_queries, uint32_t http_max_streams,
	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
	void* sslctx, struct dt_env* dtenv, struct doq_table* doq_table,
	struct ub_randstate* rnd, const char* ssl_service_key,
	const char* ssl_service_pem, struct config_file* cfg,
	comm_point_callback_type* cb, void *cb_arg)
{
	struct listen_dnsport* front = (struct listen_dnsport*)
		malloc(sizeof(struct listen_dnsport));
	if(!front)
		return NULL;
	front->cps = NULL;
	/* this buffer is handed to every comm point created below */
	front->udp_buff = sldns_buffer_new(bufsize);
#ifdef USE_DNSCRYPT
	front->dnscrypt_udp_buff = NULL;
#endif
	if(!front->udp_buff) {
		free(front);
		return NULL;
	}

	/* create comm points as needed */
	while(ports) {
		struct comm_point* cp = NULL;
		if(ports->ftype == listen_type_udp ||
		   ports->ftype == listen_type_udp_dnscrypt) {
			cp = comm_point_create_udp(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
		} else if(ports->ftype == listen_type_doq) {
#ifndef HAVE_NGTCP2
			/* only a warning; creation is still attempted */
			log_warn("Unbound is not compiled with "
				"ngtcp2. This is required to use DNS "
				"over QUIC.");
#endif
			cp = comm_point_create_doq(base, ports->fd,
				front->udp_buff, cb, cb_arg, ports->socket,
				doq_table, rnd, ssl_service_key,
				ssl_service_pem, cfg);
		} else if(ports->ftype == listen_type_tcp ||
				ports->ftype == listen_type_tcp_dnscrypt) {
			/* plain tcp: the http stream limit and endpoint
			 * arguments are passed as 0 and NULL */
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries, 0, NULL,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
		} else if(ports->ftype == listen_type_ssl ||
			ports->ftype == listen_type_http) {
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries,
				http_max_streams, http_endpoint,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
			/* the warnings below do not stop the setup */
			if(ports->ftype == listen_type_http) {
				if(!sslctx && !http_notls) {
					log_warn("HTTPS port configured, but "
						"no TLS tls-service-key or "
						"tls-service-pem set");
				}
#ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
				if(!http_notls) {
					log_warn("Unbound is not compiled "
						"with an OpenSSL version "
						"supporting ALPN "
						"(OpenSSL >= 1.0.2). This "
						"is required to use "
						"DNS-over-HTTPS");
				}
#endif
#ifndef HAVE_NGHTTP2_NGHTTP2_H
				log_warn("Unbound is not compiled with "
					"nghttp2. This is required to use "
					"DNS-over-HTTPS.");
#endif
			}
		} else if(ports->ftype == listen_type_udpancil ||
				  ports->ftype == listen_type_udpancil_dnscrypt) {
#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
			cp = comm_point_create_udp_ancil(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
#else
			/* cp stays NULL, so the check below fails the setup */
			log_warn("This system does not support UDP ancilliary data.");
#endif
		}
		/* also reached with cp == NULL for an ftype that none of the
		 * branches above handles */
		if(!cp) {
			log_err("can't create commpoint");
			listen_delete(front);
			return NULL;
		}
		/* the listed types get no ssl context; every other type
		 * (ssl, doq, and http unless http_notls) gets sslctx */
		if((http_notls && ports->ftype == listen_type_http) ||
			(ports->ftype == listen_type_tcp) ||
			(ports->ftype == listen_type_udp) ||
			(ports->ftype == listen_type_udpancil) ||
			(ports->ftype == listen_type_tcp_dnscrypt) ||
			(ports->ftype == listen_type_udp_dnscrypt) ||
			(ports->ftype == listen_type_udpancil_dnscrypt))
			cp->ssl = NULL;
		else
			cp->ssl = sslctx;
		cp->dtenv = dtenv;
		/* NOTE(review): presumably set because the fd belongs to the
		 * ports list and not to the comm point - confirm */
		cp->do_not_close = 1;
#ifdef USE_DNSCRYPT
		if (ports->ftype == listen_type_udp_dnscrypt ||
			ports->ftype == listen_type_tcp_dnscrypt ||
			ports->ftype == listen_type_udpancil_dnscrypt) {
			cp->dnscrypt = 1;
			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
			if(!cp->dnscrypt_buffer) {
				log_err("can't alloc dnscrypt_buffer");
				/* cp is not in front->cps yet, delete it
				 * separately */
				comm_point_delete(cp);
				listen_delete(front);
				return NULL;
			}
			/* front remembers the buffer of the dnscrypt comm
			 * point that was created last */
			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
		}
#endif
		if(!listen_cp_insert(cp, front)) {
			log_err("malloc failed");
			comm_point_delete(cp);
			listen_delete(front);
			return NULL;
		}
		ports = ports->next;
	}
	/* an empty ports list is an error */
	if(!front->cps) {
		log_err("Could not open sockets to accept queries.");
		listen_delete(front);
		return NULL;
	}

	return front;
}
1660 
1661 void
listen_list_delete(struct listen_list * list)1662 listen_list_delete(struct listen_list* list)
1663 {
1664 	struct listen_list *p = list, *pn;
1665 	while(p) {
1666 		pn = p->next;
1667 		comm_point_delete(p->com);
1668 		free(p);
1669 		p = pn;
1670 	}
1671 }
1672 
1673 void
listen_delete(struct listen_dnsport * front)1674 listen_delete(struct listen_dnsport* front)
1675 {
1676 	if(!front)
1677 		return;
1678 	listen_list_delete(front->cps);
1679 #ifdef USE_DNSCRYPT
1680 	if(front->dnscrypt_udp_buff &&
1681 		front->udp_buff != front->dnscrypt_udp_buff) {
1682 		sldns_buffer_free(front->dnscrypt_udp_buff);
1683 	}
1684 #endif
1685 	sldns_buffer_free(front->udp_buff);
1686 	free(front);
1687 }
1688 
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of str to a growing string array.
 * @return false when realloc or strdup fails; the count is then unchanged.
 */
static int
ifa_strarray_append(char ***ip_addresses, int *ip_addresses_size,
	const char *str)
{
	void *grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	(*ip_addresses)[*ip_addresses_size] = strdup(str);
	if(!(*ip_addresses)[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Replace an interface name by the addresses configured on it.
 * search_ifa is "name" or "name@port". For every entry in ifas with that
 * name and an IPv4 (or, with INET6, IPv6) address, the address text with
 * the "@port" part re-attached is appended to the array. If nothing was
 * appended, search_ifa itself is appended unchanged.
 * @return false on failure (inet_ntop or out of memory).
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	const int size_on_entry = *ip_addresses_size;
	/* "@port" tail of search_ifa, and the length of the name before it */
	const char *port_tail = strrchr(search_ifa, '@');
	size_t namelen;
	struct ifaddrs *ifa;

	if(port_tail) {
		namelen = (size_t)(port_tail - search_ifa);
	} else {
		namelen = strlen(search_ifa);
		port_tail = "";
	}

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name must equal the name part exactly */
		if(strlen(ifa->ifa_name) != namelen ||
			strncmp(ifa->ifa_name, search_ifa, namelen) != 0)
			continue;
		if(ifa->ifa_addr == NULL)
			continue;

		switch(ifa->ifa_addr->sa_family) {
		case AF_INET: {
			char a4[INET_ADDRSTRLEN + 1];
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			if(!inet_ntop(AF_INET, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, port_tail);
			break;
		}
#ifdef INET6
		case AF_INET6: {
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			char a6[INET6_ADDRSTRLEN + 1];
			char scope_name[IF_NAMESIZE + 1];
			scope_name[0] = 0;
			if(!inet_ntop(AF_INET6, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* scope_name stays empty if the lookup gives nothing */
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)scope_name);
			if(scope_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, scope_name, port_tail);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, port_tail);
			}
			break;
		}
#endif
		default:
			/* other address families are skipped */
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ifa_strarray_append(ip_addresses, ip_addresses_size,
			addr_buf))
			return 0;
	}

	/* nothing matched: keep the configured string as it is */
	if(*ip_addresses_size == size_on_entry)
		return ifa_strarray_append(ip_addresses, ip_addresses_size,
			search_ifa);
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1792 
resolve_interface_names(char ** ifs,int num_ifs,struct config_strlist * list,char *** resif,int * num_resif)1793 int resolve_interface_names(char** ifs, int num_ifs,
1794 	struct config_strlist* list, char*** resif, int* num_resif)
1795 {
1796 #ifdef HAVE_GETIFADDRS
1797 	struct ifaddrs *addrs = NULL;
1798 	if(num_ifs == 0 && list == NULL) {
1799 		*resif = NULL;
1800 		*num_resif = 0;
1801 		return 1;
1802 	}
1803 	if(getifaddrs(&addrs) == -1) {
1804 		log_err("failed to list interfaces: getifaddrs: %s",
1805 			strerror(errno));
1806 		freeifaddrs(addrs);
1807 		return 0;
1808 	}
1809 	if(ifs) {
1810 		int i;
1811 		for(i=0; i<num_ifs; i++) {
1812 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1813 				freeifaddrs(addrs);
1814 				config_del_strarray(*resif, *num_resif);
1815 				*resif = NULL;
1816 				*num_resif = 0;
1817 				return 0;
1818 			}
1819 		}
1820 	}
1821 	if(list) {
1822 		struct config_strlist* p;
1823 		for(p = list; p; p = p->next) {
1824 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1825 				freeifaddrs(addrs);
1826 				config_del_strarray(*resif, *num_resif);
1827 				*resif = NULL;
1828 				*num_resif = 0;
1829 				return 0;
1830 			}
1831 }
1832 	}
1833 	freeifaddrs(addrs);
1834 	return 1;
1835 #else
1836 	struct config_strlist* p;
1837 	if(num_ifs == 0 && list == NULL) {
1838 		*resif = NULL;
1839 		*num_resif = 0;
1840 		return 1;
1841 	}
1842 	*num_resif = num_ifs;
1843 	for(p = list; p; p = p->next) {
1844 		(*num_resif)++;
1845 	}
1846 	*resif = calloc(*num_resif, sizeof(**resif));
1847 	if(!*resif) {
1848 		log_err("out of memory");
1849 		return 0;
1850 	}
1851 	if(ifs) {
1852 		int i;
1853 		for(i=0; i<num_ifs; i++) {
1854 			(*resif)[i] = strdup(ifs[i]);
1855 			if(!((*resif)[i])) {
1856 				log_err("out of memory");
1857 				config_del_strarray(*resif, *num_resif);
1858 				*resif = NULL;
1859 				*num_resif = 0;
1860 				return 0;
1861 			}
1862 		}
1863 	}
1864 	if(list) {
1865 		int idx = num_ifs;
1866 		for(p = list; p; p = p->next) {
1867 			(*resif)[idx] = strdup(p->str);
1868 			if(!((*resif)[idx])) {
1869 				log_err("out of memory");
1870 				config_del_strarray(*resif, *num_resif);
1871 				*resif = NULL;
1872 				*num_resif = 0;
1873 				return 0;
1874 			}
1875 			idx++;
1876 		}
1877 	}
1878 	return 1;
1879 #endif /* HAVE_GETIFADDRS */
1880 }
1881 
1882 struct listen_port*
listening_ports_open(struct config_file * cfg,char ** ifs,int num_ifs,int * reuseport)1883 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1884 	int* reuseport)
1885 {
1886 	struct listen_port* list = NULL;
1887 	struct addrinfo hints;
1888 	int i, do_ip4, do_ip6;
1889 	int do_tcp, do_auto;
1890 	char portbuf[32];
1891 	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1892 	do_ip4 = cfg->do_ip4;
1893 	do_ip6 = cfg->do_ip6;
1894 	do_tcp = cfg->do_tcp;
1895 	do_auto = cfg->if_automatic && cfg->do_udp;
1896 	if(cfg->incoming_num_tcp == 0)
1897 		do_tcp = 0;
1898 
1899 	/* getaddrinfo */
1900 	memset(&hints, 0, sizeof(hints));
1901 	hints.ai_flags = AI_PASSIVE;
1902 	/* no name lookups on our listening ports */
1903 	if(num_ifs > 0)
1904 		hints.ai_flags |= AI_NUMERICHOST;
1905 	hints.ai_family = AF_UNSPEC;
1906 #ifndef INET6
1907 	do_ip6 = 0;
1908 #endif
1909 	if(!do_ip4 && !do_ip6) {
1910 		return NULL;
1911 	}
1912 	/* create ip4 and ip6 ports so that return addresses are nice. */
1913 	if(do_auto || num_ifs == 0) {
1914 		if(do_auto && cfg->if_automatic_ports &&
1915 			cfg->if_automatic_ports[0]!=0) {
1916 			char* now = cfg->if_automatic_ports;
1917 			while(now && *now) {
1918 				char* after;
1919 				int extraport;
1920 				while(isspace((unsigned char)*now))
1921 					now++;
1922 				if(!*now)
1923 					break;
1924 				after = now;
1925 				extraport = (int)strtol(now, &after, 10);
1926 				if(extraport < 0 || extraport > 65535) {
1927 					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1928 					listening_ports_free(list);
1929 					return NULL;
1930 				}
1931 				if(extraport == 0 && now == after) {
1932 					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1933 					listening_ports_free(list);
1934 					return NULL;
1935 				}
1936 				now = after;
1937 				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
1938 				if(do_ip6) {
1939 					hints.ai_family = AF_INET6;
1940 					if(!ports_create_if("::0",
1941 						do_auto, cfg->do_udp, do_tcp,
1942 						&hints, portbuf, &list,
1943 						cfg->so_rcvbuf, cfg->so_sndbuf,
1944 						cfg->ssl_port, cfg->tls_additional_port,
1945 						cfg->https_port,
1946 						cfg->proxy_protocol_port,
1947 						reuseport, cfg->ip_transparent,
1948 						cfg->tcp_mss, cfg->ip_freebind,
1949 						cfg->http_nodelay, cfg->use_systemd,
1950 						cfg->dnscrypt_port, cfg->ip_dscp,
1951 						cfg->quic_port, cfg->http_notls_downstream,
1952 						cfg->sock_queue_timeout)) {
1953 						listening_ports_free(list);
1954 						return NULL;
1955 					}
1956 				}
1957 				if(do_ip4) {
1958 					hints.ai_family = AF_INET;
1959 					if(!ports_create_if("0.0.0.0",
1960 						do_auto, cfg->do_udp, do_tcp,
1961 						&hints, portbuf, &list,
1962 						cfg->so_rcvbuf, cfg->so_sndbuf,
1963 						cfg->ssl_port, cfg->tls_additional_port,
1964 						cfg->https_port,
1965 						cfg->proxy_protocol_port,
1966 						reuseport, cfg->ip_transparent,
1967 						cfg->tcp_mss, cfg->ip_freebind,
1968 						cfg->http_nodelay, cfg->use_systemd,
1969 						cfg->dnscrypt_port, cfg->ip_dscp,
1970 						cfg->quic_port, cfg->http_notls_downstream,
1971 						cfg->sock_queue_timeout)) {
1972 						listening_ports_free(list);
1973 						return NULL;
1974 					}
1975 				}
1976 			}
1977 			return list;
1978 		}
1979 		if(do_ip6) {
1980 			hints.ai_family = AF_INET6;
1981 			if(!ports_create_if(do_auto?"::0":"::1",
1982 				do_auto, cfg->do_udp, do_tcp,
1983 				&hints, portbuf, &list,
1984 				cfg->so_rcvbuf, cfg->so_sndbuf,
1985 				cfg->ssl_port, cfg->tls_additional_port,
1986 				cfg->https_port, cfg->proxy_protocol_port,
1987 				reuseport, cfg->ip_transparent,
1988 				cfg->tcp_mss, cfg->ip_freebind,
1989 				cfg->http_nodelay, cfg->use_systemd,
1990 				cfg->dnscrypt_port, cfg->ip_dscp,
1991 				cfg->quic_port, cfg->http_notls_downstream,
1992 				cfg->sock_queue_timeout)) {
1993 				listening_ports_free(list);
1994 				return NULL;
1995 			}
1996 		}
1997 		if(do_ip4) {
1998 			hints.ai_family = AF_INET;
1999 			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
2000 				do_auto, cfg->do_udp, do_tcp,
2001 				&hints, portbuf, &list,
2002 				cfg->so_rcvbuf, cfg->so_sndbuf,
2003 				cfg->ssl_port, cfg->tls_additional_port,
2004 				cfg->https_port, cfg->proxy_protocol_port,
2005 				reuseport, cfg->ip_transparent,
2006 				cfg->tcp_mss, cfg->ip_freebind,
2007 				cfg->http_nodelay, cfg->use_systemd,
2008 				cfg->dnscrypt_port, cfg->ip_dscp,
2009 				cfg->quic_port, cfg->http_notls_downstream,
2010 				cfg->sock_queue_timeout)) {
2011 				listening_ports_free(list);
2012 				return NULL;
2013 			}
2014 		}
2015 	} else for(i = 0; i<num_ifs; i++) {
2016 		if(str_is_ip6(ifs[i])) {
2017 			if(!do_ip6)
2018 				continue;
2019 			hints.ai_family = AF_INET6;
2020 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
2021 				do_tcp, &hints, portbuf, &list,
2022 				cfg->so_rcvbuf, cfg->so_sndbuf,
2023 				cfg->ssl_port, cfg->tls_additional_port,
2024 				cfg->https_port, cfg->proxy_protocol_port,
2025 				reuseport, cfg->ip_transparent,
2026 				cfg->tcp_mss, cfg->ip_freebind,
2027 				cfg->http_nodelay, cfg->use_systemd,
2028 				cfg->dnscrypt_port, cfg->ip_dscp,
2029 				cfg->quic_port, cfg->http_notls_downstream,
2030 				cfg->sock_queue_timeout)) {
2031 				listening_ports_free(list);
2032 				return NULL;
2033 			}
2034 		} else {
2035 			if(!do_ip4)
2036 				continue;
2037 			hints.ai_family = AF_INET;
2038 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
2039 				do_tcp, &hints, portbuf, &list,
2040 				cfg->so_rcvbuf, cfg->so_sndbuf,
2041 				cfg->ssl_port, cfg->tls_additional_port,
2042 				cfg->https_port, cfg->proxy_protocol_port,
2043 				reuseport, cfg->ip_transparent,
2044 				cfg->tcp_mss, cfg->ip_freebind,
2045 				cfg->http_nodelay, cfg->use_systemd,
2046 				cfg->dnscrypt_port, cfg->ip_dscp,
2047 				cfg->quic_port, cfg->http_notls_downstream,
2048 				cfg->sock_queue_timeout)) {
2049 				listening_ports_free(list);
2050 				return NULL;
2051 			}
2052 		}
2053 	}
2054 
2055 	return list;
2056 }
2057 
listening_ports_free(struct listen_port * list)2058 void listening_ports_free(struct listen_port* list)
2059 {
2060 	struct listen_port* nx;
2061 	while(list) {
2062 		nx = list->next;
2063 		if(list->fd != -1) {
2064 			sock_close(list->fd);
2065 		}
2066 		/* rc_ports don't have ub_socket */
2067 		if(list->socket) {
2068 			free(list->socket->addr);
2069 			free(list->socket);
2070 		}
2071 		free(list);
2072 		list = nx;
2073 	}
2074 }
2075 
listen_get_mem(struct listen_dnsport * listen)2076 size_t listen_get_mem(struct listen_dnsport* listen)
2077 {
2078 	struct listen_list* p;
2079 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
2080 		sizeof(*listen->udp_buff) +
2081 		sldns_buffer_capacity(listen->udp_buff);
2082 #ifdef USE_DNSCRYPT
2083 	s += sizeof(*listen->dnscrypt_udp_buff);
2084 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
2085 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
2086 	}
2087 #endif
2088 	for(p = listen->cps; p; p = p->next) {
2089 		s += sizeof(*p);
2090 		s += comm_point_get_mem(p->com);
2091 	}
2092 	return s;
2093 }
2094 
listen_stop_accept(struct listen_dnsport * listen)2095 void listen_stop_accept(struct listen_dnsport* listen)
2096 {
2097 	/* do not stop the ones that have no tcp_free list
2098 	 * (they have already stopped listening) */
2099 	struct listen_list* p;
2100 	for(p=listen->cps; p; p=p->next) {
2101 		if(p->com->type == comm_tcp_accept &&
2102 			p->com->tcp_free != NULL) {
2103 			comm_point_stop_listening(p->com);
2104 		}
2105 	}
2106 }
2107 
listen_start_accept(struct listen_dnsport * listen)2108 void listen_start_accept(struct listen_dnsport* listen)
2109 {
2110 	/* do not start the ones that have no tcp_free list, it is no
2111 	 * use to listen to them because they have no free tcp handlers */
2112 	struct listen_list* p;
2113 	for(p=listen->cps; p; p=p->next) {
2114 		if(p->com->type == comm_tcp_accept &&
2115 			p->com->tcp_free != NULL) {
2116 			comm_point_start_listening(p->com, -1, -1);
2117 		}
2118 	}
2119 }
2120 
2121 struct tcp_req_info*
tcp_req_info_create(struct sldns_buffer * spoolbuf)2122 tcp_req_info_create(struct sldns_buffer* spoolbuf)
2123 {
2124 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2125 	if(!req) {
2126 		log_err("malloc failure for new stream outoforder processing structure");
2127 		return NULL;
2128 	}
2129 	memset(req, 0, sizeof(*req));
2130 	req->spool_buffer = spoolbuf;
2131 	return req;
2132 }
2133 
/**
 * Clear and free a tcp request info structure.
 * @param req: structure to delete, NULL is allowed and ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		/* Only the structure itself is freed.  The cp member points
		 * back to the commpoint that owns this structure and that
		 * called delete on it, and the spool_buffer is the shared
		 * udp buffer that is not deleted here. */
		free(req);
	}
}
2144 
tcp_req_info_clear(struct tcp_req_info * req)2145 void tcp_req_info_clear(struct tcp_req_info* req)
2146 {
2147 	struct tcp_req_open_item* open, *nopen;
2148 	struct tcp_req_done_item* item, *nitem;
2149 	if(!req) return;
2150 
2151 	/* free outstanding request mesh reply entries */
2152 	open = req->open_req_list;
2153 	while(open) {
2154 		nopen = open->next;
2155 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2156 		free(open);
2157 		open = nopen;
2158 	}
2159 	req->open_req_list = NULL;
2160 	req->num_open_req = 0;
2161 
2162 	/* free pending writable result packets */
2163 	item = req->done_req_list;
2164 	while(item) {
2165 		nitem = item->next;
2166 		lock_basic_lock(&stream_wait_count_lock);
2167 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2168 			+item->len);
2169 		lock_basic_unlock(&stream_wait_count_lock);
2170 		free(item->buf);
2171 		free(item);
2172 		item = nitem;
2173 	}
2174 	req->done_req_list = NULL;
2175 	req->num_done_req = 0;
2176 	req->read_is_closed = 0;
2177 }
2178 
2179 void
tcp_req_info_remove_mesh_state(struct tcp_req_info * req,struct mesh_state * m)2180 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2181 {
2182 	struct tcp_req_open_item* open, *prev = NULL;
2183 	if(!req || !m) return;
2184 	open = req->open_req_list;
2185 	while(open) {
2186 		if(open->mesh_state == m) {
2187 			struct tcp_req_open_item* next;
2188 			if(prev) prev->next = open->next;
2189 			else req->open_req_list = open->next;
2190 			/* caller has to manage the mesh state reply entry */
2191 			next = open->next;
2192 			free(open);
2193 			req->num_open_req --;
2194 
2195 			/* prev = prev; */
2196 			open = next;
2197 			continue;
2198 		}
2199 		prev = open;
2200 		open = open->next;
2201 	}
2202 }
2203 
2204 /** setup listening for read or write */
2205 static void
tcp_req_info_setup_listen(struct tcp_req_info * req)2206 tcp_req_info_setup_listen(struct tcp_req_info* req)
2207 {
2208 	int wr = 0;
2209 	int rd = 0;
2210 
2211 	if(req->cp->tcp_byte_count != 0) {
2212 		/* cannot change, halfway through */
2213 		return;
2214 	}
2215 
2216 	if(!req->cp->tcp_is_reading)
2217 		wr = 1;
2218 	if(!req->read_is_closed)
2219 		rd = 1;
2220 
2221 	if(wr) {
2222 		req->cp->tcp_is_reading = 0;
2223 		comm_point_stop_listening(req->cp);
2224 		comm_point_start_listening(req->cp, -1,
2225 			adjusted_tcp_timeout(req->cp));
2226 	} else if(rd) {
2227 		req->cp->tcp_is_reading = 1;
2228 		comm_point_stop_listening(req->cp);
2229 		comm_point_start_listening(req->cp, -1,
2230 			adjusted_tcp_timeout(req->cp));
2231 		/* and also read it (from SSL stack buffers), so
2232 		 * no event read event is expected since the remainder of
2233 		 * the TLS frame is sitting in the buffers. */
2234 		req->read_again = 1;
2235 	} else {
2236 		comm_point_stop_listening(req->cp);
2237 		comm_point_start_listening(req->cp, -1,
2238 			adjusted_tcp_timeout(req->cp));
2239 		comm_point_listen_for_rw(req->cp, 0, 0);
2240 	}
2241 }
2242 
2243 /** remove first item from list of pending results */
2244 static struct tcp_req_done_item*
tcp_req_info_pop_done(struct tcp_req_info * req)2245 tcp_req_info_pop_done(struct tcp_req_info* req)
2246 {
2247 	struct tcp_req_done_item* item;
2248 	log_assert(req->num_done_req > 0 && req->done_req_list);
2249 	item = req->done_req_list;
2250 	lock_basic_lock(&stream_wait_count_lock);
2251 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2252 	lock_basic_unlock(&stream_wait_count_lock);
2253 	req->done_req_list = req->done_req_list->next;
2254 	req->num_done_req --;
2255 	return item;
2256 }
2257 
2258 /** Send given buffer and setup to write */
2259 static void
tcp_req_info_start_write_buf(struct tcp_req_info * req,uint8_t * buf,size_t len)2260 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2261 	size_t len)
2262 {
2263 	sldns_buffer_clear(req->cp->buffer);
2264 	sldns_buffer_write(req->cp->buffer, buf, len);
2265 	sldns_buffer_flip(req->cp->buffer);
2266 
2267 	req->cp->tcp_is_reading = 0; /* we are now writing */
2268 }
2269 
2270 /** pick up the next result and start writing it to the channel */
2271 static void
tcp_req_pickup_next_result(struct tcp_req_info * req)2272 tcp_req_pickup_next_result(struct tcp_req_info* req)
2273 {
2274 	if(req->num_done_req > 0) {
2275 		/* unlist the done item from the list of pending results */
2276 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2277 		tcp_req_info_start_write_buf(req, item->buf, item->len);
2278 		free(item->buf);
2279 		free(item);
2280 	}
2281 }
2282 
2283 /** the read channel has closed */
2284 int
tcp_req_info_handle_read_close(struct tcp_req_info * req)2285 tcp_req_info_handle_read_close(struct tcp_req_info* req)
2286 {
2287 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2288 	/* reset byte count for (potential) partial read */
2289 	req->cp->tcp_byte_count = 0;
2290 	/* if we still have results to write, pick up next and write it */
2291 	if(req->num_done_req != 0) {
2292 		tcp_req_pickup_next_result(req);
2293 		tcp_req_info_setup_listen(req);
2294 		return 1;
2295 	}
2296 	/* if nothing to do, this closes the connection */
2297 	if(req->num_open_req == 0 && req->num_done_req == 0)
2298 		return 0;
2299 	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2300 	req->read_is_closed = 1;
2301 	tcp_req_info_setup_listen(req);
2302 	return 1;
2303 }
2304 
2305 void
tcp_req_info_handle_writedone(struct tcp_req_info * req)2306 tcp_req_info_handle_writedone(struct tcp_req_info* req)
2307 {
2308 	/* back to reading state, we finished this write event */
2309 	sldns_buffer_clear(req->cp->buffer);
2310 	if(req->num_done_req == 0 && req->read_is_closed) {
2311 		/* no more to write and nothing to read, close it */
2312 		comm_point_drop_reply(&req->cp->repinfo);
2313 		return;
2314 	}
2315 	req->cp->tcp_is_reading = 1;
2316 	/* see if another result needs writing */
2317 	tcp_req_pickup_next_result(req);
2318 
2319 	/* see if there is more to write, if not stop_listening for writing */
2320 	/* see if new requests are allowed, if so, start_listening
2321 	 * for reading */
2322 	tcp_req_info_setup_listen(req);
2323 }
2324 
2325 void
tcp_req_info_handle_readdone(struct tcp_req_info * req)2326 tcp_req_info_handle_readdone(struct tcp_req_info* req)
2327 {
2328 	struct comm_point* c = req->cp;
2329 
2330 	/* we want to read up several requests, unless there are
2331 	 * pending answers */
2332 
2333 	req->is_drop = 0;
2334 	req->is_reply = 0;
2335 	req->in_worker_handle = 1;
2336 	sldns_buffer_set_limit(req->spool_buffer, 0);
2337 	/* handle the current request */
2338 	/* this calls the worker handle request routine that could give
2339 	 * a cache response, or localdata response, or drop the reply,
2340 	 * or schedule a mesh entry for later */
2341 	fptr_ok(fptr_whitelist_comm_point(c->callback));
2342 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2343 		req->in_worker_handle = 0;
2344 		/* there is an answer, put it up.  It is already in the
2345 		 * c->buffer, just send it. */
2346 		/* since we were just reading a query, the channel is
2347 		 * clear to write to */
2348 	send_it:
2349 		c->tcp_is_reading = 0;
2350 		comm_point_stop_listening(c);
2351 		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2352 		return;
2353 	}
2354 	req->in_worker_handle = 0;
2355 	/* it should be waiting in the mesh for recursion.
2356 	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2357 	 * Then the mesh state has been cleared. */
2358 	if(req->is_drop) {
2359 		/* the reply has been dropped, stream has been closed. */
2360 		return;
2361 	}
2362 	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2363 	 * something like servfail then we pick up that reply below. */
2364 	if(req->is_reply) {
2365 		goto send_it;
2366 	}
2367 
2368 	sldns_buffer_clear(c->buffer);
2369 	/* if pending answers, pick up an answer and start sending it */
2370 	tcp_req_pickup_next_result(req);
2371 
2372 	/* if answers pending, start sending answers */
2373 	/* read more requests if we can have more requests */
2374 	tcp_req_info_setup_listen(req);
2375 }
2376 
2377 int
tcp_req_info_add_meshstate(struct tcp_req_info * req,struct mesh_area * mesh,struct mesh_state * m)2378 tcp_req_info_add_meshstate(struct tcp_req_info* req,
2379 	struct mesh_area* mesh, struct mesh_state* m)
2380 {
2381 	struct tcp_req_open_item* item;
2382 	log_assert(req && mesh && m);
2383 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2384 	if(!item) return 0;
2385 	item->next = req->open_req_list;
2386 	item->mesh = mesh;
2387 	item->mesh_state = m;
2388 	req->open_req_list = item;
2389 	req->num_open_req++;
2390 	return 1;
2391 }
2392 
2393 /** Add a result to the result list.  At the end. */
2394 static int
tcp_req_info_add_result(struct tcp_req_info * req,uint8_t * buf,size_t len)2395 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2396 {
2397 	struct tcp_req_done_item* last = NULL;
2398 	struct tcp_req_done_item* item;
2399 	size_t space;
2400 
2401 	/* see if we have space */
2402 	space = sizeof(struct tcp_req_done_item) + len;
2403 	lock_basic_lock(&stream_wait_count_lock);
2404 	if(stream_wait_count + space > stream_wait_max) {
2405 		lock_basic_unlock(&stream_wait_count_lock);
2406 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2407 		return 0;
2408 	}
2409 	stream_wait_count += space;
2410 	lock_basic_unlock(&stream_wait_count_lock);
2411 
2412 	/* find last element */
2413 	last = req->done_req_list;
2414 	while(last && last->next)
2415 		last = last->next;
2416 
2417 	/* create new element */
2418 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2419 	if(!item) {
2420 		log_err("malloc failure, for stream result list");
2421 		return 0;
2422 	}
2423 	item->next = NULL;
2424 	item->len = len;
2425 	item->buf = memdup(buf, len);
2426 	if(!item->buf) {
2427 		free(item);
2428 		log_err("malloc failure, adding reply to stream result list");
2429 		return 0;
2430 	}
2431 
2432 	/* link in */
2433 	if(last) last->next = item;
2434 	else req->done_req_list = item;
2435 	req->num_done_req++;
2436 	return 1;
2437 }
2438 
2439 void
tcp_req_info_send_reply(struct tcp_req_info * req)2440 tcp_req_info_send_reply(struct tcp_req_info* req)
2441 {
2442 	if(req->in_worker_handle) {
2443 		/* reply from mesh is in the spool_buffer */
2444 		/* copy now, so that the spool buffer is free for other tasks
2445 		 * before the callback is done */
2446 		sldns_buffer_clear(req->cp->buffer);
2447 		sldns_buffer_write(req->cp->buffer,
2448 			sldns_buffer_begin(req->spool_buffer),
2449 			sldns_buffer_limit(req->spool_buffer));
2450 		sldns_buffer_flip(req->cp->buffer);
2451 		req->is_reply = 1;
2452 		return;
2453 	}
2454 	/* now that the query has been handled, that mesh_reply entry
2455 	 * should be removed, from the tcp_req_info list,
2456 	 * the mesh state cleanup removes then with region_cleanup and
2457 	 * replies_sent true. */
2458 	/* see if we can send it straight away (we are not doing
2459 	 * anything else).  If so, copy to buffer and start */
2460 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2461 		/* buffer is free, and was ready to read new query into,
2462 		 * but we are now going to use it to send this answer */
2463 		tcp_req_info_start_write_buf(req,
2464 			sldns_buffer_begin(req->spool_buffer),
2465 			sldns_buffer_limit(req->spool_buffer));
2466 		/* switch to listen to write events */
2467 		comm_point_stop_listening(req->cp);
2468 		comm_point_start_listening(req->cp, -1,
2469 			adjusted_tcp_timeout(req->cp));
2470 		return;
2471 	}
2472 	/* queue up the answer behind the others already pending */
2473 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2474 		sldns_buffer_limit(req->spool_buffer))) {
2475 		/* drop the connection, we are out of resources */
2476 		comm_point_drop_reply(&req->cp->repinfo);
2477 	}
2478 }
2479 
tcp_req_info_get_stream_buffer_size(void)2480 size_t tcp_req_info_get_stream_buffer_size(void)
2481 {
2482 	size_t s;
2483 	if(!stream_wait_lock_inited)
2484 		return stream_wait_count;
2485 	lock_basic_lock(&stream_wait_count_lock);
2486 	s = stream_wait_count;
2487 	lock_basic_unlock(&stream_wait_count_lock);
2488 	return s;
2489 }
2490 
http2_get_query_buffer_size(void)2491 size_t http2_get_query_buffer_size(void)
2492 {
2493 	size_t s;
2494 	if(!http2_query_buffer_lock_inited)
2495 		return http2_query_buffer_count;
2496 	lock_basic_lock(&http2_query_buffer_count_lock);
2497 	s = http2_query_buffer_count;
2498 	lock_basic_unlock(&http2_query_buffer_count_lock);
2499 	return s;
2500 }
2501 
http2_get_response_buffer_size(void)2502 size_t http2_get_response_buffer_size(void)
2503 {
2504 	size_t s;
2505 	if(!http2_response_buffer_lock_inited)
2506 		return http2_response_buffer_count;
2507 	lock_basic_lock(&http2_response_buffer_count_lock);
2508 	s = http2_response_buffer_count;
2509 	lock_basic_unlock(&http2_response_buffer_count_lock);
2510 	return s;
2511 }
2512 
2513 #ifdef HAVE_NGHTTP2
2514 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
http2_submit_response_read_callback(nghttp2_session * ATTR_UNUSED (session),int32_t stream_id,uint8_t * buf,size_t length,uint32_t * data_flags,nghttp2_data_source * source,void * ATTR_UNUSED (cb_arg))2515 static ssize_t http2_submit_response_read_callback(
2516 	nghttp2_session* ATTR_UNUSED(session),
2517 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2518 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2519 {
2520 	struct http2_stream* h2_stream;
2521 	struct http2_session* h2_session = source->ptr;
2522 	size_t copylen = length;
2523 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2524 		h2_session->session, stream_id))) {
2525 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2526 			"stream");
2527 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2528 	}
2529 	if(!h2_stream->rbuffer ||
2530 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2531 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2532 			"available in rbuffer");
2533 		/* rbuffer will be free'd in frame close cb */
2534 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2535 	}
2536 
2537 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2538 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2539 	if(copylen > SSIZE_MAX)
2540 		copylen = SSIZE_MAX; /* will probably never happen */
2541 
2542 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2543 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2544 
2545 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2546 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2547 		lock_basic_lock(&http2_response_buffer_count_lock);
2548 		http2_response_buffer_count -=
2549 			sldns_buffer_capacity(h2_stream->rbuffer);
2550 		lock_basic_unlock(&http2_response_buffer_count_lock);
2551 		sldns_buffer_free(h2_stream->rbuffer);
2552 		h2_stream->rbuffer = NULL;
2553 	}
2554 
2555 	return copylen;
2556 }
2557 
2558 /**
2559  * Send RST_STREAM frame for stream.
2560  * @param h2_session: http2 session to submit frame to
2561  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2562  * @return 0 on error, 1 otherwise
2563  */
http2_submit_rst_stream(struct http2_session * h2_session,struct http2_stream * h2_stream)2564 static int http2_submit_rst_stream(struct http2_session* h2_session,
2565 		struct http2_stream* h2_stream)
2566 {
2567 	int ret = nghttp2_submit_rst_stream(h2_session->session,
2568 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2569 		NGHTTP2_INTERNAL_ERROR);
2570 	if(ret) {
2571 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2572 			"error: %s", nghttp2_strerror(ret));
2573 		return 0;
2574 	}
2575 	return 1;
2576 }
2577 
2578 /**
2579  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2580  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2581  * might be used before this will be sent out.
2582  * @param h2_session: http2 session, containing c->buffer which contains answer
2583  * @return 0 on error, 1 otherwise
2584  */
http2_submit_dns_response(struct http2_session * h2_session)2585 int http2_submit_dns_response(struct http2_session* h2_session)
2586 {
2587 	int ret;
2588 	nghttp2_data_provider data_prd;
2589 	char status[4];
2590 	nghttp2_nv headers[3];
2591 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2592 	size_t rlen;
2593 	char rlen_str[32];
2594 
2595 	if(h2_stream->rbuffer) {
2596 		log_err("http2 submit response error: rbuffer already "
2597 			"exists");
2598 		return 0;
2599 	}
2600 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2601 		log_err("http2 submit response error: c->buffer not complete");
2602 		return 0;
2603 	}
2604 
2605 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2606 		verbose(VERB_QUERY, "http2: submit response error: "
2607 			"invalid status");
2608 		return 0;
2609 	}
2610 
2611 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2612 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2613 
2614 	lock_basic_lock(&http2_response_buffer_count_lock);
2615 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2616 		lock_basic_unlock(&http2_response_buffer_count_lock);
2617 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2618 			"in https-response-buffer-size");
2619 		return http2_submit_rst_stream(h2_session, h2_stream);
2620 	}
2621 	http2_response_buffer_count += rlen;
2622 	lock_basic_unlock(&http2_response_buffer_count_lock);
2623 
2624 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2625 		lock_basic_lock(&http2_response_buffer_count_lock);
2626 		http2_response_buffer_count -= rlen;
2627 		lock_basic_unlock(&http2_response_buffer_count_lock);
2628 		log_err("http2 submit response error: malloc failure");
2629 		return 0;
2630 	}
2631 
2632 	headers[0].name = (uint8_t*)":status";
2633 	headers[0].namelen = 7;
2634 	headers[0].value = (uint8_t*)status;
2635 	headers[0].valuelen = 3;
2636 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2637 
2638 	headers[1].name = (uint8_t*)"content-type";
2639 	headers[1].namelen = 12;
2640 	headers[1].value = (uint8_t*)"application/dns-message";
2641 	headers[1].valuelen = 23;
2642 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2643 
2644 	headers[2].name = (uint8_t*)"content-length";
2645 	headers[2].namelen = 14;
2646 	headers[2].value = (uint8_t*)rlen_str;
2647 	headers[2].valuelen = strlen(rlen_str);
2648 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2649 
2650 	sldns_buffer_write(h2_stream->rbuffer,
2651 		sldns_buffer_current(h2_session->c->buffer),
2652 		sldns_buffer_remaining(h2_session->c->buffer));
2653 	sldns_buffer_flip(h2_stream->rbuffer);
2654 
2655 	data_prd.source.ptr = h2_session;
2656 	data_prd.read_callback = http2_submit_response_read_callback;
2657 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2658 		headers, 3, &data_prd);
2659 	if(ret) {
2660 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2661 			"error: %s", nghttp2_strerror(ret));
2662 		return 0;
2663 	}
2664 	return 1;
2665 }
2666 #else
/** version for builds without nghttp2: nothing is submitted, and 0 is
 * always returned */
int
http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	return 0;
}
2671 #endif
2672 
2673 #ifdef HAVE_NGHTTP2
2674 /** HTTP status to descriptive string */
http_status_to_str(enum http_status s)2675 static char* http_status_to_str(enum http_status s)
2676 {
2677 	switch(s) {
2678 		case HTTP_STATUS_OK:
2679 			return "OK";
2680 		case HTTP_STATUS_BAD_REQUEST:
2681 			return "Bad Request";
2682 		case HTTP_STATUS_NOT_FOUND:
2683 			return "Not Found";
2684 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2685 			return "Payload Too Large";
2686 		case HTTP_STATUS_URI_TOO_LONG:
2687 			return "URI Too Long";
2688 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2689 			return "Unsupported Media Type";
2690 		case HTTP_STATUS_NOT_IMPLEMENTED:
2691 			return "Not Implemented";
2692 	}
2693 	return "Status Unknown";
2694 }
2695 
2696 /** nghttp2 callback. Used to copy error message to nghttp2 session */
http2_submit_error_read_callback(nghttp2_session * ATTR_UNUSED (session),int32_t stream_id,uint8_t * buf,size_t length,uint32_t * data_flags,nghttp2_data_source * source,void * ATTR_UNUSED (cb_arg))2697 static ssize_t http2_submit_error_read_callback(
2698 	nghttp2_session* ATTR_UNUSED(session),
2699 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2700 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2701 {
2702 	struct http2_stream* h2_stream;
2703 	struct http2_session* h2_session = source->ptr;
2704 	char* msg;
2705 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2706 		h2_session->session, stream_id))) {
2707 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2708 			"stream");
2709 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2710 	}
2711 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2712 	msg = http_status_to_str(h2_stream->status);
2713 	if(length < strlen(msg))
2714 		return 0; /* not worth trying over multiple frames */
2715 	memcpy(buf, msg, strlen(msg));
2716 	return strlen(msg);
2717 
2718 }
2719 
2720 /**
2721  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2722  * sending out. Message body will contain descriptive string for HTTP status.
2723  * @param h2_session: http2 session to submit to
2724  * @param h2_stream: http2 stream containing HTTP status to use for error
2725  * @return 0 on error, 1 otherwise
2726  */
http2_submit_error(struct http2_session * h2_session,struct http2_stream * h2_stream)2727 static int http2_submit_error(struct http2_session* h2_session,
2728 	struct http2_stream* h2_stream)
2729 {
2730 	int ret;
2731 	char status[4];
2732 	nghttp2_data_provider data_prd;
2733 	nghttp2_nv headers[1]; /* will be copied by nghttp */
2734 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2735 		verbose(VERB_QUERY, "http2: submit error failed, "
2736 			"invalid status");
2737 		return 0;
2738 	}
2739 	headers[0].name = (uint8_t*)":status";
2740 	headers[0].namelen = 7;
2741 	headers[0].value = (uint8_t*)status;
2742 	headers[0].valuelen = 3;
2743 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2744 
2745 	data_prd.source.ptr = h2_session;
2746 	data_prd.read_callback = http2_submit_error_read_callback;
2747 
2748 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2749 		headers, 1, &data_prd);
2750 	if(ret) {
2751 		verbose(VERB_QUERY, "http2: submit error failed, "
2752 			"error: %s", nghttp2_strerror(ret));
2753 		return 0;
2754 	}
2755 	return 1;
2756 }
2757 
2758 /**
2759  * Start query handling. Query is stored in the stream, and will be free'd here.
2760  * @param h2_session: http2 session, containing comm point
2761  * @param h2_stream: stream containing buffered query
2762  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2763  * reply available (yet).
2764  */
http2_query_read_done(struct http2_session * h2_session,struct http2_stream * h2_stream)2765 static int http2_query_read_done(struct http2_session* h2_session,
2766 	struct http2_stream* h2_stream)
2767 {
2768 	log_assert(h2_stream->qbuffer);
2769 
2770 	if(h2_session->c->h2_stream) {
2771 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2772 			"buffer already assigned to stream");
2773 		return -1;
2774 	}
2775 
2776     /* the c->buffer might be used by mesh_send_reply and no be cleard
2777 	 * need to be cleared before use */
2778 	sldns_buffer_clear(h2_session->c->buffer);
2779 	if(sldns_buffer_remaining(h2_session->c->buffer) <
2780 		sldns_buffer_remaining(h2_stream->qbuffer)) {
2781 		/* qbuffer will be free'd in frame close cb */
2782 		sldns_buffer_clear(h2_session->c->buffer);
2783 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2784 			"qbuffer in c->buffer");
2785 		return -1;
2786 	}
2787 
2788 	sldns_buffer_write(h2_session->c->buffer,
2789 		sldns_buffer_current(h2_stream->qbuffer),
2790 		sldns_buffer_remaining(h2_stream->qbuffer));
2791 
2792 	lock_basic_lock(&http2_query_buffer_count_lock);
2793 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2794 	lock_basic_unlock(&http2_query_buffer_count_lock);
2795 	sldns_buffer_free(h2_stream->qbuffer);
2796 	h2_stream->qbuffer = NULL;
2797 
2798 	sldns_buffer_flip(h2_session->c->buffer);
2799 	h2_session->c->h2_stream = h2_stream;
2800 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2801 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2802 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2803 		return 1; /* answer in c->buffer */
2804 	}
2805 	sldns_buffer_clear(h2_session->c->buffer);
2806 	h2_session->c->h2_stream = NULL;
2807 	return 0; /* mesh state added, or dropped */
2808 }
2809 
2810 /** nghttp2 callback. Used to check if the received frame indicates the end of a
2811  * stream. Gather collected request data and start query handling. */
http2_req_frame_recv_cb(nghttp2_session * session,const nghttp2_frame * frame,void * cb_arg)2812 static int http2_req_frame_recv_cb(nghttp2_session* session,
2813 	const nghttp2_frame* frame, void* cb_arg)
2814 {
2815 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2816 	struct http2_stream* h2_stream;
2817 	int query_read_done;
2818 
2819 	if((frame->hd.type != NGHTTP2_DATA &&
2820 		frame->hd.type != NGHTTP2_HEADERS) ||
2821 		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
2822 			return 0;
2823 	}
2824 
2825 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2826 		session, frame->hd.stream_id)))
2827 		return 0;
2828 
2829 	if(h2_stream->invalid_endpoint) {
2830 		h2_stream->status = HTTP_STATUS_NOT_FOUND;
2831 		goto submit_http_error;
2832 	}
2833 
2834 	if(h2_stream->invalid_content_type) {
2835 		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
2836 		goto submit_http_error;
2837 	}
2838 
2839 	if(h2_stream->http_method != HTTP_METHOD_GET &&
2840 		h2_stream->http_method != HTTP_METHOD_POST) {
2841 		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
2842 		goto submit_http_error;
2843 	}
2844 
2845 	if(h2_stream->query_too_large) {
2846 		if(h2_stream->http_method == HTTP_METHOD_POST)
2847 			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
2848 		else
2849 			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
2850 		goto submit_http_error;
2851 	}
2852 
2853 	if(!h2_stream->qbuffer) {
2854 		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
2855 		goto submit_http_error;
2856 	}
2857 
2858 	if(h2_stream->status) {
2859 submit_http_error:
2860 		verbose(VERB_QUERY, "http2 request invalid, returning :status="
2861 			"%d", h2_stream->status);
2862 		if(!http2_submit_error(h2_session, h2_stream)) {
2863 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2864 		}
2865 		return 0;
2866 	}
2867 	h2_stream->status = HTTP_STATUS_OK;
2868 
2869 	sldns_buffer_flip(h2_stream->qbuffer);
2870 	h2_session->postpone_drop = 1;
2871 	query_read_done = http2_query_read_done(h2_session, h2_stream);
2872 	if(query_read_done < 0)
2873 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2874 	else if(!query_read_done) {
2875 		if(h2_session->is_drop) {
2876 			/* connection needs to be closed. Return failure to make
2877 			 * sure no other action are taken anymore on comm point.
2878 			 * failure will result in reclaiming (and closing)
2879 			 * of comm point. */
2880 			verbose(VERB_QUERY, "http2 query dropped in worker cb");
2881 			h2_session->postpone_drop = 0;
2882 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2883 		}
2884 		/* nothing to submit right now, query added to mesh. */
2885 		h2_session->postpone_drop = 0;
2886 		return 0;
2887 	}
2888 	if(!http2_submit_dns_response(h2_session)) {
2889 		sldns_buffer_clear(h2_session->c->buffer);
2890 		h2_session->c->h2_stream = NULL;
2891 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2892 	}
2893 	verbose(VERB_QUERY, "http2 query submitted to session");
2894 	sldns_buffer_clear(h2_session->c->buffer);
2895 	h2_session->c->h2_stream = NULL;
2896 	return 0;
2897 }
2898 
2899 /** nghttp2 callback. Used to detect start of new streams. */
http2_req_begin_headers_cb(nghttp2_session * session,const nghttp2_frame * frame,void * cb_arg)2900 static int http2_req_begin_headers_cb(nghttp2_session* session,
2901 	const nghttp2_frame* frame, void* cb_arg)
2902 {
2903 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2904 	struct http2_stream* h2_stream;
2905 	int ret;
2906 	if(frame->hd.type != NGHTTP2_HEADERS ||
2907 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2908 		/* only interested in request headers */
2909 		return 0;
2910 	}
2911 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2912 		log_err("malloc failure while creating http2 stream");
2913 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2914 	}
2915 	http2_session_add_stream(h2_session, h2_stream);
2916 	ret = nghttp2_session_set_stream_user_data(session,
2917 		frame->hd.stream_id, h2_stream);
2918 	if(ret) {
2919 		/* stream does not exist */
2920 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2921 			"error: %s", nghttp2_strerror(ret));
2922 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2923 	}
2924 
2925 	return 0;
2926 }
2927 
2928 /**
2929  * base64url decode, store in qbuffer
2930  * @param h2_session: http2 session
2931  * @param h2_stream: http2 stream
2932  * @param start: start of the base64 string
2933  * @param length: length of the base64 string
2934  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2935  * buffer will be NULL is unparseble.
2936  */
http2_buffer_uri_query(struct http2_session * h2_session,struct http2_stream * h2_stream,const uint8_t * start,size_t length)2937 static int http2_buffer_uri_query(struct http2_session* h2_session,
2938 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2939 {
2940 	size_t expectb64len;
2941 	int b64len;
2942 	if(h2_stream->http_method == HTTP_METHOD_POST)
2943 		return 1;
2944 	if(length == 0)
2945 		return 1;
2946 	if(h2_stream->qbuffer) {
2947 		verbose(VERB_ALGO, "http2_req_header fail, "
2948 			"qbuffer already set");
2949 		return 0;
2950 	}
2951 
2952 	/* calculate size, might be a bit bigger than the real
2953 	 * decoded buffer size */
2954 	expectb64len = sldns_b64_pton_calculate_size(length);
2955 	log_assert(expectb64len > 0);
2956 	if(expectb64len >
2957 		h2_session->c->http2_stream_max_qbuffer_size) {
2958 		h2_stream->query_too_large = 1;
2959 		return 1;
2960 	}
2961 
2962 	lock_basic_lock(&http2_query_buffer_count_lock);
2963 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2964 		lock_basic_unlock(&http2_query_buffer_count_lock);
2965 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2966 			"in http2-query-buffer-size");
2967 		return http2_submit_rst_stream(h2_session, h2_stream);
2968 	}
2969 	http2_query_buffer_count += expectb64len;
2970 	lock_basic_unlock(&http2_query_buffer_count_lock);
2971 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2972 		lock_basic_lock(&http2_query_buffer_count_lock);
2973 		http2_query_buffer_count -= expectb64len;
2974 		lock_basic_unlock(&http2_query_buffer_count_lock);
2975 		log_err("http2_req_header fail, qbuffer "
2976 			"malloc failure");
2977 		return 0;
2978 	}
2979 
2980 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2981 		char buf[65536+4];
2982 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2983 		/* copy to the scratch buffer temporarily to terminate the
2984 		 * string with a zero */
2985 		if(length+1 > sizeof(buf)) {
2986 			/* too long */
2987 			lock_basic_lock(&http2_query_buffer_count_lock);
2988 			http2_query_buffer_count -= expectb64len;
2989 			lock_basic_unlock(&http2_query_buffer_count_lock);
2990 			sldns_buffer_free(h2_stream->qbuffer);
2991 			h2_stream->qbuffer = NULL;
2992 			return 1;
2993 		}
2994 		memmove(buf, start, length);
2995 		buf[length] = 0;
2996 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2997 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2998 			lock_basic_lock(&http2_query_buffer_count_lock);
2999 			http2_query_buffer_count -= expectb64len;
3000 			lock_basic_unlock(&http2_query_buffer_count_lock);
3001 			sldns_buffer_free(h2_stream->qbuffer);
3002 			h2_stream->qbuffer = NULL;
3003 			return 1;
3004 		}
3005 	} else {
3006 		if(!(b64len = sldns_b64url_pton(
3007 			(char const *)start, length,
3008 			sldns_buffer_current(h2_stream->qbuffer),
3009 			expectb64len)) || b64len < 0) {
3010 			lock_basic_lock(&http2_query_buffer_count_lock);
3011 			http2_query_buffer_count -= expectb64len;
3012 			lock_basic_unlock(&http2_query_buffer_count_lock);
3013 			sldns_buffer_free(h2_stream->qbuffer);
3014 			h2_stream->qbuffer = NULL;
3015 			/* return without error, method can be an
3016 			 * unknown POST */
3017 			return 1;
3018 		}
3019 	}
3020 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
3021 	return 1;
3022 }
3023 
3024 /** nghttp2 callback. Used to parse headers from HEADER frames. */
http2_req_header_cb(nghttp2_session * session,const nghttp2_frame * frame,const uint8_t * name,size_t namelen,const uint8_t * value,size_t valuelen,uint8_t ATTR_UNUSED (flags),void * cb_arg)3025 static int http2_req_header_cb(nghttp2_session* session,
3026 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
3027 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
3028 	void* cb_arg)
3029 {
3030 	struct http2_stream* h2_stream = NULL;
3031 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3032 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
3033 	 * the HEADER */
3034 	if(frame->hd.type != NGHTTP2_HEADERS ||
3035 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
3036 		/* only interested in request headers */
3037 		return 0;
3038 	}
3039 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
3040 		frame->hd.stream_id)))
3041 		return 0;
3042 
3043 	/* earlier checks already indicate we can stop handling this query */
3044 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
3045 		h2_stream->invalid_content_type ||
3046 		h2_stream->invalid_endpoint)
3047 		return 0;
3048 
3049 
3050 	/* nghttp2 performs some sanity checks in the headers, including:
3051 	 * name and value are guaranteed to be null terminated
3052 	 * name is guaranteed to be lowercase
3053 	 * content-length value is guaranteed to contain digits
3054 	 */
3055 
3056 	if(!h2_stream->http_method && namelen == 7 &&
3057 		memcmp(":method", name, namelen) == 0) {
3058 		/* Case insensitive check on :method value to be on the safe
3059 		 * side. I failed to find text about case sensitivity in specs.
3060 		 */
3061 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
3062 			h2_stream->http_method = HTTP_METHOD_GET;
3063 		else if(valuelen == 4 &&
3064 			strcasecmp("POST", (const char*)value) == 0) {
3065 			h2_stream->http_method = HTTP_METHOD_POST;
3066 			if(h2_stream->qbuffer) {
3067 				/* POST method uses query from DATA frames */
3068 				lock_basic_lock(&http2_query_buffer_count_lock);
3069 				http2_query_buffer_count -=
3070 					sldns_buffer_capacity(h2_stream->qbuffer);
3071 				lock_basic_unlock(&http2_query_buffer_count_lock);
3072 				sldns_buffer_free(h2_stream->qbuffer);
3073 				h2_stream->qbuffer = NULL;
3074 			}
3075 		} else
3076 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
3077 		return 0;
3078 	}
3079 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
3080 		/* :path may contain DNS query, depending on method. Method might
3081 		 * not be known yet here, so check after finishing receiving
3082 		 * stream. */
3083 #define	HTTP_QUERY_PARAM "?dns="
3084 		size_t el = strlen(h2_session->c->http_endpoint);
3085 		size_t qpl = strlen(HTTP_QUERY_PARAM);
3086 
3087 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
3088 			value, el) != 0) {
3089 			h2_stream->invalid_endpoint = 1;
3090 			return 0;
3091 		}
3092 		/* larger than endpoint only allowed if it is for the query
3093 		 * parameter */
3094 		if(valuelen <= el+qpl ||
3095 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
3096 			if(valuelen != el)
3097 				h2_stream->invalid_endpoint = 1;
3098 			return 0;
3099 		}
3100 
3101 		if(!http2_buffer_uri_query(h2_session, h2_stream,
3102 			value+(el+qpl), valuelen-(el+qpl))) {
3103 			return NGHTTP2_ERR_CALLBACK_FAILURE;
3104 		}
3105 		return 0;
3106 	}
3107 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
3108 	 * and not needed when using GET. Don't enfore.
3109 	 * If set only allow lowercase "application/dns-message".
3110 	 *
3111 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
3112 	 * be able to handle "application/dns-message". Since that is the only
3113 	 * content-type supported we can ignore the accept header.
3114 	 */
3115 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
3116 		if(valuelen != 23 || memcmp("application/dns-message", value,
3117 			valuelen) != 0) {
3118 			h2_stream->invalid_content_type = 1;
3119 		}
3120 	}
3121 
3122 	/* Only interested in content-lentg for POST (on not yet known) method.
3123 	 */
3124 	if((!h2_stream->http_method ||
3125 		h2_stream->http_method == HTTP_METHOD_POST) &&
3126 		!h2_stream->content_length && namelen  == 14 &&
3127 		memcmp("content-length", name, namelen) == 0) {
3128 		if(valuelen > 5) {
3129 			h2_stream->query_too_large = 1;
3130 			return 0;
3131 		}
3132 		/* guaranteed to only contain digits and be null terminated */
3133 		h2_stream->content_length = atoi((const char*)value);
3134 		if(h2_stream->content_length >
3135 			h2_session->c->http2_stream_max_qbuffer_size) {
3136 			h2_stream->query_too_large = 1;
3137 			return 0;
3138 		}
3139 	}
3140 	return 0;
3141 }
3142 
3143 /** nghttp2 callback. Used to get data from DATA frames, which can contain
3144  * queries in POST requests. */
http2_req_data_chunk_recv_cb(nghttp2_session * ATTR_UNUSED (session),uint8_t ATTR_UNUSED (flags),int32_t stream_id,const uint8_t * data,size_t len,void * cb_arg)3145 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3146 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3147 	size_t len, void* cb_arg)
3148 {
3149 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3150 	struct http2_stream* h2_stream;
3151 	size_t qlen = 0;
3152 
3153 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3154 		h2_session->session, stream_id))) {
3155 		return 0;
3156 	}
3157 
3158 	if(h2_stream->query_too_large)
3159 		return 0;
3160 
3161 	if(!h2_stream->qbuffer) {
3162 		if(h2_stream->content_length) {
3163 			if(h2_stream->content_length < len)
3164 				/* getting more data in DATA frame than
3165 				 * advertised in content-length header. */
3166 				return NGHTTP2_ERR_CALLBACK_FAILURE;
3167 			qlen = h2_stream->content_length;
3168 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3169 			/* setting this to msg-buffer-size can result in a lot
3170 			 * of memory consuption. Most queries should fit in a
3171 			 * single DATA frame, and most POST queries will
3172 			 * contain content-length which does not impose this
3173 			 * limit. */
3174 			qlen = len;
3175 		}
3176 	}
3177 	if(!h2_stream->qbuffer && qlen) {
3178 		lock_basic_lock(&http2_query_buffer_count_lock);
3179 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3180 			lock_basic_unlock(&http2_query_buffer_count_lock);
3181 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3182 				"in http2-query-buffer-size");
3183 			return http2_submit_rst_stream(h2_session, h2_stream);
3184 		}
3185 		http2_query_buffer_count += qlen;
3186 		lock_basic_unlock(&http2_query_buffer_count_lock);
3187 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3188 			lock_basic_lock(&http2_query_buffer_count_lock);
3189 			http2_query_buffer_count -= qlen;
3190 			lock_basic_unlock(&http2_query_buffer_count_lock);
3191 		}
3192 	}
3193 
3194 	if(!h2_stream->qbuffer ||
3195 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3196 		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3197 			"buffer space for POST query. Can happen on multi "
3198 			"frame requests without content-length header");
3199 		h2_stream->query_too_large = 1;
3200 		return 0;
3201 	}
3202 
3203 	sldns_buffer_write(h2_stream->qbuffer, data, len);
3204 
3205 	return 0;
3206 }
3207 
http2_req_stream_clear(struct http2_stream * h2_stream)3208 void http2_req_stream_clear(struct http2_stream* h2_stream)
3209 {
3210 	if(h2_stream->qbuffer) {
3211 		lock_basic_lock(&http2_query_buffer_count_lock);
3212 		http2_query_buffer_count -=
3213 			sldns_buffer_capacity(h2_stream->qbuffer);
3214 		lock_basic_unlock(&http2_query_buffer_count_lock);
3215 		sldns_buffer_free(h2_stream->qbuffer);
3216 		h2_stream->qbuffer = NULL;
3217 	}
3218 	if(h2_stream->rbuffer) {
3219 		lock_basic_lock(&http2_response_buffer_count_lock);
3220 		http2_response_buffer_count -=
3221 			sldns_buffer_capacity(h2_stream->rbuffer);
3222 		lock_basic_unlock(&http2_response_buffer_count_lock);
3223 		sldns_buffer_free(h2_stream->rbuffer);
3224 		h2_stream->rbuffer = NULL;
3225 	}
3226 }
3227 
http2_req_callbacks_create(void)3228 nghttp2_session_callbacks* http2_req_callbacks_create(void)
3229 {
3230 	nghttp2_session_callbacks *callbacks;
3231 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3232 		log_err("failed to initialize nghttp2 callback");
3233 		return NULL;
3234 	}
3235 	/* reception of header block started, used to create h2_stream */
3236 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3237 		http2_req_begin_headers_cb);
3238 	/* complete frame received, used to get data from stream if frame
3239 	 * has end stream flag, and start processing query */
3240 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3241 		http2_req_frame_recv_cb);
3242 	/* get request info from headers */
3243 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3244 		http2_req_header_cb);
3245 	/* get data from DATA frames, containing POST query */
3246 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3247 		http2_req_data_chunk_recv_cb);
3248 
3249 	/* generic HTTP2 callbacks */
3250 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3251 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3252 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3253 		http2_stream_close_cb);
3254 
3255 	return callbacks;
3256 }
3257 #endif /* HAVE_NGHTTP2 */
3258 
3259 #ifdef HAVE_NGTCP2
3260 struct doq_table*
doq_table_create(struct config_file * cfg,struct ub_randstate * rnd)3261 doq_table_create(struct config_file* cfg, struct ub_randstate* rnd)
3262 {
3263 	struct doq_table* table = calloc(1, sizeof(*table));
3264 	if(!table)
3265 		return NULL;
3266 	table->idle_timeout = ((uint64_t)cfg->tcp_idle_timeout)*
3267 		NGTCP2_MILLISECONDS;
3268 	table->sv_scidlen = 16;
3269 	table->static_secret_len = 16;
3270 	table->static_secret = malloc(table->static_secret_len);
3271 	if(!table->static_secret) {
3272 		free(table);
3273 		return NULL;
3274 	}
3275 	doq_fill_rand(rnd, table->static_secret, table->static_secret_len);
3276 	table->conn_tree = rbtree_create(doq_conn_cmp);
3277 	if(!table->conn_tree) {
3278 		free(table->static_secret);
3279 		free(table);
3280 		return NULL;
3281 	}
3282 	table->conid_tree = rbtree_create(doq_conid_cmp);
3283 	if(!table->conid_tree) {
3284 		free(table->static_secret);
3285 		free(table->conn_tree);
3286 		free(table);
3287 		return NULL;
3288 	}
3289 	table->timer_tree = rbtree_create(doq_timer_cmp);
3290 	if(!table->timer_tree) {
3291 		free(table->static_secret);
3292 		free(table->conn_tree);
3293 		free(table->conid_tree);
3294 		free(table);
3295 		return NULL;
3296 	}
3297 	lock_rw_init(&table->lock);
3298 	lock_rw_init(&table->conid_lock);
3299 	lock_basic_init(&table->size_lock);
3300 	lock_protect(&table->lock, &table->static_secret,
3301 		sizeof(table->static_secret));
3302 	lock_protect(&table->lock, &table->static_secret_len,
3303 		sizeof(table->static_secret_len));
3304 	lock_protect(&table->lock, table->static_secret,
3305 		table->static_secret_len);
3306 	lock_protect(&table->lock, &table->sv_scidlen,
3307 		sizeof(table->sv_scidlen));
3308 	lock_protect(&table->lock, &table->idle_timeout,
3309 		sizeof(table->idle_timeout));
3310 	lock_protect(&table->lock, &table->conn_tree, sizeof(table->conn_tree));
3311 	lock_protect(&table->lock, table->conn_tree, sizeof(*table->conn_tree));
3312 	lock_protect(&table->conid_lock, table->conid_tree,
3313 		sizeof(*table->conid_tree));
3314 	lock_protect(&table->lock, table->timer_tree,
3315 		sizeof(*table->timer_tree));
3316 	lock_protect(&table->size_lock, &table->current_size,
3317 		sizeof(table->current_size));
3318 	return table;
3319 }
3320 
3321 /** delete elements from the connection tree */
3322 static void
conn_tree_del(rbnode_type * node,void * arg)3323 conn_tree_del(rbnode_type* node, void* arg)
3324 {
3325 	struct doq_table* table = (struct doq_table*)arg;
3326 	struct doq_conn* conn;
3327 	if(!node)
3328 		return;
3329 	conn = (struct doq_conn*)node->key;
3330 	if(conn->timer.timer_in_list) {
3331 		/* Remove timer from list first, because finding the rbnode
3332 		 * element of the setlist of same timeouts needs tree lookup.
3333 		 * Edit the tree structure after that lookup. */
3334 		doq_timer_list_remove(conn->table, &conn->timer);
3335 	}
3336 	if(conn->timer.timer_in_tree)
3337 		doq_timer_tree_remove(conn->table, &conn->timer);
3338 	doq_table_quic_size_subtract(table, sizeof(*conn)+conn->key.dcidlen);
3339 	doq_conn_delete(conn, table);
3340 }
3341 
3342 /** delete elements from the connection id tree */
3343 static void
conid_tree_del(rbnode_type * node,void * ATTR_UNUSED (arg))3344 conid_tree_del(rbnode_type* node, void* ATTR_UNUSED(arg))
3345 {
3346 	if(!node)
3347 		return;
3348 	doq_conid_delete((struct doq_conid*)node->key);
3349 }
3350 
3351 void
doq_table_delete(struct doq_table * table)3352 doq_table_delete(struct doq_table* table)
3353 {
3354 	if(!table)
3355 		return;
3356 	lock_rw_destroy(&table->lock);
3357 	free(table->static_secret);
3358 	if(table->conn_tree) {
3359 		traverse_postorder(table->conn_tree, conn_tree_del, table);
3360 		free(table->conn_tree);
3361 	}
3362 	lock_rw_destroy(&table->conid_lock);
3363 	if(table->conid_tree) {
3364 		/* The tree should be empty, because the doq_conn_delete calls
3365 		 * above should have also removed their conid elements. */
3366 		traverse_postorder(table->conid_tree, conid_tree_del, NULL);
3367 		free(table->conid_tree);
3368 	}
3369 	lock_basic_destroy(&table->size_lock);
3370 	if(table->timer_tree) {
3371 		/* The tree should be empty, because the conn_tree_del calls
3372 		 * above should also have removed them. Also the doq_timer
3373 		 * is part of the doq_conn struct, so is already freed. */
3374 		free(table->timer_tree);
3375 	}
3376 	table->write_list_first = NULL;
3377 	table->write_list_last = NULL;
3378 	free(table);
3379 }
3380 
3381 struct doq_timer*
doq_timer_find_time(struct doq_table * table,struct timeval * tv)3382 doq_timer_find_time(struct doq_table* table, struct timeval* tv)
3383 {
3384 	struct doq_timer key;
3385 	struct rbnode_type* node;
3386 	memset(&key, 0, sizeof(key));
3387 	key.time.tv_sec = tv->tv_sec;
3388 	key.time.tv_usec = tv->tv_usec;
3389 	node = rbtree_search(table->timer_tree, &key);
3390 	if(node)
3391 		return (struct doq_timer*)node->key;
3392 	return NULL;
3393 }
3394 
3395 void
doq_timer_tree_remove(struct doq_table * table,struct doq_timer * timer)3396 doq_timer_tree_remove(struct doq_table* table, struct doq_timer* timer)
3397 {
3398 	if(!timer->timer_in_tree)
3399 		return;
3400 	rbtree_delete(table->timer_tree, timer);
3401 	timer->timer_in_tree = 0;
3402 	/* This item could have more timers in the same set. */
3403 	if(timer->setlist_first) {
3404 		struct doq_timer* rb_timer = timer->setlist_first;
3405 		/* del first element from setlist */
3406 		if(rb_timer->setlist_next)
3407 			rb_timer->setlist_next->setlist_prev = NULL;
3408 		else
3409 			timer->setlist_last = NULL;
3410 		timer->setlist_first = rb_timer->setlist_next;
3411 		rb_timer->setlist_prev = NULL;
3412 		rb_timer->setlist_next = NULL;
3413 		rb_timer->timer_in_list = 0;
3414 		/* insert it into the tree as new rb element */
3415 		memset(&rb_timer->node, 0, sizeof(rb_timer->node));
3416 		rb_timer->node.key = rb_timer;
3417 		rbtree_insert(table->timer_tree, &rb_timer->node);
3418 		rb_timer->timer_in_tree = 1;
3419 		/* the setlist, if any remainder, moves to the rb element */
3420 		rb_timer->setlist_first = timer->setlist_first;
3421 		rb_timer->setlist_last = timer->setlist_last;
3422 		timer->setlist_first = NULL;
3423 		timer->setlist_last = NULL;
3424 		rb_timer->worker_doq_socket = timer->worker_doq_socket;
3425 	}
3426 	timer->worker_doq_socket = NULL;
3427 }
3428 
3429 void
doq_timer_list_remove(struct doq_table * table,struct doq_timer * timer)3430 doq_timer_list_remove(struct doq_table* table, struct doq_timer* timer)
3431 {
3432 	struct doq_timer* rb_timer;
3433 	if(!timer->timer_in_list)
3434 		return;
3435 	/* The item in the rbtree has the list start and end. */
3436 	rb_timer = doq_timer_find_time(table, &timer->time);
3437 	if(rb_timer) {
3438 		if(timer->setlist_prev)
3439 			timer->setlist_prev->setlist_next = timer->setlist_next;
3440 		else
3441 			rb_timer->setlist_first = timer->setlist_next;
3442 		if(timer->setlist_next)
3443 			timer->setlist_next->setlist_prev = timer->setlist_prev;
3444 		else
3445 			rb_timer->setlist_last = timer->setlist_prev;
3446 		timer->setlist_prev = NULL;
3447 		timer->setlist_next = NULL;
3448 	}
3449 	timer->timer_in_list = 0;
3450 }
3451 
3452 /** doq append timer to setlist */
3453 static void
doq_timer_list_append(struct doq_timer * rb_timer,struct doq_timer * timer)3454 doq_timer_list_append(struct doq_timer* rb_timer, struct doq_timer* timer)
3455 {
3456 	log_assert(timer->timer_in_list == 0);
3457 	timer->timer_in_list = 1;
3458 	timer->setlist_next = NULL;
3459 	timer->setlist_prev = rb_timer->setlist_last;
3460 	if(rb_timer->setlist_last)
3461 		rb_timer->setlist_last->setlist_next = timer;
3462 	else
3463 		rb_timer->setlist_first = timer;
3464 	rb_timer->setlist_last = timer;
3465 }
3466 
3467 void
doq_timer_unset(struct doq_table * table,struct doq_timer * timer)3468 doq_timer_unset(struct doq_table* table, struct doq_timer* timer)
3469 {
3470 	if(timer->timer_in_list) {
3471 		/* Remove timer from list first, because finding the rbnode
3472 		 * element of the setlist of same timeouts needs tree lookup.
3473 		 * Edit the tree structure after that lookup. */
3474 		doq_timer_list_remove(table, timer);
3475 	}
3476 	if(timer->timer_in_tree)
3477 		doq_timer_tree_remove(table, timer);
3478 	timer->worker_doq_socket = NULL;
3479 }
3480 
doq_timer_set(struct doq_table * table,struct doq_timer * timer,struct doq_server_socket * worker_doq_socket,struct timeval * tv)3481 void doq_timer_set(struct doq_table* table, struct doq_timer* timer,
3482 	struct doq_server_socket* worker_doq_socket, struct timeval* tv)
3483 {
3484 	struct doq_timer* rb_timer;
3485 	if(verbosity >= VERB_ALGO && timer->conn) {
3486 		char a[256];
3487 		struct timeval rel;
3488 		addr_to_str((void*)&timer->conn->key.paddr.addr,
3489 			timer->conn->key.paddr.addrlen, a, sizeof(a));
3490 		timeval_subtract(&rel, tv, worker_doq_socket->now_tv);
3491 		verbose(VERB_ALGO, "doq %s timer set %d.%6.6d in %d.%6.6d",
3492 			a, (int)tv->tv_sec, (int)tv->tv_usec,
3493 			(int)rel.tv_sec, (int)rel.tv_usec);
3494 	}
3495 	if(timer->timer_in_tree || timer->timer_in_list) {
3496 		if(timer->time.tv_sec == tv->tv_sec &&
3497 			timer->time.tv_usec == tv->tv_usec)
3498 			return; /* already set on that time */
3499 		doq_timer_unset(table, timer);
3500 	}
3501 	timer->time.tv_sec = tv->tv_sec;
3502 	timer->time.tv_usec = tv->tv_usec;
3503 	rb_timer = doq_timer_find_time(table, tv);
3504 	if(rb_timer) {
3505 		/* There is a timeout already with this value. Timer is
3506 		 * added to the setlist. */
3507 		doq_timer_list_append(rb_timer, timer);
3508 	} else {
3509 		/* There is no timeout with this value. Make timer a new
3510 		 * tree element. */
3511 		memset(&timer->node, 0, sizeof(timer->node));
3512 		timer->node.key = timer;
3513 		rbtree_insert(table->timer_tree, &timer->node);
3514 		timer->timer_in_tree = 1;
3515 		timer->setlist_first = NULL;
3516 		timer->setlist_last = NULL;
3517 		timer->worker_doq_socket = worker_doq_socket;
3518 	}
3519 }
3520 
3521 struct doq_conn*
doq_conn_create(struct comm_point * c,struct doq_pkt_addr * paddr,const uint8_t * dcid,size_t dcidlen,uint32_t version)3522 doq_conn_create(struct comm_point* c, struct doq_pkt_addr* paddr,
3523 	const uint8_t* dcid, size_t dcidlen, uint32_t version)
3524 {
3525 	struct doq_conn* conn = calloc(1, sizeof(*conn));
3526 	if(!conn)
3527 		return NULL;
3528 	conn->node.key = conn;
3529 	conn->doq_socket = c->doq_socket;
3530 	conn->table = c->doq_socket->table;
3531 	memmove(&conn->key.paddr.addr, &paddr->addr, paddr->addrlen);
3532 	conn->key.paddr.addrlen = paddr->addrlen;
3533 	memmove(&conn->key.paddr.localaddr, &paddr->localaddr,
3534 		paddr->localaddrlen);
3535 	conn->key.paddr.localaddrlen = paddr->localaddrlen;
3536 	conn->key.paddr.ifindex = paddr->ifindex;
3537 	conn->key.dcid = memdup((void*)dcid, dcidlen);
3538 	if(!conn->key.dcid) {
3539 		free(conn);
3540 		return NULL;
3541 	}
3542 	conn->key.dcidlen = dcidlen;
3543 	conn->version = version;
3544 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
3545 	ngtcp2_ccerr_default(&conn->ccerr);
3546 #else
3547 	ngtcp2_connection_close_error_default(&conn->last_error);
3548 #endif
3549 	rbtree_init(&conn->stream_tree, &doq_stream_cmp);
3550 	conn->timer.conn = conn;
3551 	lock_basic_init(&conn->lock);
3552 	lock_protect(&conn->lock, &conn->key, sizeof(conn->key));
3553 	lock_protect(&conn->lock, &conn->doq_socket, sizeof(conn->doq_socket));
3554 	lock_protect(&conn->lock, &conn->table, sizeof(conn->table));
3555 	lock_protect(&conn->lock, &conn->is_deleted, sizeof(conn->is_deleted));
3556 	lock_protect(&conn->lock, &conn->version, sizeof(conn->version));
3557 	lock_protect(&conn->lock, &conn->conn, sizeof(conn->conn));
3558 	lock_protect(&conn->lock, &conn->conid_list, sizeof(conn->conid_list));
3559 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
3560 	lock_protect(&conn->lock, &conn->ccerr, sizeof(conn->ccerr));
3561 #else
3562 	lock_protect(&conn->lock, &conn->last_error, sizeof(conn->last_error));
3563 #endif
3564 	lock_protect(&conn->lock, &conn->tls_alert, sizeof(conn->tls_alert));
3565 	lock_protect(&conn->lock, &conn->ssl, sizeof(conn->ssl));
3566 	lock_protect(&conn->lock, &conn->close_pkt, sizeof(conn->close_pkt));
3567 	lock_protect(&conn->lock, &conn->close_pkt_len, sizeof(conn->close_pkt_len));
3568 	lock_protect(&conn->lock, &conn->close_ecn, sizeof(conn->close_ecn));
3569 	lock_protect(&conn->lock, &conn->stream_tree, sizeof(conn->stream_tree));
3570 	lock_protect(&conn->lock, &conn->stream_write_first, sizeof(conn->stream_write_first));
3571 	lock_protect(&conn->lock, &conn->stream_write_last, sizeof(conn->stream_write_last));
3572 	lock_protect(&conn->lock, &conn->write_interest, sizeof(conn->write_interest));
3573 	lock_protect(&conn->lock, &conn->on_write_list, sizeof(conn->on_write_list));
3574 	lock_protect(&conn->lock, &conn->write_prev, sizeof(conn->write_prev));
3575 	lock_protect(&conn->lock, &conn->write_next, sizeof(conn->write_next));
3576 	return conn;
3577 }
3578 
3579 /** delete stream tree node */
3580 static void
stream_tree_del(rbnode_type * node,void * arg)3581 stream_tree_del(rbnode_type* node, void* arg)
3582 {
3583 	struct doq_table* table = (struct doq_table*)arg;
3584 	struct doq_stream* stream;
3585 	if(!node)
3586 		return;
3587 	stream = (struct doq_stream*)node;
3588 	if(stream->in)
3589 		doq_table_quic_size_subtract(table, stream->inlen);
3590 	if(stream->out)
3591 		doq_table_quic_size_subtract(table, stream->outlen);
3592 	doq_table_quic_size_subtract(table, sizeof(*stream));
3593 	doq_stream_delete(stream);
3594 }
3595 
3596 void
doq_conn_delete(struct doq_conn * conn,struct doq_table * table)3597 doq_conn_delete(struct doq_conn* conn, struct doq_table* table)
3598 {
3599 	if(!conn)
3600 		return;
3601 	lock_basic_destroy(&conn->lock);
3602 	lock_rw_wrlock(&conn->table->conid_lock);
3603 	doq_conn_clear_conids(conn);
3604 	lock_rw_unlock(&conn->table->conid_lock);
3605 	ngtcp2_conn_del(conn->conn);
3606 	if(conn->stream_tree.count != 0) {
3607 		traverse_postorder(&conn->stream_tree, stream_tree_del, table);
3608 	}
3609 	free(conn->key.dcid);
3610 	SSL_free(conn->ssl);
3611 	free(conn->close_pkt);
3612 	free(conn);
3613 }
3614 
3615 int
doq_conn_cmp(const void * key1,const void * key2)3616 doq_conn_cmp(const void* key1, const void* key2)
3617 {
3618 	struct doq_conn* c = (struct doq_conn*)key1;
3619 	struct doq_conn* d = (struct doq_conn*)key2;
3620 	int r;
3621 	/* Compared in the order destination address, then
3622 	 * local address, ifindex and then dcid.
3623 	 * So that for a search for findlessorequal for the destination
3624 	 * address will find connections to that address, with different
3625 	 * dcids.
3626 	 * Also a printout in sorted order prints the connections by IP
3627 	 * address of destination, and then a number of them depending on the
3628 	 * dcids. */
3629 	if(c->key.paddr.addrlen != d->key.paddr.addrlen) {
3630 		if(c->key.paddr.addrlen < d->key.paddr.addrlen)
3631 			return -1;
3632 		return 1;
3633 	}
3634 	if((r=memcmp(&c->key.paddr.addr, &d->key.paddr.addr,
3635 		c->key.paddr.addrlen))!=0)
3636 		return r;
3637 	if(c->key.paddr.localaddrlen != d->key.paddr.localaddrlen) {
3638 		if(c->key.paddr.localaddrlen < d->key.paddr.localaddrlen)
3639 			return -1;
3640 		return 1;
3641 	}
3642 	if((r=memcmp(&c->key.paddr.localaddr, &d->key.paddr.localaddr,
3643 		c->key.paddr.localaddrlen))!=0)
3644 		return r;
3645 	if(c->key.paddr.ifindex != d->key.paddr.ifindex) {
3646 		if(c->key.paddr.ifindex < d->key.paddr.ifindex)
3647 			return -1;
3648 		return 1;
3649 	}
3650 	if(c->key.dcidlen != d->key.dcidlen) {
3651 		if(c->key.dcidlen < d->key.dcidlen)
3652 			return -1;
3653 		return 1;
3654 	}
3655 	if((r=memcmp(c->key.dcid, d->key.dcid, c->key.dcidlen))!=0)
3656 		return r;
3657 	return 0;
3658 }
3659 
doq_conid_cmp(const void * key1,const void * key2)3660 int doq_conid_cmp(const void* key1, const void* key2)
3661 {
3662 	struct doq_conid* c = (struct doq_conid*)key1;
3663 	struct doq_conid* d = (struct doq_conid*)key2;
3664 	if(c->cidlen != d->cidlen) {
3665 		if(c->cidlen < d->cidlen)
3666 			return -1;
3667 		return 1;
3668 	}
3669 	return memcmp(c->cid, d->cid, c->cidlen);
3670 }
3671 
doq_timer_cmp(const void * key1,const void * key2)3672 int doq_timer_cmp(const void* key1, const void* key2)
3673 {
3674 	struct doq_timer* e = (struct doq_timer*)key1;
3675 	struct doq_timer* f = (struct doq_timer*)key2;
3676 	if(e->time.tv_sec < f->time.tv_sec)
3677 		return -1;
3678 	if(e->time.tv_sec > f->time.tv_sec)
3679 		return 1;
3680 	if(e->time.tv_usec < f->time.tv_usec)
3681 		return -1;
3682 	if(e->time.tv_usec > f->time.tv_usec)
3683 		return 1;
3684 	return 0;
3685 }
3686 
doq_stream_cmp(const void * key1,const void * key2)3687 int doq_stream_cmp(const void* key1, const void* key2)
3688 {
3689 	struct doq_stream* c = (struct doq_stream*)key1;
3690 	struct doq_stream* d = (struct doq_stream*)key2;
3691 	if(c->stream_id != d->stream_id) {
3692 		if(c->stream_id < d->stream_id)
3693 			return -1;
3694 		return 1;
3695 	}
3696 	return 0;
3697 }
3698 
3699 /** doq store a local address in repinfo */
3700 static void
doq_repinfo_store_localaddr(struct comm_reply * repinfo,struct doq_addr_storage * localaddr,socklen_t localaddrlen)3701 doq_repinfo_store_localaddr(struct comm_reply* repinfo,
3702 	struct doq_addr_storage* localaddr, socklen_t localaddrlen)
3703 {
3704 	/* use the pktinfo that we have for ancillary udp data otherwise,
3705 	 * this saves space for a sockaddr */
3706 	memset(&repinfo->pktinfo, 0, sizeof(repinfo->pktinfo));
3707 	if(addr_is_ip6((void*)localaddr, localaddrlen)) {
3708 #ifdef IPV6_PKTINFO
3709 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
3710 		memmove(&repinfo->pktinfo.v6info.ipi6_addr,
3711 			&sa6->sin6_addr, sizeof(struct in6_addr));
3712 		repinfo->doq_srcport = sa6->sin6_port;
3713 #endif
3714 		repinfo->srctype = 6;
3715 	} else {
3716 #ifdef IP_PKTINFO
3717 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3718 		memmove(&repinfo->pktinfo.v4info.ipi_addr,
3719 			&sa->sin_addr, sizeof(struct in_addr));
3720 		repinfo->doq_srcport = sa->sin_port;
3721 #elif defined(IP_RECVDSTADDR)
3722 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3723 		memmove(&repinfo->pktinfo.v4addr, &sa->sin_addr,
3724 			sizeof(struct in_addr));
3725 		repinfo->doq_srcport = sa->sin_port;
3726 #endif
3727 		repinfo->srctype = 4;
3728 	}
3729 }
3730 
3731 /** doq retrieve localaddr from repinfo */
3732 static void
doq_repinfo_retrieve_localaddr(struct comm_reply * repinfo,struct doq_addr_storage * localaddr,socklen_t * localaddrlen)3733 doq_repinfo_retrieve_localaddr(struct comm_reply* repinfo,
3734 	struct doq_addr_storage* localaddr, socklen_t* localaddrlen)
3735 {
3736 	if(repinfo->srctype == 6) {
3737 #ifdef IPV6_PKTINFO
3738 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
3739 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in6);
3740 		memset(sa6, 0, *localaddrlen);
3741 		sa6->sin6_family = AF_INET6;
3742 		memmove(&sa6->sin6_addr, &repinfo->pktinfo.v6info.ipi6_addr,
3743 			*localaddrlen);
3744 		sa6->sin6_port = repinfo->doq_srcport;
3745 #endif
3746 	} else {
3747 #ifdef IP_PKTINFO
3748 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3749 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
3750 		memset(sa, 0, *localaddrlen);
3751 		sa->sin_family = AF_INET;
3752 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4info.ipi_addr,
3753 			*localaddrlen);
3754 		sa->sin_port = repinfo->doq_srcport;
3755 #elif defined(IP_RECVDSTADDR)
3756 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
3757 		*localaddrlen = (socklen_t)sizeof(struct sockaddr_in);
3758 		memset(sa, 0, *localaddrlen);
3759 		sa->sin_family = AF_INET;
3760 		memmove(&sa->sin_addr, &repinfo->pktinfo.v4addr,
3761 			sizeof(struct in_addr));
3762 		sa->sin_port = repinfo->doq_srcport;
3763 #endif
3764 	}
3765 }
3766 
3767 /** doq write a connection key into repinfo, false if it does not fit */
3768 static int
doq_conn_key_store_repinfo(struct doq_conn_key * key,struct comm_reply * repinfo)3769 doq_conn_key_store_repinfo(struct doq_conn_key* key,
3770 	struct comm_reply* repinfo)
3771 {
3772 	repinfo->is_proxied = 0;
3773 	repinfo->doq_ifindex = key->paddr.ifindex;
3774 	repinfo->remote_addrlen = key->paddr.addrlen;
3775 	memmove(&repinfo->remote_addr, &key->paddr.addr,
3776 		repinfo->remote_addrlen);
3777 	repinfo->client_addrlen = key->paddr.addrlen;
3778 	memmove(&repinfo->client_addr, &key->paddr.addr,
3779 		repinfo->client_addrlen);
3780 	doq_repinfo_store_localaddr(repinfo, &key->paddr.localaddr,
3781 		key->paddr.localaddrlen);
3782 	if(key->dcidlen > sizeof(repinfo->doq_dcid))
3783 		return 0;
3784 	repinfo->doq_dcidlen = key->dcidlen;
3785 	memmove(repinfo->doq_dcid, key->dcid, key->dcidlen);
3786 	return 1;
3787 }
3788 
3789 void
doq_conn_key_from_repinfo(struct doq_conn_key * key,struct comm_reply * repinfo)3790 doq_conn_key_from_repinfo(struct doq_conn_key* key, struct comm_reply* repinfo)
3791 {
3792 	key->paddr.ifindex = repinfo->doq_ifindex;
3793 	key->paddr.addrlen = repinfo->remote_addrlen;
3794 	memmove(&key->paddr.addr, &repinfo->remote_addr,
3795 		repinfo->remote_addrlen);
3796 	doq_repinfo_retrieve_localaddr(repinfo, &key->paddr.localaddr,
3797 		&key->paddr.localaddrlen);
3798 	key->dcidlen = repinfo->doq_dcidlen;
3799 	key->dcid = repinfo->doq_dcid;
3800 }
3801 
3802 /** doq add a stream to the connection */
3803 static void
doq_conn_add_stream(struct doq_conn * conn,struct doq_stream * stream)3804 doq_conn_add_stream(struct doq_conn* conn, struct doq_stream* stream)
3805 {
3806 	(void)rbtree_insert(&conn->stream_tree, &stream->node);
3807 }
3808 
3809 /** doq delete a stream from the connection */
3810 static void
doq_conn_del_stream(struct doq_conn * conn,struct doq_stream * stream)3811 doq_conn_del_stream(struct doq_conn* conn, struct doq_stream* stream)
3812 {
3813 	(void)rbtree_delete(&conn->stream_tree, &stream->node);
3814 }
3815 
3816 /** doq create new stream */
3817 static struct doq_stream*
doq_stream_create(int64_t stream_id)3818 doq_stream_create(int64_t stream_id)
3819 {
3820 	struct doq_stream* stream = calloc(1, sizeof(*stream));
3821 	if(!stream)
3822 		return NULL;
3823 	stream->node.key = stream;
3824 	stream->stream_id = stream_id;
3825 	return stream;
3826 }
3827 
doq_stream_delete(struct doq_stream * stream)3828 void doq_stream_delete(struct doq_stream* stream)
3829 {
3830 	if(!stream)
3831 		return;
3832 	free(stream->in);
3833 	free(stream->out);
3834 	free(stream);
3835 }
3836 
3837 struct doq_stream*
doq_stream_find(struct doq_conn * conn,int64_t stream_id)3838 doq_stream_find(struct doq_conn* conn, int64_t stream_id)
3839 {
3840 	rbnode_type* node;
3841 	struct doq_stream key;
3842 	key.node.key = &key;
3843 	key.stream_id = stream_id;
3844 	node = rbtree_search(&conn->stream_tree, &key);
3845 	if(node)
3846 		return (struct doq_stream*)node->key;
3847 	return NULL;
3848 }
3849 
3850 /** doq put stream on the conn write list */
3851 static void
doq_stream_on_write_list(struct doq_conn * conn,struct doq_stream * stream)3852 doq_stream_on_write_list(struct doq_conn* conn, struct doq_stream* stream)
3853 {
3854 	if(stream->on_write_list)
3855 		return;
3856 	stream->write_prev = conn->stream_write_last;
3857 	if(conn->stream_write_last)
3858 		conn->stream_write_last->write_next = stream;
3859 	else
3860 		conn->stream_write_first = stream;
3861 	conn->stream_write_last = stream;
3862 	stream->write_next = NULL;
3863 	stream->on_write_list = 1;
3864 }
3865 
3866 /** doq remove stream from the conn write list */
3867 static void
doq_stream_off_write_list(struct doq_conn * conn,struct doq_stream * stream)3868 doq_stream_off_write_list(struct doq_conn* conn, struct doq_stream* stream)
3869 {
3870 	if(!stream->on_write_list)
3871 		return;
3872 	if(stream->write_next)
3873 		stream->write_next->write_prev = stream->write_prev;
3874 	else conn->stream_write_last = stream->write_prev;
3875 	if(stream->write_prev)
3876 		stream->write_prev->write_next = stream->write_next;
3877 	else conn->stream_write_first = stream->write_next;
3878 	stream->write_prev = NULL;
3879 	stream->write_next = NULL;
3880 	stream->on_write_list = 0;
3881 }
3882 
3883 /** doq stream remove in buffer */
3884 static void
doq_stream_remove_in_buffer(struct doq_stream * stream,struct doq_table * table)3885 doq_stream_remove_in_buffer(struct doq_stream* stream, struct doq_table* table)
3886 {
3887 	if(stream->in) {
3888 		doq_table_quic_size_subtract(table, stream->inlen);
3889 		free(stream->in);
3890 		stream->in = NULL;
3891 		stream->inlen = 0;
3892 	}
3893 }
3894 
3895 /** doq stream remove out buffer */
3896 static void
doq_stream_remove_out_buffer(struct doq_stream * stream,struct doq_table * table)3897 doq_stream_remove_out_buffer(struct doq_stream* stream,
3898 	struct doq_table* table)
3899 {
3900 	if(stream->out) {
3901 		doq_table_quic_size_subtract(table, stream->outlen);
3902 		free(stream->out);
3903 		stream->out = NULL;
3904 		stream->outlen = 0;
3905 	}
3906 }
3907 
3908 int
doq_stream_close(struct doq_conn * conn,struct doq_stream * stream,int send_shutdown)3909 doq_stream_close(struct doq_conn* conn, struct doq_stream* stream,
3910 	int send_shutdown)
3911 {
3912 	int ret;
3913 	if(stream->is_closed)
3914 		return 1;
3915 	stream->is_closed = 1;
3916 	doq_stream_off_write_list(conn, stream);
3917 	if(send_shutdown) {
3918 		verbose(VERB_ALGO, "doq: shutdown stream_id %d with app_error_code %d",
3919 			(int)stream->stream_id, (int)DOQ_APP_ERROR_CODE);
3920 		ret = ngtcp2_conn_shutdown_stream(conn->conn,
3921 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
3922 			0,
3923 #endif
3924 			stream->stream_id, DOQ_APP_ERROR_CODE);
3925 		if(ret != 0) {
3926 			log_err("doq ngtcp2_conn_shutdown_stream %d failed: %s",
3927 				(int)stream->stream_id, ngtcp2_strerror(ret));
3928 			return 0;
3929 		}
3930 		doq_conn_write_enable(conn);
3931 	}
3932 	verbose(VERB_ALGO, "doq: conn extend max streams bidi by 1");
3933 	ngtcp2_conn_extend_max_streams_bidi(conn->conn, 1);
3934 	doq_conn_write_enable(conn);
3935 	doq_stream_remove_in_buffer(stream, conn->doq_socket->table);
3936 	doq_stream_remove_out_buffer(stream, conn->doq_socket->table);
3937 	doq_table_quic_size_subtract(conn->doq_socket->table, sizeof(*stream));
3938 	doq_conn_del_stream(conn, stream);
3939 	doq_stream_delete(stream);
3940 	return 1;
3941 }
3942 
3943 /** doq stream pick up answer data from buffer */
3944 static int
doq_stream_pickup_answer(struct doq_stream * stream,struct sldns_buffer * buf)3945 doq_stream_pickup_answer(struct doq_stream* stream, struct sldns_buffer* buf)
3946 {
3947 	stream->is_answer_available = 1;
3948 	if(stream->out) {
3949 		free(stream->out);
3950 		stream->out = NULL;
3951 		stream->outlen = 0;
3952 	}
3953 	stream->nwrite = 0;
3954 	stream->outlen = sldns_buffer_limit(buf);
3955 	/* For quic the output bytes have to stay allocated and available,
3956 	 * for potential resends, until the remote end has acknowledged them.
3957 	 * This includes the tcplen start uint16_t, in outlen_wire. */
3958 	stream->outlen_wire = htons(stream->outlen);
3959 	stream->out = memdup(sldns_buffer_begin(buf), sldns_buffer_limit(buf));
3960 	if(!stream->out) {
3961 		log_err("doq could not send answer: out of memory");
3962 		return 0;
3963 	}
3964 	return 1;
3965 }
3966 
3967 int
doq_stream_send_reply(struct doq_conn * conn,struct doq_stream * stream,struct sldns_buffer * buf)3968 doq_stream_send_reply(struct doq_conn* conn, struct doq_stream* stream,
3969 	struct sldns_buffer* buf)
3970 {
3971 	if(verbosity >= VERB_ALGO) {
3972 		char* s = sldns_wire2str_pkt(sldns_buffer_begin(buf),
3973 			sldns_buffer_limit(buf));
3974 		verbose(VERB_ALGO, "doq stream %d response\n%s",
3975 			(int)stream->stream_id, (s?s:"null"));
3976 		free(s);
3977 	}
3978 	if(stream->out)
3979 		doq_table_quic_size_subtract(conn->doq_socket->table,
3980 			stream->outlen);
3981 	if(!doq_stream_pickup_answer(stream, buf))
3982 		return 0;
3983 	doq_table_quic_size_add(conn->doq_socket->table, stream->outlen);
3984 	doq_stream_on_write_list(conn, stream);
3985 	doq_conn_write_enable(conn);
3986 	return 1;
3987 }
3988 
3989 /** doq stream data length has completed, allocations can be done. False on
3990  * allocation failure. */
3991 static int
doq_stream_datalen_complete(struct doq_stream * stream,struct doq_table * table)3992 doq_stream_datalen_complete(struct doq_stream* stream, struct doq_table* table)
3993 {
3994 	if(stream->inlen > 1024*1024) {
3995 		log_err("doq stream in length too large %d",
3996 			(int)stream->inlen);
3997 		return 0;
3998 	}
3999 	stream->in = calloc(1, stream->inlen);
4000 	if(!stream->in) {
4001 		log_err("doq could not read stream, calloc failed: "
4002 			"out of memory");
4003 		return 0;
4004 	}
4005 	doq_table_quic_size_add(table, stream->inlen);
4006 	return 1;
4007 }
4008 
4009 /** doq stream data is complete, the input data has been received. */
4010 static int
doq_stream_data_complete(struct doq_conn * conn,struct doq_stream * stream)4011 doq_stream_data_complete(struct doq_conn* conn, struct doq_stream* stream)
4012 {
4013 	struct comm_point* c;
4014 	if(verbosity >= VERB_ALGO) {
4015 		char* s = sldns_wire2str_pkt(stream->in, stream->inlen);
4016 		char a[128];
4017 		addr_to_str((void*)&conn->key.paddr.addr,
4018 			conn->key.paddr.addrlen, a, sizeof(a));
4019 		verbose(VERB_ALGO, "doq %s stream %d incoming query\n%s",
4020 			a, (int)stream->stream_id, (s?s:"null"));
4021 		free(s);
4022 	}
4023 	stream->is_query_complete = 1;
4024 	c = conn->doq_socket->cp;
4025 	if(!stream->in) {
4026 		verbose(VERB_ALGO, "doq_stream_data_complete: no in buffer");
4027 		return 0;
4028 	}
4029 	if(stream->inlen > sldns_buffer_capacity(c->buffer)) {
4030 		verbose(VERB_ALGO, "doq_stream_data_complete: query too long");
4031 		return 0;
4032 	}
4033 	sldns_buffer_clear(c->buffer);
4034 	sldns_buffer_write(c->buffer, stream->in, stream->inlen);
4035 	sldns_buffer_flip(c->buffer);
4036 	c->repinfo.c = c;
4037 	if(!doq_conn_key_store_repinfo(&conn->key, &c->repinfo)) {
4038 		verbose(VERB_ALGO, "doq_stream_data_complete: connection "
4039 			"DCID too long");
4040 		return 0;
4041 	}
4042 	c->repinfo.doq_streamid = stream->stream_id;
4043 	conn->doq_socket->current_conn = conn;
4044 	fptr_ok(fptr_whitelist_comm_point(c->callback));
4045 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo)) {
4046 		conn->doq_socket->current_conn = NULL;
4047 		if(!doq_stream_send_reply(conn, stream, c->buffer)) {
4048 			verbose(VERB_ALGO, "doq: failed to send_reply");
4049 			return 0;
4050 		}
4051 		return 1;
4052 	}
4053 	conn->doq_socket->current_conn = NULL;
4054 	return 1;
4055 }
4056 
4057 /** doq receive data for a stream, more bytes of the incoming data */
4058 static int
doq_stream_recv_data(struct doq_stream * stream,const uint8_t * data,size_t datalen,int * recv_done,struct doq_table * table)4059 doq_stream_recv_data(struct doq_stream* stream, const uint8_t* data,
4060 	size_t datalen, int* recv_done, struct doq_table* table)
4061 {
4062 	int got_data = 0;
4063 	/* read the tcplength uint16_t at the start */
4064 	if(stream->nread < 2) {
4065 		uint16_t tcplen = 0;
4066 		size_t todolen = 2 - stream->nread;
4067 
4068 		if(stream->nread > 0) {
4069 			/* put in the already read byte if there is one */
4070 			tcplen = stream->inlen;
4071 		}
4072 		if(datalen < todolen)
4073 			todolen = datalen;
4074 		memmove(((uint8_t*)&tcplen)+stream->nread, data, todolen);
4075 		stream->nread += todolen;
4076 		data += todolen;
4077 		datalen -= todolen;
4078 		if(stream->nread == 2) {
4079 			/* the initial length value is completed */
4080 			stream->inlen = ntohs(tcplen);
4081 			if(!doq_stream_datalen_complete(stream, table))
4082 				return 0;
4083 		} else {
4084 			/* store for later */
4085 			stream->inlen = tcplen;
4086 			return 1;
4087 		}
4088 	}
4089 	/* if there are more data bytes */
4090 	if(datalen > 0) {
4091 		size_t to_write = datalen;
4092 		if(stream->nread-2 > stream->inlen) {
4093 			verbose(VERB_ALGO, "doq stream buffer too small");
4094 			return 0;
4095 		}
4096 		if(datalen > stream->inlen - (stream->nread-2))
4097 			to_write = stream->inlen - (stream->nread-2);
4098 		if(to_write > 0) {
4099 			if(!stream->in) {
4100 				verbose(VERB_ALGO, "doq: stream has "
4101 					"no buffer");
4102 				return 0;
4103 			}
4104 			memmove(stream->in+(stream->nread-2), data, to_write);
4105 			stream->nread += to_write;
4106 			data += to_write;
4107 			datalen -= to_write;
4108 			got_data = 1;
4109 		}
4110 	}
4111 	/* Are there extra bytes received after the end? If so, log them. */
4112 	if(datalen > 0) {
4113 		if(verbosity >= VERB_ALGO)
4114 			log_hex("doq stream has extra bytes received after end",
4115 				(void*)data, datalen);
4116 	}
4117 	/* Is the input data complete? */
4118 	if(got_data && stream->nread >= stream->inlen+2) {
4119 		if(!stream->in) {
4120 			verbose(VERB_ALGO, "doq: completed stream has "
4121 				"no buffer");
4122 			return 0;
4123 		}
4124 		*recv_done = 1;
4125 	}
4126 	return 1;
4127 }
4128 
4129 /** doq receive FIN for a stream. No more bytes are going to arrive. */
4130 static int
doq_stream_recv_fin(struct doq_conn * conn,struct doq_stream * stream,int recv_done)4131 doq_stream_recv_fin(struct doq_conn* conn, struct doq_stream* stream, int
4132 	recv_done)
4133 {
4134 	if(!stream->is_query_complete && !recv_done) {
4135 		verbose(VERB_ALGO, "doq: stream recv FIN, but is "
4136 			"not complete, have %d of %d bytes",
4137 			((int)stream->nread)-2, (int)stream->inlen);
4138 		if(!doq_stream_close(conn, stream, 1))
4139 			return 0;
4140 	}
4141 	return 1;
4142 }
4143 
/** Fill a buffer with random bytes. Every byte is the low 8 bits of a
 * separate ub_random call on the given random state. */
void doq_fill_rand(struct ub_randstate* rnd, uint8_t* buf, size_t len)
{
	uint8_t* out = buf;
	size_t left = len;
	while(left > 0) {
		*out++ = ub_random(rnd)&0xff;
		left--;
	}
}
4150 
4151 /** generate new connection id, checks for duplicates.
4152  * caller must hold lock on conid tree. */
4153 static int
doq_conn_generate_new_conid(struct doq_conn * conn,uint8_t * data,size_t datalen)4154 doq_conn_generate_new_conid(struct doq_conn* conn, uint8_t* data,
4155 	size_t datalen)
4156 {
4157 	int max_try = 100;
4158 	int i;
4159 	for(i=0; i<max_try; i++) {
4160 		doq_fill_rand(conn->doq_socket->rnd, data, datalen);
4161 		if(!doq_conid_find(conn->table, data, datalen)) {
4162 			/* Found an unused connection id. */
4163 			return 1;
4164 		}
4165 	}
4166 	verbose(VERB_ALGO, "doq_conn_generate_new_conid failed: could not "
4167 		"generate random unused connection id value in %d attempts.",
4168 		max_try);
4169 	return 0;
4170 }
4171 
4172 /** ngtcp2 rand callback function */
4173 static void
doq_rand_cb(uint8_t * dest,size_t destlen,const ngtcp2_rand_ctx * rand_ctx)4174 doq_rand_cb(uint8_t* dest, size_t destlen, const ngtcp2_rand_ctx* rand_ctx)
4175 {
4176 	struct ub_randstate* rnd = (struct ub_randstate*)
4177 		rand_ctx->native_handle;
4178 	doq_fill_rand(rnd, dest, destlen);
4179 }
4180 
4181 /** ngtcp2 get_new_connection_id callback function */
4182 static int
doq_get_new_connection_id_cb(ngtcp2_conn * ATTR_UNUSED (conn),ngtcp2_cid * cid,uint8_t * token,size_t cidlen,void * user_data)4183 doq_get_new_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn), ngtcp2_cid* cid,
4184 	uint8_t* token, size_t cidlen, void* user_data)
4185 {
4186 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4187 	/* Lock the conid tree, so we can check for duplicates while
4188 	 * generating the id, and then insert it, whilst keeping the tree
4189 	 * locked against other modifications, guaranteeing uniqueness. */
4190 	lock_rw_wrlock(&doq_conn->table->conid_lock);
4191 	if(!doq_conn_generate_new_conid(doq_conn, cid->data, cidlen)) {
4192 		lock_rw_unlock(&doq_conn->table->conid_lock);
4193 		return NGTCP2_ERR_CALLBACK_FAILURE;
4194 	}
4195 	cid->datalen = cidlen;
4196 	if(ngtcp2_crypto_generate_stateless_reset_token(token,
4197 		doq_conn->doq_socket->static_secret,
4198 		doq_conn->doq_socket->static_secret_len, cid) != 0) {
4199 		lock_rw_unlock(&doq_conn->table->conid_lock);
4200 		return NGTCP2_ERR_CALLBACK_FAILURE;
4201 	}
4202 	if(!doq_conn_associate_conid(doq_conn, cid->data, cid->datalen)) {
4203 		lock_rw_unlock(&doq_conn->table->conid_lock);
4204 		return NGTCP2_ERR_CALLBACK_FAILURE;
4205 	}
4206 	lock_rw_unlock(&doq_conn->table->conid_lock);
4207 	return 0;
4208 }
4209 
4210 /** ngtcp2 remove_connection_id callback function */
4211 static int
doq_remove_connection_id_cb(ngtcp2_conn * ATTR_UNUSED (conn),const ngtcp2_cid * cid,void * user_data)4212 doq_remove_connection_id_cb(ngtcp2_conn* ATTR_UNUSED(conn),
4213 	const ngtcp2_cid* cid, void* user_data)
4214 {
4215 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4216 	lock_rw_wrlock(&doq_conn->table->conid_lock);
4217 	doq_conn_dissociate_conid(doq_conn, cid->data, cid->datalen);
4218 	lock_rw_unlock(&doq_conn->table->conid_lock);
4219 	return 0;
4220 }
4221 
4222 /** doq submit a new token */
4223 static int
doq_submit_new_token(struct doq_conn * conn)4224 doq_submit_new_token(struct doq_conn* conn)
4225 {
4226 	uint8_t token[NGTCP2_CRYPTO_MAX_REGULAR_TOKENLEN];
4227 	ngtcp2_ssize tokenlen;
4228 	int ret;
4229 	const ngtcp2_path* path = ngtcp2_conn_get_path(conn->conn);
4230 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
4231 
4232 	tokenlen = ngtcp2_crypto_generate_regular_token(token,
4233 		conn->doq_socket->static_secret,
4234 		conn->doq_socket->static_secret_len, path->remote.addr,
4235 		path->remote.addrlen, ts);
4236 	if(tokenlen < 0) {
4237 		log_err("doq ngtcp2_crypto_generate_regular_token failed");
4238 		return 1;
4239 	}
4240 
4241 	verbose(VERB_ALGO, "doq submit new token");
4242 	ret = ngtcp2_conn_submit_new_token(conn->conn, token, tokenlen);
4243 	if(ret != 0) {
4244 		log_err("doq ngtcp2_conn_submit_new_token failed: %s",
4245 			ngtcp2_strerror(ret));
4246 		return 0;
4247 	}
4248 	return 1;
4249 }
4250 
4251 /** ngtcp2 handshake_completed callback function */
4252 static int
doq_handshake_completed_cb(ngtcp2_conn * ATTR_UNUSED (conn),void * user_data)4253 doq_handshake_completed_cb(ngtcp2_conn* ATTR_UNUSED(conn), void* user_data)
4254 {
4255 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4256 	verbose(VERB_ALGO, "doq handshake_completed callback");
4257 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_data_left is %d",
4258 		(int)ngtcp2_conn_get_max_data_left(doq_conn->conn));
4259 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
4260 	verbose(VERB_ALGO, "ngtcp2_conn_get_max_local_streams_uni is %d",
4261 		(int)ngtcp2_conn_get_max_local_streams_uni(doq_conn->conn));
4262 #endif
4263 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_uni_left is %d",
4264 		(int)ngtcp2_conn_get_streams_uni_left(doq_conn->conn));
4265 	verbose(VERB_ALGO, "ngtcp2_conn_get_streams_bidi_left is %d",
4266 		(int)ngtcp2_conn_get_streams_bidi_left(doq_conn->conn));
4267 	verbose(VERB_ALGO, "negotiated cipher name is %s",
4268 		SSL_get_cipher_name(doq_conn->ssl));
4269 	if(verbosity > VERB_ALGO) {
4270 		const unsigned char* alpn = NULL;
4271 		unsigned int alpnlen = 0;
4272 		char alpnstr[128];
4273 		SSL_get0_alpn_selected(doq_conn->ssl, &alpn, &alpnlen);
4274 		if(alpnlen > sizeof(alpnstr)-1)
4275 			alpnlen = sizeof(alpnstr)-1;
4276 		memmove(alpnstr, alpn, alpnlen);
4277 		alpnstr[alpnlen]=0;
4278 		verbose(VERB_ALGO, "negotiated ALPN is '%s'", alpnstr);
4279 	}
4280 
4281 	if(!doq_submit_new_token(doq_conn))
4282 		return -1;
4283 	return 0;
4284 }
4285 
4286 /** ngtcp2 stream_open callback function */
4287 static int
doq_stream_open_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,void * user_data)4288 doq_stream_open_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
4289 	void* user_data)
4290 {
4291 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4292 	struct doq_stream* stream;
4293 	verbose(VERB_ALGO, "doq new stream %x", (int)stream_id);
4294 	if(doq_stream_find(doq_conn, stream_id)) {
4295 		verbose(VERB_ALGO, "doq: stream with this id already exists");
4296 		return 0;
4297 	}
4298 	if(stream_id != 0 && stream_id != 4 && /* allow one stream on a new connection */
4299 		!doq_table_quic_size_available(doq_conn->doq_socket->table,
4300 		doq_conn->doq_socket->cfg, sizeof(*stream)
4301 		+ 100 /* estimated query in */
4302 		+ 512 /* estimated response out */
4303 		)) {
4304 		int rv;
4305 		verbose(VERB_ALGO, "doq: no mem for new stream");
4306 		rv = ngtcp2_conn_shutdown_stream(doq_conn->conn,
4307 #ifdef HAVE_NGTCP2_CONN_SHUTDOWN_STREAM4
4308 			0,
4309 #endif
4310 			stream_id, NGTCP2_CONNECTION_REFUSED);
4311 		if(rv != 0) {
4312 			log_err("ngtcp2_conn_shutdown_stream failed: %s",
4313 				ngtcp2_strerror(rv));
4314 			return NGTCP2_ERR_CALLBACK_FAILURE;
4315 		}
4316 		return 0;
4317 	}
4318 	stream = doq_stream_create(stream_id);
4319 	if(!stream) {
4320 		log_err("doq: could not doq_stream_create: out of memory");
4321 		return NGTCP2_ERR_CALLBACK_FAILURE;
4322 	}
4323 	doq_table_quic_size_add(doq_conn->doq_socket->table, sizeof(*stream));
4324 	doq_conn_add_stream(doq_conn, stream);
4325 	return 0;
4326 }
4327 
4328 /** ngtcp2 recv_stream_data callback function */
4329 static int
doq_recv_stream_data_cb(ngtcp2_conn * ATTR_UNUSED (conn),uint32_t flags,int64_t stream_id,uint64_t offset,const uint8_t * data,size_t datalen,void * user_data,void * ATTR_UNUSED (stream_user_data))4330 doq_recv_stream_data_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
4331 	int64_t stream_id, uint64_t offset, const uint8_t* data,
4332 	size_t datalen, void* user_data, void* ATTR_UNUSED(stream_user_data))
4333 {
4334 	int recv_done = 0;
4335 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4336 	struct doq_stream* stream;
4337 	verbose(VERB_ALGO, "doq recv stream data stream id %d offset %d "
4338 		"datalen %d%s%s", (int)stream_id, (int)offset, (int)datalen,
4339 		((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0?" FIN":""),
4340 #ifdef NGTCP2_STREAM_DATA_FLAG_0RTT
4341 		((flags&NGTCP2_STREAM_DATA_FLAG_0RTT)!=0?" 0RTT":"")
4342 #else
4343 		((flags&NGTCP2_STREAM_DATA_FLAG_EARLY)!=0?" EARLY":"")
4344 #endif
4345 		);
4346 	stream = doq_stream_find(doq_conn, stream_id);
4347 	if(!stream) {
4348 		verbose(VERB_ALGO, "doq: received stream data for "
4349 			"unknown stream %d", (int)stream_id);
4350 		return 0;
4351 	}
4352 	if(stream->is_closed) {
4353 		verbose(VERB_ALGO, "doq: stream is closed, ignore recv data");
4354 		return 0;
4355 	}
4356 	if(datalen != 0) {
4357 		if(!doq_stream_recv_data(stream, data, datalen, &recv_done,
4358 			doq_conn->doq_socket->table))
4359 			return NGTCP2_ERR_CALLBACK_FAILURE;
4360 	}
4361 	if((flags&NGTCP2_STREAM_DATA_FLAG_FIN)!=0) {
4362 		if(!doq_stream_recv_fin(doq_conn, stream, recv_done))
4363 			return NGTCP2_ERR_CALLBACK_FAILURE;
4364 	}
4365 	ngtcp2_conn_extend_max_stream_offset(doq_conn->conn, stream_id,
4366 		datalen);
4367 	ngtcp2_conn_extend_max_offset(doq_conn->conn, datalen);
4368 	if(recv_done) {
4369 		if(!doq_stream_data_complete(doq_conn, stream))
4370 			return NGTCP2_ERR_CALLBACK_FAILURE;
4371 	}
4372 	return 0;
4373 }
4374 
4375 /** ngtcp2 stream_close callback function */
4376 static int
doq_stream_close_cb(ngtcp2_conn * ATTR_UNUSED (conn),uint32_t flags,int64_t stream_id,uint64_t app_error_code,void * user_data,void * ATTR_UNUSED (stream_user_data))4377 doq_stream_close_cb(ngtcp2_conn* ATTR_UNUSED(conn), uint32_t flags,
4378 	int64_t stream_id, uint64_t app_error_code, void* user_data,
4379 	void* ATTR_UNUSED(stream_user_data))
4380 {
4381 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4382 	struct doq_stream* stream;
4383 	if((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)
4384 		verbose(VERB_ALGO, "doq stream close for stream id %d %sapp_error_code %d",
4385 		(int)stream_id,
4386 		(((flags&NGTCP2_STREAM_CLOSE_FLAG_APP_ERROR_CODE_SET)!=0)?
4387 		"APP_ERROR_CODE_SET ":""),
4388 		(int)app_error_code);
4389 	else
4390 		verbose(VERB_ALGO, "doq stream close for stream id %d",
4391 			(int)stream_id);
4392 
4393 	stream = doq_stream_find(doq_conn, stream_id);
4394 	if(!stream) {
4395 		verbose(VERB_ALGO, "doq: stream close for "
4396 			"unknown stream %d", (int)stream_id);
4397 		return 0;
4398 	}
4399 	if(!doq_stream_close(doq_conn, stream, 0))
4400 		return NGTCP2_ERR_CALLBACK_FAILURE;
4401 	return 0;
4402 }
4403 
4404 /** ngtcp2 stream_reset callback function */
4405 static int
doq_stream_reset_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,uint64_t final_size,uint64_t app_error_code,void * user_data,void * ATTR_UNUSED (stream_user_data))4406 doq_stream_reset_cb(ngtcp2_conn* ATTR_UNUSED(conn), int64_t stream_id,
4407 	uint64_t final_size, uint64_t app_error_code, void* user_data,
4408 	void* ATTR_UNUSED(stream_user_data))
4409 {
4410 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4411 	struct doq_stream* stream;
4412 	verbose(VERB_ALGO, "doq stream reset for stream id %d final_size %d "
4413 		"app_error_code %d", (int)stream_id, (int)final_size,
4414 		(int)app_error_code);
4415 
4416 	stream = doq_stream_find(doq_conn, stream_id);
4417 	if(!stream) {
4418 		verbose(VERB_ALGO, "doq: stream reset for "
4419 			"unknown stream %d", (int)stream_id);
4420 		return 0;
4421 	}
4422 	if(!doq_stream_close(doq_conn, stream, 0))
4423 		return NGTCP2_ERR_CALLBACK_FAILURE;
4424 	return 0;
4425 }
4426 
4427 /** ngtcp2 acked_stream_data_offset callback function */
4428 static int
doq_acked_stream_data_offset_cb(ngtcp2_conn * ATTR_UNUSED (conn),int64_t stream_id,uint64_t offset,uint64_t datalen,void * user_data,void * ATTR_UNUSED (stream_user_data))4429 doq_acked_stream_data_offset_cb(ngtcp2_conn* ATTR_UNUSED(conn),
4430 	int64_t stream_id, uint64_t offset, uint64_t datalen, void* user_data,
4431 	void* ATTR_UNUSED(stream_user_data))
4432 {
4433 	struct doq_conn* doq_conn = (struct doq_conn*)user_data;
4434 	struct doq_stream* stream;
4435 	verbose(VERB_ALGO, "doq stream acked data for stream id %d offset %d "
4436 		"datalen %d", (int)stream_id, (int)offset, (int)datalen);
4437 
4438 	stream = doq_stream_find(doq_conn, stream_id);
4439 	if(!stream) {
4440 		verbose(VERB_ALGO, "doq: stream acked data for "
4441 			"unknown stream %d", (int)stream_id);
4442 		return 0;
4443 	}
4444 	/* Acked the data from [offset .. offset+datalen). */
4445 	if(stream->is_closed)
4446 		return 0;
4447 	if(offset+datalen >= stream->outlen) {
4448 		doq_stream_remove_in_buffer(stream,
4449 			doq_conn->doq_socket->table);
4450 		doq_stream_remove_out_buffer(stream,
4451 			doq_conn->doq_socket->table);
4452 	}
4453 	return 0;
4454 }
4455 
4456 /** ngtc2p log_printf callback function */
4457 static void
doq_log_printf_cb(void * ATTR_UNUSED (user_data),const char * fmt,...)4458 doq_log_printf_cb(void* ATTR_UNUSED(user_data), const char* fmt, ...)
4459 {
4460 	char buf[1024];
4461 	va_list ap;
4462 	va_start(ap, fmt);
4463 	vsnprintf(buf, sizeof(buf), fmt, ap);
4464 	verbose(VERB_ALGO, "libngtcp2: %s", buf);
4465 	va_end(ap);
4466 }
4467 
4468 #ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
4469 /** the doq application tx key callback, false on failure */
4470 static int
doq_application_tx_key_cb(struct doq_conn * conn)4471 doq_application_tx_key_cb(struct doq_conn* conn)
4472 {
4473 	verbose(VERB_ALGO, "doq application tx key cb");
4474 	/* The server does not want to open streams to the client,
4475 	 * the client instead initiates by opening bidi streams. */
4476 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_data_left is %d",
4477 		(int)ngtcp2_conn_get_max_data_left(conn->conn));
4478 #ifdef HAVE_NGTCP2_CONN_GET_MAX_LOCAL_STREAMS_UNI
4479 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_max_local_streams_uni is %d",
4480 		(int)ngtcp2_conn_get_max_local_streams_uni(conn->conn));
4481 #endif
4482 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_uni_left is %d",
4483 		(int)ngtcp2_conn_get_streams_uni_left(conn->conn));
4484 	verbose(VERB_ALGO, "doq ngtcp2_conn_get_streams_bidi_left is %d",
4485 		(int)ngtcp2_conn_get_streams_bidi_left(conn->conn));
4486 	return 1;
4487 }
4488 
4489 /** quic_method set_encryption_secrets function */
4490 static int
doq_set_encryption_secrets(SSL * ssl,OSSL_ENCRYPTION_LEVEL ossl_level,const uint8_t * read_secret,const uint8_t * write_secret,size_t secret_len)4491 doq_set_encryption_secrets(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
4492 	const uint8_t *read_secret, const uint8_t *write_secret,
4493 	size_t secret_len)
4494 {
4495 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4496 #ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
4497 	ngtcp2_encryption_level
4498 #else
4499 	ngtcp2_crypto_level
4500 #endif
4501 		level =
4502 #ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL
4503 		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
4504 #else
4505 		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
4506 #endif
4507 
4508 	if(read_secret) {
4509 		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_rx_key for level %d ossl %d", (int)level, (int)ossl_level);
4510 		if(ngtcp2_crypto_derive_and_install_rx_key(doq_conn->conn,
4511 			NULL, NULL, NULL, level, read_secret, secret_len)
4512 			!= 0) {
4513 			log_err("ngtcp2_crypto_derive_and_install_rx_key "
4514 				"failed");
4515 			return 0;
4516 		}
4517 	}
4518 
4519 	if(write_secret) {
4520 		verbose(VERB_ALGO, "doq: ngtcp2_crypto_derive_and_install_tx_key for level %d ossl %d", (int)level, (int)ossl_level);
4521 		if(ngtcp2_crypto_derive_and_install_tx_key(doq_conn->conn,
4522 			NULL, NULL, NULL, level, write_secret, secret_len)
4523 			!= 0) {
4524 			log_err("ngtcp2_crypto_derive_and_install_tx_key "
4525 				"failed");
4526 			return 0;
4527 		}
4528 		if(level == NGTCP2_CRYPTO_LEVEL_APPLICATION) {
4529 			if(!doq_application_tx_key_cb(doq_conn))
4530 				return 0;
4531 		}
4532 	}
4533 	return 1;
4534 }
4535 
4536 /** quic_method add_handshake_data function */
4537 static int
doq_add_handshake_data(SSL * ssl,OSSL_ENCRYPTION_LEVEL ossl_level,const uint8_t * data,size_t len)4538 doq_add_handshake_data(SSL *ssl, OSSL_ENCRYPTION_LEVEL ossl_level,
4539 	const uint8_t *data, size_t len)
4540 {
4541 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4542 #ifdef HAVE_NGTCP2_ENCRYPTION_LEVEL
4543 	ngtcp2_encryption_level
4544 #else
4545 	ngtcp2_crypto_level
4546 #endif
4547 		level =
4548 #ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_FROM_OSSL_ENCRYPTION_LEVEL
4549 		ngtcp2_crypto_quictls_from_ossl_encryption_level(ossl_level);
4550 #else
4551 		ngtcp2_crypto_openssl_from_ossl_encryption_level(ossl_level);
4552 #endif
4553 	int rv;
4554 
4555 	verbose(VERB_ALGO, "doq_add_handshake_data: "
4556 		"ngtcp2_con_submit_crypto_data level %d", (int)level);
4557 	rv = ngtcp2_conn_submit_crypto_data(doq_conn->conn, level, data, len);
4558 	if(rv != 0) {
4559 		log_err("ngtcp2_conn_submit_crypto_data failed: %s",
4560 			ngtcp2_strerror(rv));
4561 		ngtcp2_conn_set_tls_error(doq_conn->conn, rv);
4562 		return 0;
4563 	}
4564 	return 1;
4565 }
4566 
4567 /** quic_method flush_flight function */
4568 static int
doq_flush_flight(SSL * ATTR_UNUSED (ssl))4569 doq_flush_flight(SSL* ATTR_UNUSED(ssl))
4570 {
4571 	return 1;
4572 }
4573 
4574 /** quic_method send_alert function */
4575 static int
doq_send_alert(SSL * ssl,enum ssl_encryption_level_t ATTR_UNUSED (level),uint8_t alert)4576 doq_send_alert(SSL *ssl, enum ssl_encryption_level_t ATTR_UNUSED(level),
4577 	uint8_t alert)
4578 {
4579 	struct doq_conn* doq_conn = (struct doq_conn*)SSL_get_app_data(ssl);
4580 	doq_conn->tls_alert = alert;
4581 	return 1;
4582 }
4583 #endif /* HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT */
4584 
4585 /** ALPN select callback for the doq SSL context */
4586 static int
doq_alpn_select_cb(SSL * ATTR_UNUSED (ssl),const unsigned char ** out,unsigned char * outlen,const unsigned char * in,unsigned int inlen,void * ATTR_UNUSED (arg))4587 doq_alpn_select_cb(SSL* ATTR_UNUSED(ssl), const unsigned char** out,
4588 	unsigned char* outlen, const unsigned char* in, unsigned int inlen,
4589 	void* ATTR_UNUSED(arg))
4590 {
4591 	/* select "doq" */
4592 	int ret = SSL_select_next_proto((void*)out, outlen,
4593 		(const unsigned char*)"\x03""doq", 4, in, inlen);
4594 	if(ret == OPENSSL_NPN_NEGOTIATED)
4595 		return SSL_TLSEXT_ERR_OK;
4596 	verbose(VERB_ALGO, "doq alpn_select_cb: ALPN from client does "
4597 		"not have 'doq'");
4598 	return SSL_TLSEXT_ERR_ALERT_FATAL;
4599 }
4600 
4601 /** create new tls session for server doq connection */
4602 static SSL_CTX*
doq_ctx_server_setup(struct doq_server_socket * doq_socket)4603 doq_ctx_server_setup(struct doq_server_socket* doq_socket)
4604 {
4605 	char* sid_ctx = "unbound server";
4606 #ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
4607 	SSL_QUIC_METHOD* quic_method;
4608 #endif
4609 	SSL_CTX* ctx = SSL_CTX_new(TLS_server_method());
4610 	if(!ctx) {
4611 		log_crypto_err("Could not SSL_CTX_new");
4612 		return NULL;
4613 	}
4614 	SSL_CTX_set_options(ctx,
4615 		(SSL_OP_ALL & ~SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) |
4616 		SSL_OP_SINGLE_ECDH_USE |
4617 		SSL_OP_CIPHER_SERVER_PREFERENCE |
4618 		SSL_OP_NO_ANTI_REPLAY);
4619 	SSL_CTX_set_mode(ctx, SSL_MODE_RELEASE_BUFFERS);
4620 	SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION);
4621 	SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION);
4622 #ifdef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
4623 	SSL_CTX_set_alpn_select_cb(ctx, doq_alpn_select_cb, NULL);
4624 #endif
4625 	SSL_CTX_set_default_verify_paths(ctx);
4626 	if(!SSL_CTX_use_certificate_chain_file(ctx,
4627 		doq_socket->ssl_service_pem)) {
4628 		log_err("doq: error for cert file: %s",
4629 			doq_socket->ssl_service_pem);
4630 		log_crypto_err("doq: error in "
4631 			"SSL_CTX_use_certificate_chain_file");
4632 		SSL_CTX_free(ctx);
4633 		return NULL;
4634 	}
4635 	if(!SSL_CTX_use_PrivateKey_file(ctx, doq_socket->ssl_service_key,
4636 		SSL_FILETYPE_PEM)) {
4637 		log_err("doq: error for private key file: %s",
4638 			doq_socket->ssl_service_key);
4639 		log_crypto_err("doq: error in SSL_CTX_use_PrivateKey_file");
4640 		SSL_CTX_free(ctx);
4641 		return NULL;
4642 	}
4643 	if(!SSL_CTX_check_private_key(ctx)) {
4644 		log_err("doq: error for key file: %s",
4645 			doq_socket->ssl_service_key);
4646 		log_crypto_err("doq: error in SSL_CTX_check_private_key");
4647 		SSL_CTX_free(ctx);
4648 		return NULL;
4649 	}
4650 	SSL_CTX_set_session_id_context(ctx, (void*)sid_ctx, strlen(sid_ctx));
4651 	if(doq_socket->ssl_verify_pem && doq_socket->ssl_verify_pem[0]) {
4652 		if(!SSL_CTX_load_verify_locations(ctx,
4653 			doq_socket->ssl_verify_pem, NULL)) {
4654 			log_err("doq: error for verify pem file: %s",
4655 				doq_socket->ssl_verify_pem);
4656 			log_crypto_err("doq: error in "
4657 				"SSL_CTX_load_verify_locations");
4658 			SSL_CTX_free(ctx);
4659 			return NULL;
4660 		}
4661 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(
4662 			doq_socket->ssl_verify_pem));
4663 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER|
4664 			SSL_VERIFY_CLIENT_ONCE|
4665 			SSL_VERIFY_FAIL_IF_NO_PEER_CERT, NULL);
4666 	}
4667 
4668 	SSL_CTX_set_max_early_data(ctx, 0xffffffff);
4669 #ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
4670 	if(ngtcp2_crypto_quictls_configure_server_context(ctx) != 0) {
4671 		log_err("ngtcp2_crypto_quictls_configure_server_context failed");
4672 		SSL_CTX_free(ctx);
4673 		return NULL;
4674 	}
4675 #else
4676 	/* The quic_method needs to remain valid during the SSL_CTX
4677 	 * lifetime, so we allocate it. It is freed with the
4678 	 * doq_server_socket. */
4679 	quic_method = calloc(1, sizeof(SSL_QUIC_METHOD));
4680 	if(!quic_method) {
4681 		log_err("calloc failed: out of memory");
4682 		SSL_CTX_free(ctx);
4683 		return NULL;
4684 	}
4685 	doq_socket->quic_method = quic_method;
4686 	quic_method->set_encryption_secrets = doq_set_encryption_secrets;
4687 	quic_method->add_handshake_data = doq_add_handshake_data;
4688 	quic_method->flush_flight = doq_flush_flight;
4689 	quic_method->send_alert = doq_send_alert;
4690 	SSL_CTX_set_quic_method(ctx, doq_socket->quic_method);
4691 #endif
4692 	return ctx;
4693 }
4694 
4695 /** Get the ngtcp2_conn from ssl userdata of type ngtcp2_conn_ref */
doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref * conn_ref)4696 static ngtcp2_conn* doq_conn_ref_get_conn(ngtcp2_crypto_conn_ref* conn_ref)
4697 {
4698 	struct doq_conn* conn = (struct doq_conn*)conn_ref->user_data;
4699 	return conn->conn;
4700 }
4701 
4702 /** create new SSL session for server connection */
4703 static SSL*
doq_ssl_server_setup(SSL_CTX * ctx,struct doq_conn * conn)4704 doq_ssl_server_setup(SSL_CTX* ctx, struct doq_conn* conn)
4705 {
4706 	SSL* ssl = SSL_new(ctx);
4707 	if(!ssl) {
4708 		log_crypto_err("doq: SSL_new failed");
4709 		return NULL;
4710 	}
4711 #ifdef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
4712 	conn->conn_ref.get_conn = &doq_conn_ref_get_conn;
4713 	conn->conn_ref.user_data = conn;
4714 	SSL_set_app_data(ssl, &conn->conn_ref);
4715 #else
4716 	SSL_set_app_data(ssl, conn);
4717 #endif
4718 	SSL_set_accept_state(ssl);
4719 	SSL_set_quic_early_data_enabled(ssl, 1);
4720 	return ssl;
4721 }
4722 
4723 /** setup the doq_socket server tls context */
4724 int
doq_socket_setup_ctx(struct doq_server_socket * doq_socket)4725 doq_socket_setup_ctx(struct doq_server_socket* doq_socket)
4726 {
4727 	doq_socket->ctx = doq_ctx_server_setup(doq_socket);
4728 	if(!doq_socket->ctx)
4729 		return 0;
4730 	return 1;
4731 }
4732 
4733 int
doq_conn_setup(struct doq_conn * conn,uint8_t * scid,size_t scidlen,uint8_t * ocid,size_t ocidlen,const uint8_t * token,size_t tokenlen)4734 doq_conn_setup(struct doq_conn* conn, uint8_t* scid, size_t scidlen,
4735 	uint8_t* ocid, size_t ocidlen, const uint8_t* token, size_t tokenlen)
4736 {
4737 	int rv;
4738 	struct ngtcp2_cid dcid, sv_scid, scid_cid;
4739 	struct ngtcp2_path path;
4740 	struct ngtcp2_callbacks callbacks;
4741 	struct ngtcp2_settings settings;
4742 	struct ngtcp2_transport_params params;
4743 	memset(&dcid, 0, sizeof(dcid));
4744 	memset(&sv_scid, 0, sizeof(sv_scid));
4745 	memset(&scid_cid, 0, sizeof(scid_cid));
4746 	memset(&path, 0, sizeof(path));
4747 	memset(&callbacks, 0, sizeof(callbacks));
4748 	memset(&settings, 0, sizeof(settings));
4749 	memset(&params, 0, sizeof(params));
4750 
4751 	ngtcp2_cid_init(&scid_cid, scid, scidlen);
4752 	ngtcp2_cid_init(&dcid, conn->key.dcid, conn->key.dcidlen);
4753 
4754 	path.remote.addr = (struct sockaddr*)&conn->key.paddr.addr;
4755 	path.remote.addrlen = conn->key.paddr.addrlen;
4756 	path.local.addr = (struct sockaddr*)&conn->key.paddr.localaddr;
4757 	path.local.addrlen = conn->key.paddr.localaddrlen;
4758 
4759 	callbacks.recv_client_initial = ngtcp2_crypto_recv_client_initial_cb;
4760 	callbacks.recv_crypto_data = ngtcp2_crypto_recv_crypto_data_cb;
4761 	callbacks.encrypt = ngtcp2_crypto_encrypt_cb;
4762 	callbacks.decrypt = ngtcp2_crypto_decrypt_cb;
4763 	callbacks.hp_mask = ngtcp2_crypto_hp_mask;
4764 	callbacks.update_key = ngtcp2_crypto_update_key_cb;
4765 	callbacks.delete_crypto_aead_ctx =
4766 		ngtcp2_crypto_delete_crypto_aead_ctx_cb;
4767 	callbacks.delete_crypto_cipher_ctx =
4768 		ngtcp2_crypto_delete_crypto_cipher_ctx_cb;
4769 	callbacks.get_path_challenge_data =
4770 		ngtcp2_crypto_get_path_challenge_data_cb;
4771 	callbacks.version_negotiation = ngtcp2_crypto_version_negotiation_cb;
4772 	callbacks.rand = doq_rand_cb;
4773 	callbacks.get_new_connection_id = doq_get_new_connection_id_cb;
4774 	callbacks.remove_connection_id = doq_remove_connection_id_cb;
4775 	callbacks.handshake_completed = doq_handshake_completed_cb;
4776 	callbacks.stream_open = doq_stream_open_cb;
4777 	callbacks.stream_close = doq_stream_close_cb;
4778 	callbacks.stream_reset = doq_stream_reset_cb;
4779 	callbacks.acked_stream_data_offset = doq_acked_stream_data_offset_cb;
4780 	callbacks.recv_stream_data = doq_recv_stream_data_cb;
4781 
4782 	ngtcp2_settings_default(&settings);
4783 	if(verbosity >= VERB_ALGO) {
4784 		settings.log_printf = doq_log_printf_cb;
4785 	}
4786 	settings.rand_ctx.native_handle = conn->doq_socket->rnd;
4787 	settings.initial_ts = doq_get_timestamp_nanosec();
4788 	settings.max_stream_window = 6*1024*1024;
4789 	settings.max_window = 6*1024*1024;
4790 #ifdef HAVE_STRUCT_NGTCP2_SETTINGS_TOKENLEN
4791 	settings.token = (void*)token;
4792 	settings.tokenlen = tokenlen;
4793 #else
4794 	settings.token.base = (void*)token;
4795 	settings.token.len = tokenlen;
4796 #endif
4797 
4798 	ngtcp2_transport_params_default(&params);
4799 	params.max_idle_timeout = conn->doq_socket->idle_timeout;
4800 	params.active_connection_id_limit = 7;
4801 	params.initial_max_stream_data_bidi_local = 256*1024;
4802 	params.initial_max_stream_data_bidi_remote = 256*1024;
4803 	params.initial_max_data = 1024*1024;
4804 	/* DoQ uses bidi streams, so we allow 0 uni streams. */
4805 	params.initial_max_streams_uni = 0;
4806 	/* Initial max on number of bidi streams the remote end can open.
4807 	 * That is the number of queries it can make, at first. */
4808 	params.initial_max_streams_bidi = 10;
4809 	if(ocid) {
4810 		ngtcp2_cid_init(&params.original_dcid, ocid, ocidlen);
4811 		ngtcp2_cid_init(&params.retry_scid, conn->key.dcid,
4812 			conn->key.dcidlen);
4813 		params.retry_scid_present = 1;
4814 	} else {
4815 		ngtcp2_cid_init(&params.original_dcid, conn->key.dcid,
4816 			conn->key.dcidlen);
4817 	}
4818 #ifdef HAVE_STRUCT_NGTCP2_TRANSPORT_PARAMS_ORIGINAL_DCID_PRESENT
4819 	params.original_dcid_present = 1;
4820 #endif
4821 	doq_fill_rand(conn->doq_socket->rnd, params.stateless_reset_token,
4822 		sizeof(params.stateless_reset_token));
4823 	sv_scid.datalen = conn->doq_socket->sv_scidlen;
4824 	lock_rw_wrlock(&conn->table->conid_lock);
4825 	if(!doq_conn_generate_new_conid(conn, sv_scid.data, sv_scid.datalen)) {
4826 		lock_rw_unlock(&conn->table->conid_lock);
4827 		return 0;
4828 	}
4829 
4830 	rv = ngtcp2_conn_server_new(&conn->conn, &scid_cid, &sv_scid, &path,
4831 		conn->version, &callbacks, &settings, &params, NULL, conn);
4832 	if(rv != 0) {
4833 		lock_rw_unlock(&conn->table->conid_lock);
4834 		log_err("ngtcp2_conn_server_new failed: %s",
4835 			ngtcp2_strerror(rv));
4836 		return 0;
4837 	}
4838 	if(!doq_conn_setup_conids(conn)) {
4839 		lock_rw_unlock(&conn->table->conid_lock);
4840 		log_err("doq_conn_setup_conids failed: out of memory");
4841 		return 0;
4842 	}
4843 	lock_rw_unlock(&conn->table->conid_lock);
4844 	conn->ssl = doq_ssl_server_setup((SSL_CTX*)conn->doq_socket->ctx,
4845 		conn);
4846 	if(!conn->ssl) {
4847 		log_err("doq_ssl_server_setup failed");
4848 		return 0;
4849 	}
4850 	ngtcp2_conn_set_tls_native_handle(conn->conn, conn->ssl);
4851 	doq_conn_write_enable(conn);
4852 	return 1;
4853 }
4854 
4855 struct doq_conid*
doq_conid_find(struct doq_table * table,const uint8_t * data,size_t datalen)4856 doq_conid_find(struct doq_table* table, const uint8_t* data, size_t datalen)
4857 {
4858 	struct rbnode_type* node;
4859 	struct doq_conid key;
4860 	key.node.key = &key;
4861 	key.cid = (void*)data;
4862 	key.cidlen = datalen;
4863 	node = rbtree_search(table->conid_tree, &key);
4864 	if(node)
4865 		return (struct doq_conid*)node->key;
4866 	return NULL;
4867 }
4868 
4869 /** insert conid in the conid list */
4870 static void
doq_conid_list_insert(struct doq_conn * conn,struct doq_conid * conid)4871 doq_conid_list_insert(struct doq_conn* conn, struct doq_conid* conid)
4872 {
4873 	conid->prev = NULL;
4874 	conid->next = conn->conid_list;
4875 	if(conn->conid_list)
4876 		conn->conid_list->prev = conid;
4877 	conn->conid_list = conid;
4878 }
4879 
4880 /** remove conid from the conid list */
4881 static void
doq_conid_list_remove(struct doq_conn * conn,struct doq_conid * conid)4882 doq_conid_list_remove(struct doq_conn* conn, struct doq_conid* conid)
4883 {
4884 	if(conid->prev)
4885 		conid->prev->next = conid->next;
4886 	else	conn->conid_list = conid->next;
4887 	if(conid->next)
4888 		conid->next->prev = conid->prev;
4889 }
4890 
4891 /** create a doq_conid */
4892 static struct doq_conid*
doq_conid_create(uint8_t * data,size_t datalen,struct doq_conn_key * key)4893 doq_conid_create(uint8_t* data, size_t datalen, struct doq_conn_key* key)
4894 {
4895 	struct doq_conid* conid;
4896 	conid = calloc(1, sizeof(*conid));
4897 	if(!conid)
4898 		return NULL;
4899 	conid->cid = memdup(data, datalen);
4900 	if(!conid->cid) {
4901 		free(conid);
4902 		return NULL;
4903 	}
4904 	conid->cidlen = datalen;
4905 	conid->node.key = conid;
4906 	conid->key = *key;
4907 	conid->key.dcid = memdup(key->dcid, key->dcidlen);
4908 	if(!conid->key.dcid) {
4909 		free(conid->cid);
4910 		free(conid);
4911 		return NULL;
4912 	}
4913 	return conid;
4914 }
4915 
4916 void
doq_conid_delete(struct doq_conid * conid)4917 doq_conid_delete(struct doq_conid* conid)
4918 {
4919 	if(!conid)
4920 		return;
4921 	free(conid->key.dcid);
4922 	free(conid->cid);
4923 	free(conid);
4924 }
4925 
4926 /** return true if the conid is for the conn. */
4927 static int
conid_is_for_conn(struct doq_conn * conn,struct doq_conid * conid)4928 conid_is_for_conn(struct doq_conn* conn, struct doq_conid* conid)
4929 {
4930 	if(conid->key.dcidlen == conn->key.dcidlen &&
4931 		memcmp(conid->key.dcid, conn->key.dcid, conid->key.dcidlen)==0
4932 		&& conid->key.paddr.addrlen == conn->key.paddr.addrlen &&
4933 		memcmp(&conid->key.paddr.addr, &conn->key.paddr.addr,
4934 			conid->key.paddr.addrlen) == 0 &&
4935 		conid->key.paddr.localaddrlen == conn->key.paddr.localaddrlen &&
4936 		memcmp(&conid->key.paddr.localaddr, &conn->key.paddr.localaddr,
4937 			conid->key.paddr.localaddrlen) == 0 &&
4938 		conid->key.paddr.ifindex == conn->key.paddr.ifindex)
4939 		return 1;
4940 	return 0;
4941 }
4942 
4943 int
doq_conn_associate_conid(struct doq_conn * conn,uint8_t * data,size_t datalen)4944 doq_conn_associate_conid(struct doq_conn* conn, uint8_t* data, size_t datalen)
4945 {
4946 	struct doq_conid* conid;
4947 	conid = doq_conid_find(conn->table, data, datalen);
4948 	if(conid && !conid_is_for_conn(conn, conid)) {
4949 		verbose(VERB_ALGO, "doq connection id already exists for "
4950 			"another doq_conn. Ignoring second connection id.");
4951 		/* Already exists to another conn, ignore it.
4952 		 * This works, in that the conid is listed in the doq_conn
4953 		 * conid_list element, and removed from there. So our conid
4954 		 * tree and list are fine, when created and removed.
4955 		 * The tree now does not have the lookup element pointing
4956 		 * to this connection. */
4957 		return 1;
4958 	}
4959 	if(conid)
4960 		return 1; /* already inserted */
4961 	conid = doq_conid_create(data, datalen, &conn->key);
4962 	if(!conid)
4963 		return 0;
4964 	doq_conid_list_insert(conn, conid);
4965 	(void)rbtree_insert(conn->table->conid_tree, &conid->node);
4966 	return 1;
4967 }
4968 
4969 void
doq_conn_dissociate_conid(struct doq_conn * conn,const uint8_t * data,size_t datalen)4970 doq_conn_dissociate_conid(struct doq_conn* conn, const uint8_t* data,
4971 	size_t datalen)
4972 {
4973 	struct doq_conid* conid;
4974 	conid = doq_conid_find(conn->table, data, datalen);
4975 	if(conid && !conid_is_for_conn(conn, conid))
4976 		return;
4977 	if(conid) {
4978 		(void)rbtree_delete(conn->table->conid_tree,
4979 			conid->node.key);
4980 		doq_conid_list_remove(conn, conid);
4981 		doq_conid_delete(conid);
4982 	}
4983 }
4984 
4985 /** associate the scid array and also the dcid.
4986  * caller must hold the locks on conn and doq_table.conid_lock. */
4987 static int
doq_conn_setup_id_array_and_dcid(struct doq_conn * conn,struct ngtcp2_cid * scids,size_t num_scid)4988 doq_conn_setup_id_array_and_dcid(struct doq_conn* conn,
4989 	struct ngtcp2_cid* scids, size_t num_scid)
4990 {
4991 	size_t i;
4992 	for(i=0; i<num_scid; i++) {
4993 		if(!doq_conn_associate_conid(conn, scids[i].data,
4994 			scids[i].datalen))
4995 			return 0;
4996 	}
4997 	if(!doq_conn_associate_conid(conn, conn->key.dcid, conn->key.dcidlen))
4998 		return 0;
4999 	return 1;
5000 }
5001 
5002 int
doq_conn_setup_conids(struct doq_conn * conn)5003 doq_conn_setup_conids(struct doq_conn* conn)
5004 {
5005 	size_t num_scid =
5006 #ifndef HAVE_NGTCP2_CONN_GET_NUM_SCID
5007 		ngtcp2_conn_get_scid(conn->conn, NULL);
5008 #else
5009 		ngtcp2_conn_get_num_scid(conn->conn);
5010 #endif
5011 	if(num_scid <= 4) {
5012 		struct ngtcp2_cid ids[4];
5013 		/* Usually there are not that many scids when just accepted,
5014 		 * like only 2. */
5015 		ngtcp2_conn_get_scid(conn->conn, ids);
5016 		return doq_conn_setup_id_array_and_dcid(conn, ids, num_scid);
5017 	} else {
5018 		struct ngtcp2_cid *scids = calloc(num_scid,
5019 			sizeof(struct ngtcp2_cid));
5020 		if(!scids)
5021 			return 0;
5022 		ngtcp2_conn_get_scid(conn->conn, scids);
5023 		if(!doq_conn_setup_id_array_and_dcid(conn, scids, num_scid)) {
5024 			free(scids);
5025 			return 0;
5026 		}
5027 		free(scids);
5028 	}
5029 	return 1;
5030 }
5031 
5032 void
doq_conn_clear_conids(struct doq_conn * conn)5033 doq_conn_clear_conids(struct doq_conn* conn)
5034 {
5035 	struct doq_conid* p, *next;
5036 	if(!conn)
5037 		return;
5038 	p = conn->conid_list;
5039 	while(p) {
5040 		next = p->next;
5041 		(void)rbtree_delete(conn->table->conid_tree, p->node.key);
5042 		doq_conid_delete(p);
5043 		p = next;
5044 	}
5045 	conn->conid_list = NULL;
5046 }
5047 
doq_get_timestamp_nanosec(void)5048 ngtcp2_tstamp doq_get_timestamp_nanosec(void)
5049 {
5050 #ifdef CLOCK_REALTIME
5051 	struct timespec tp;
5052 	memset(&tp, 0, sizeof(tp));
5053 	/* Get a nanosecond time, that can be compared with the event base. */
5054 	if(clock_gettime(CLOCK_REALTIME, &tp) == -1) {
5055 		log_err("clock_gettime failed: %s", strerror(errno));
5056 	}
5057 	return ((uint64_t)tp.tv_sec)*((uint64_t)1000000000) +
5058 		((uint64_t)tp.tv_nsec);
5059 #else
5060 	struct timeval tv;
5061 	if(gettimeofday(&tv, NULL) < 0) {
5062 		log_err("gettimeofday failed: %s", strerror(errno));
5063 	}
5064 	return ((uint64_t)tv.tv_sec)*((uint64_t)1000000000) +
5065 		((uint64_t)tv.tv_usec)*((uint64_t)1000);
5066 #endif /* CLOCK_REALTIME */
5067 }
5068 
5069 /** doq start the closing period for the connection. */
5070 static int
doq_conn_start_closing_period(struct comm_point * c,struct doq_conn * conn)5071 doq_conn_start_closing_period(struct comm_point* c, struct doq_conn* conn)
5072 {
5073 	struct ngtcp2_path_storage ps;
5074 	struct ngtcp2_pkt_info pi;
5075 	ngtcp2_ssize ret;
5076 	if(!conn)
5077 		return 1;
5078 	if(
5079 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
5080 		ngtcp2_conn_in_closing_period(conn->conn)
5081 #else
5082 		ngtcp2_conn_is_in_closing_period(conn->conn)
5083 #endif
5084 		)
5085 		return 1;
5086 	if(
5087 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
5088 		ngtcp2_conn_in_draining_period(conn->conn)
5089 #else
5090 		ngtcp2_conn_is_in_draining_period(conn->conn)
5091 #endif
5092 		) {
5093 		doq_conn_write_disable(conn);
5094 		return 1;
5095 	}
5096 	ngtcp2_path_storage_zero(&ps);
5097 	sldns_buffer_clear(c->doq_socket->pkt_buf);
5098 	/* the call to ngtcp2_conn_write_connection_close causes the
5099 	 * conn to be closed. It is now in the closing period. */
5100 	ret = ngtcp2_conn_write_connection_close(conn->conn, &ps.path,
5101 		&pi, sldns_buffer_begin(c->doq_socket->pkt_buf),
5102 		sldns_buffer_remaining(c->doq_socket->pkt_buf),
5103 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5104 		&conn->ccerr
5105 #else
5106 		&conn->last_error
5107 #endif
5108 		, doq_get_timestamp_nanosec());
5109 	if(ret < 0) {
5110 		log_err("doq ngtcp2_conn_write_connection_close failed: %s",
5111 			ngtcp2_strerror(ret));
5112 		return 0;
5113 	}
5114 	if(ret == 0) {
5115 		return 0;
5116 	}
5117 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
5118 	sldns_buffer_flip(c->doq_socket->pkt_buf);
5119 
5120 	/* The close packet is allocated, because it may have to be repeated.
5121 	 * When incoming packets have this connection dcid. */
5122 	conn->close_pkt = memdup(sldns_buffer_begin(c->doq_socket->pkt_buf),
5123 		sldns_buffer_limit(c->doq_socket->pkt_buf));
5124 	if(!conn->close_pkt) {
5125 		log_err("doq: could not allocate close packet: out of memory");
5126 		return 0;
5127 	}
5128 	conn->close_pkt_len = sldns_buffer_limit(c->doq_socket->pkt_buf);
5129 	conn->close_ecn = pi.ecn;
5130 	return 1;
5131 }
5132 
5133 /** doq send the close packet for the connection, perhaps again. */
5134 int
doq_conn_send_close(struct comm_point * c,struct doq_conn * conn)5135 doq_conn_send_close(struct comm_point* c, struct doq_conn* conn)
5136 {
5137 	if(!conn)
5138 		return 0;
5139 	if(!conn->close_pkt)
5140 		return 0;
5141 	if(conn->close_pkt_len > sldns_buffer_capacity(c->doq_socket->pkt_buf))
5142 		return 0;
5143 	sldns_buffer_clear(c->doq_socket->pkt_buf);
5144 	sldns_buffer_write(c->doq_socket->pkt_buf, conn->close_pkt, conn->close_pkt_len);
5145 	sldns_buffer_flip(c->doq_socket->pkt_buf);
5146 	verbose(VERB_ALGO, "doq send connection close");
5147 	doq_send_pkt(c, &conn->key.paddr, conn->close_ecn);
5148 	doq_conn_write_disable(conn);
5149 	return 1;
5150 }
5151 
5152 /** doq close the connection on error. If it returns a failure, it
5153  * does not wait to send a close, and the connection can be dropped. */
5154 static int
doq_conn_close_error(struct comm_point * c,struct doq_conn * conn)5155 doq_conn_close_error(struct comm_point* c, struct doq_conn* conn)
5156 {
5157 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5158 	if(conn->ccerr.type == NGTCP2_CCERR_TYPE_IDLE_CLOSE)
5159 		return 0;
5160 #else
5161 	if(conn->last_error.type ==
5162 		NGTCP2_CONNECTION_CLOSE_ERROR_CODE_TYPE_TRANSPORT_IDLE_CLOSE)
5163 		return 0;
5164 #endif
5165 	if(!doq_conn_start_closing_period(c, conn))
5166 		return 0;
5167 	if(
5168 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
5169 		ngtcp2_conn_in_draining_period(conn->conn)
5170 #else
5171 		ngtcp2_conn_is_in_draining_period(conn->conn)
5172 #endif
5173 		) {
5174 		doq_conn_write_disable(conn);
5175 		return 1;
5176 	}
5177 	doq_conn_write_enable(conn);
5178 	if(!doq_conn_send_close(c, conn))
5179 		return 0;
5180 	return 1;
5181 }
5182 
5183 int
doq_conn_recv(struct comm_point * c,struct doq_pkt_addr * paddr,struct doq_conn * conn,struct ngtcp2_pkt_info * pi,int * err_retry,int * err_drop)5184 doq_conn_recv(struct comm_point* c, struct doq_pkt_addr* paddr,
5185 	struct doq_conn* conn, struct ngtcp2_pkt_info* pi, int* err_retry,
5186 	int* err_drop)
5187 {
5188 	int ret;
5189 	ngtcp2_tstamp ts;
5190 	struct ngtcp2_path path;
5191 	memset(&path, 0, sizeof(path));
5192 	path.remote.addr = (struct sockaddr*)&paddr->addr;
5193 	path.remote.addrlen = paddr->addrlen;
5194 	path.local.addr = (struct sockaddr*)&paddr->localaddr;
5195 	path.local.addrlen = paddr->localaddrlen;
5196 	ts = doq_get_timestamp_nanosec();
5197 
5198 	ret = ngtcp2_conn_read_pkt(conn->conn, &path, pi,
5199 		sldns_buffer_begin(c->doq_socket->pkt_buf),
5200 		sldns_buffer_limit(c->doq_socket->pkt_buf), ts);
5201 	if(ret != 0) {
5202 		if(err_retry)
5203 			*err_retry = 0;
5204 		if(err_drop)
5205 			*err_drop = 0;
5206 		if(ret == NGTCP2_ERR_DRAINING) {
5207 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5208 				ngtcp2_strerror(ret));
5209 			doq_conn_write_disable(conn);
5210 			return 0;
5211 		} else if(ret == NGTCP2_ERR_DROP_CONN) {
5212 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5213 				ngtcp2_strerror(ret));
5214 			if(err_drop)
5215 				*err_drop = 1;
5216 			return 0;
5217 		} else if(ret == NGTCP2_ERR_RETRY) {
5218 			verbose(VERB_ALGO, "ngtcp2_conn_read_pkt returned %s",
5219 				ngtcp2_strerror(ret));
5220 			if(err_retry)
5221 				*err_retry = 1;
5222 			if(err_drop)
5223 				*err_drop = 1;
5224 			return 0;
5225 		} else if(ret == NGTCP2_ERR_CRYPTO) {
5226 			if(
5227 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5228 				!conn->ccerr.error_code
5229 #else
5230 				!conn->last_error.error_code
5231 #endif
5232 				) {
5233 				/* in picotls the tls alert may need to be
5234 				 * copied, but this is with openssl. And there
5235 				 * is conn->tls_alert. */
5236 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5237 				ngtcp2_ccerr_set_tls_alert(&conn->ccerr,
5238 					conn->tls_alert, NULL, 0);
5239 #else
5240 				ngtcp2_connection_close_error_set_transport_error_tls_alert(
5241 					&conn->last_error, conn->tls_alert,
5242 					NULL, 0);
5243 #endif
5244 			}
5245 		} else {
5246 			if(
5247 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5248 				!conn->ccerr.error_code
5249 #else
5250 				!conn->last_error.error_code
5251 #endif
5252 				) {
5253 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5254 				ngtcp2_ccerr_set_liberr(&conn->ccerr, ret,
5255 					NULL, 0);
5256 #else
5257 				ngtcp2_connection_close_error_set_transport_error_liberr(
5258 					&conn->last_error, ret, NULL, 0);
5259 #endif
5260 			}
5261 		}
5262 		log_err("ngtcp2_conn_read_pkt failed: %s",
5263 			ngtcp2_strerror(ret));
5264 		if(!doq_conn_close_error(c, conn)) {
5265 			if(err_drop)
5266 				*err_drop = 1;
5267 		}
5268 		return 0;
5269 	}
5270 	doq_conn_write_enable(conn);
5271 	return 1;
5272 }
5273 
/** doq stream has been written out, take it off the write list */
static void
doq_stream_write_is_done(struct doq_conn* conn, struct doq_stream* stream)
{
	/* The buffer is not deallocated here, because it may be needed
	 * for resends. The stream only leaves the write list. */
	doq_stream_off_write_list(conn, stream);
}
5281 
5282 int
doq_conn_write_streams(struct comm_point * c,struct doq_conn * conn,int * err_drop)5283 doq_conn_write_streams(struct comm_point* c, struct doq_conn* conn,
5284 	int* err_drop)
5285 {
5286 	struct doq_stream* stream = conn->stream_write_first;
5287 	ngtcp2_path_storage ps;
5288 	ngtcp2_tstamp ts = doq_get_timestamp_nanosec();
5289 	size_t num_packets = 0, max_packets = 65535;
5290 	ngtcp2_path_storage_zero(&ps);
5291 
5292 	for(;;) {
5293 		int64_t stream_id;
5294 		uint32_t flags = 0;
5295 		ngtcp2_pkt_info pi;
5296 		ngtcp2_vec datav[2];
5297 		size_t datav_count = 0;
5298 		ngtcp2_ssize ret, ndatalen = 0;
5299 		int fin;
5300 
5301 		if(stream) {
5302 			/* data to send */
5303 			verbose(VERB_ALGO, "doq: doq_conn write stream %d",
5304 				(int)stream->stream_id);
5305 			stream_id = stream->stream_id;
5306 			fin = 1;
5307 			if(stream->nwrite < 2) {
5308 				datav[0].base = ((uint8_t*)&stream->
5309 					outlen_wire) + stream->nwrite;
5310 				datav[0].len = 2 - stream->nwrite;
5311 				datav[1].base = stream->out;
5312 				datav[1].len = stream->outlen;
5313 				datav_count = 2;
5314 			} else {
5315 				datav[0].base = stream->out +
5316 					(stream->nwrite-2);
5317 				datav[0].len = stream->outlen -
5318 					(stream->nwrite-2);
5319 				datav_count = 1;
5320 			}
5321 		} else {
5322 			/* no data to send */
5323 			verbose(VERB_ALGO, "doq: doq_conn write stream -1");
5324 			stream_id = -1;
5325 			fin = 0;
5326 			datav[0].base = NULL;
5327 			datav[0].len = 0;
5328 			datav_count = 1;
5329 		}
5330 
5331 		/* if more streams, set it to write more */
5332 		if(stream && stream->write_next)
5333 			flags |= NGTCP2_WRITE_STREAM_FLAG_MORE;
5334 		if(fin)
5335 			flags |= NGTCP2_WRITE_STREAM_FLAG_FIN;
5336 
5337 		sldns_buffer_clear(c->doq_socket->pkt_buf);
5338 		ret = ngtcp2_conn_writev_stream(conn->conn, &ps.path, &pi,
5339 			sldns_buffer_begin(c->doq_socket->pkt_buf),
5340 			sldns_buffer_remaining(c->doq_socket->pkt_buf),
5341 			&ndatalen, flags, stream_id, datav, datav_count, ts);
5342 		if(ret < 0) {
5343 			if(ret == NGTCP2_ERR_WRITE_MORE) {
5344 				verbose(VERB_ALGO, "doq: write more, ndatalen %d", (int)ndatalen);
5345 				if(stream) {
5346 					if(ndatalen >= 0)
5347 						stream->nwrite += ndatalen;
5348 					if(stream->nwrite >= stream->outlen+2)
5349 						doq_stream_write_is_done(
5350 							conn, stream);
5351 					stream = stream->write_next;
5352 				}
5353 				continue;
5354 			} else if(ret == NGTCP2_ERR_STREAM_DATA_BLOCKED) {
5355 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_DATA_BLOCKED");
5356 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5357 				ngtcp2_ccerr_set_application_error(
5358 					&conn->ccerr, -1, NULL, 0);
5359 #else
5360 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
5361 #endif
5362 				if(err_drop)
5363 					*err_drop = 0;
5364 				if(!doq_conn_close_error(c, conn)) {
5365 					if(err_drop)
5366 						*err_drop = 1;
5367 				}
5368 				return 0;
5369 			} else if(ret == NGTCP2_ERR_STREAM_SHUT_WR) {
5370 				verbose(VERB_ALGO, "doq: ngtcp2_conn_writev_stream returned NGTCP2_ERR_STREAM_SHUT_WR");
5371 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5372 				ngtcp2_ccerr_set_application_error(
5373 					&conn->ccerr, -1, NULL, 0);
5374 #else
5375 				ngtcp2_connection_close_error_set_application_error(&conn->last_error, -1, NULL, 0);
5376 #endif
5377 				if(err_drop)
5378 					*err_drop = 0;
5379 				if(!doq_conn_close_error(c, conn)) {
5380 					if(err_drop)
5381 						*err_drop = 1;
5382 				}
5383 				return 0;
5384 			}
5385 
5386 			log_err("doq: ngtcp2_conn_writev_stream failed: %s",
5387 				ngtcp2_strerror(ret));
5388 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5389 			ngtcp2_ccerr_set_liberr(&conn->ccerr, ret, NULL, 0);
5390 #else
5391 			ngtcp2_connection_close_error_set_transport_error_liberr(
5392 				&conn->last_error, ret, NULL, 0);
5393 #endif
5394 			if(err_drop)
5395 				*err_drop = 0;
5396 			if(!doq_conn_close_error(c, conn)) {
5397 				if(err_drop)
5398 					*err_drop = 1;
5399 			}
5400 			return 0;
5401 		}
5402 		verbose(VERB_ALGO, "doq: writev_stream pkt size %d ndatawritten %d",
5403 			(int)ret, (int)ndatalen);
5404 
5405 		if(ndatalen >= 0 && stream) {
5406 			stream->nwrite += ndatalen;
5407 			if(stream->nwrite >= stream->outlen+2)
5408 				doq_stream_write_is_done(conn, stream);
5409 		}
5410 		if(ret == 0) {
5411 			/* congestion limited */
5412 			doq_conn_write_disable(conn);
5413 			ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
5414 			return 1;
5415 		}
5416 		sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
5417 		sldns_buffer_flip(c->doq_socket->pkt_buf);
5418 		doq_send_pkt(c, &conn->key.paddr, pi.ecn);
5419 
5420 		if(c->doq_socket->have_blocked_pkt)
5421 			break;
5422 		if(++num_packets == max_packets)
5423 			break;
5424 		if(stream)
5425 			stream = stream->write_next;
5426 	}
5427 	ngtcp2_conn_update_pkt_tx_time(conn->conn, ts);
5428 	return 1;
5429 }
5430 
5431 void
doq_conn_write_enable(struct doq_conn * conn)5432 doq_conn_write_enable(struct doq_conn* conn)
5433 {
5434 	conn->write_interest = 1;
5435 }
5436 
5437 void
doq_conn_write_disable(struct doq_conn * conn)5438 doq_conn_write_disable(struct doq_conn* conn)
5439 {
5440 	conn->write_interest = 0;
5441 }
5442 
5443 /** doq append the connection to the write list */
5444 static void
doq_conn_write_list_append(struct doq_table * table,struct doq_conn * conn)5445 doq_conn_write_list_append(struct doq_table* table, struct doq_conn* conn)
5446 {
5447 	if(conn->on_write_list)
5448 		return;
5449 	conn->write_prev = table->write_list_last;
5450 	if(table->write_list_last)
5451 		table->write_list_last->write_next = conn;
5452 	else table->write_list_first = conn;
5453 	conn->write_next = NULL;
5454 	table->write_list_last = conn;
5455 	conn->on_write_list = 1;
5456 }
5457 
5458 void
doq_conn_write_list_remove(struct doq_table * table,struct doq_conn * conn)5459 doq_conn_write_list_remove(struct doq_table* table, struct doq_conn* conn)
5460 {
5461 	if(!conn->on_write_list)
5462 		return;
5463 	if(conn->write_next)
5464 		conn->write_next->write_prev = conn->write_prev;
5465 	else table->write_list_last = conn->write_prev;
5466 	if(conn->write_prev)
5467 		conn->write_prev->write_next = conn->write_next;
5468 	else table->write_list_first = conn->write_next;
5469 	conn->write_prev = NULL;
5470 	conn->write_next = NULL;
5471 	conn->on_write_list = 0;
5472 }
5473 
5474 void
doq_conn_set_write_list(struct doq_table * table,struct doq_conn * conn)5475 doq_conn_set_write_list(struct doq_table* table, struct doq_conn* conn)
5476 {
5477 	if(conn->write_interest && conn->on_write_list)
5478 		return;
5479 	if(!conn->write_interest && !conn->on_write_list)
5480 		return;
5481 	if(conn->write_interest)
5482 		doq_conn_write_list_append(table, conn);
5483 	else doq_conn_write_list_remove(table, conn);
5484 }
5485 
5486 struct doq_conn*
doq_table_pop_first(struct doq_table * table)5487 doq_table_pop_first(struct doq_table* table)
5488 {
5489 	struct doq_conn* conn = table->write_list_first;
5490 	if(!conn)
5491 		return NULL;
5492 	lock_basic_lock(&conn->lock);
5493 	table->write_list_first = conn->write_next;
5494 	if(conn->write_next)
5495 		conn->write_next->write_prev = NULL;
5496 	else table->write_list_last = NULL;
5497 	conn->write_next = NULL;
5498 	conn->write_prev = NULL;
5499 	conn->on_write_list = 0;
5500 	return conn;
5501 }
5502 
5503 int
doq_conn_check_timer(struct doq_conn * conn,struct timeval * tv)5504 doq_conn_check_timer(struct doq_conn* conn, struct timeval* tv)
5505 {
5506 	ngtcp2_tstamp expiry = ngtcp2_conn_get_expiry(conn->conn);
5507 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
5508 	ngtcp2_tstamp t;
5509 
5510 	if(expiry <= now) {
5511 		/* The timer has already expired, add with zero timeout.
5512 		 * This should call the callback straight away. Calling it
5513 		 * from the event callbacks is cleaner than calling it here,
5514 		 * because then it is always called with the same locks and
5515 		 * so on. This routine only has the conn.lock. */
5516 		t = now;
5517 	} else {
5518 		t = expiry;
5519 	}
5520 
5521 	/* convert to timeval */
5522 	memset(tv, 0, sizeof(*tv));
5523 	tv->tv_sec = t / NGTCP2_SECONDS;
5524 	tv->tv_usec = (t / NGTCP2_MICROSECONDS)%1000000;
5525 
5526 	/* If we already have a timer, is it the right value? */
5527 	if(conn->timer.timer_in_tree || conn->timer.timer_in_list) {
5528 		if(conn->timer.time.tv_sec == tv->tv_sec &&
5529 			conn->timer.time.tv_usec == tv->tv_usec)
5530 			return 0;
5531 	}
5532 	return 1;
5533 }
5534 
5535 /* doq print connection log */
5536 static void
doq_conn_log_line(struct doq_conn * conn,char * s)5537 doq_conn_log_line(struct doq_conn* conn, char* s)
5538 {
5539 	char remotestr[256], localstr[256];
5540 	addr_to_str((void*)&conn->key.paddr.addr, conn->key.paddr.addrlen,
5541 		remotestr, sizeof(remotestr));
5542 	addr_to_str((void*)&conn->key.paddr.localaddr,
5543 		conn->key.paddr.localaddrlen, localstr, sizeof(localstr));
5544 	log_info("doq conn %s %s %s", remotestr, localstr, s);
5545 }
5546 
5547 int
doq_conn_handle_timeout(struct doq_conn * conn)5548 doq_conn_handle_timeout(struct doq_conn* conn)
5549 {
5550 	ngtcp2_tstamp now = doq_get_timestamp_nanosec();
5551 	int rv;
5552 
5553 	if(verbosity >= VERB_ALGO)
5554 		doq_conn_log_line(conn, "timeout");
5555 
5556 	rv = ngtcp2_conn_handle_expiry(conn->conn, now);
5557 	if(rv != 0) {
5558 		verbose(VERB_ALGO, "ngtcp2_conn_handle_expiry failed: %s",
5559 			ngtcp2_strerror(rv));
5560 #ifdef HAVE_NGTCP2_CCERR_DEFAULT
5561 		ngtcp2_ccerr_set_liberr(&conn->ccerr, rv, NULL, 0);
5562 #else
5563 		ngtcp2_connection_close_error_set_transport_error_liberr(
5564 			&conn->last_error, rv, NULL, 0);
5565 #endif
5566 		if(!doq_conn_close_error(conn->doq_socket->cp, conn)) {
5567 			/* failed, return for deletion */
5568 			return 0;
5569 		}
5570 		return 1;
5571 	}
5572 	doq_conn_write_enable(conn);
5573 	if(!doq_conn_write_streams(conn->doq_socket->cp, conn, NULL)) {
5574 		/* failed, return for deletion. */
5575 		return 0;
5576 	}
5577 	return 1;
5578 }
5579 
5580 void
doq_table_quic_size_add(struct doq_table * table,size_t add)5581 doq_table_quic_size_add(struct doq_table* table, size_t add)
5582 {
5583 	lock_basic_lock(&table->size_lock);
5584 	table->current_size += add;
5585 	lock_basic_unlock(&table->size_lock);
5586 }
5587 
5588 void
doq_table_quic_size_subtract(struct doq_table * table,size_t subtract)5589 doq_table_quic_size_subtract(struct doq_table* table, size_t subtract)
5590 {
5591 	lock_basic_lock(&table->size_lock);
5592 	if(table->current_size < subtract)
5593 		table->current_size = 0;
5594 	else	table->current_size -= subtract;
5595 	lock_basic_unlock(&table->size_lock);
5596 }
5597 
5598 int
doq_table_quic_size_available(struct doq_table * table,struct config_file * cfg,size_t mem)5599 doq_table_quic_size_available(struct doq_table* table,
5600 	struct config_file* cfg, size_t mem)
5601 {
5602 	size_t cur;
5603 	lock_basic_lock(&table->size_lock);
5604 	cur = table->current_size;
5605 	lock_basic_unlock(&table->size_lock);
5606 
5607 	if(cur + mem > cfg->quic_size)
5608 		return 0;
5609 	return 1;
5610 }
5611 
doq_table_quic_size_get(struct doq_table * table)5612 size_t doq_table_quic_size_get(struct doq_table* table)
5613 {
5614 	size_t sz;
5615 	if(!table)
5616 		return 0;
5617 	lock_basic_lock(&table->size_lock);
5618 	sz = table->current_size;
5619 	lock_basic_unlock(&table->size_lock);
5620 	return sz;
5621 }
5622 #endif /* HAVE_NGTCP2 */
5623