xref: /freebsd/contrib/unbound/util/netevent.c (revision be771a7b7f4580a30d99e41a5bb1b93a385a119d)
1 /*
2  * util/netevent.c - event notification
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file contains event notification functions.
40  */
41 #include "config.h"
42 #include "util/netevent.h"
43 #include "util/ub_event.h"
44 #include "util/log.h"
45 #include "util/net_help.h"
46 #include "util/tcp_conn_limit.h"
47 #include "util/fptr_wlist.h"
48 #include "util/proxy_protocol.h"
49 #include "util/timeval_func.h"
50 #include "sldns/pkthdr.h"
51 #include "sldns/sbuffer.h"
52 #include "sldns/str2wire.h"
53 #include "dnstap/dnstap.h"
54 #include "dnscrypt/dnscrypt.h"
55 #include "services/listen_dnsport.h"
56 #include "util/random.h"
57 #ifdef HAVE_SYS_TYPES_H
58 #include <sys/types.h>
59 #endif
60 #ifdef HAVE_SYS_SOCKET_H
61 #include <sys/socket.h>
62 #endif
63 #ifdef HAVE_NETDB_H
64 #include <netdb.h>
65 #endif
66 #ifdef HAVE_POLL_H
67 #include <poll.h>
68 #endif
69 
70 #ifdef HAVE_OPENSSL_SSL_H
71 #include <openssl/ssl.h>
72 #endif
73 #ifdef HAVE_OPENSSL_ERR_H
74 #include <openssl/err.h>
75 #endif
76 
77 #ifdef HAVE_NGTCP2
78 #include <ngtcp2/ngtcp2.h>
79 #include <ngtcp2/ngtcp2_crypto.h>
80 #endif
81 
82 #ifdef HAVE_LINUX_NET_TSTAMP_H
83 #include <linux/net_tstamp.h>
84 #endif
85 
86 /* -------- Start of local definitions -------- */
/** if CMSG_ALIGN is not defined on this platform, a workaround.
 * Tried in order: a platform __CMSG_ALIGN, a platform _CMSG_DATA_ALIGN,
 * and last rounding up to a multiple of sizeof(long).
 * The second test looks at _CMSG_DATA_ALIGN, the same name that the
 * definition uses (it used to test CMSG_DATA_ALIGN, without underscore). */
#ifndef CMSG_ALIGN
#  ifdef __CMSG_ALIGN
#    define CMSG_ALIGN(n) __CMSG_ALIGN(n)
#  elif defined(_CMSG_DATA_ALIGN)
#    define CMSG_ALIGN(n) _CMSG_DATA_ALIGN(n)
#  else
#    define CMSG_ALIGN(len) (((len)+sizeof(long)-1) & ~(sizeof(long)-1))
#  endif
#endif

/** if CMSG_LEN is not defined on this platform, a workaround:
 * the aligned header size plus the (unpadded) data length */
#ifndef CMSG_LEN
#  define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr))+(len))
#endif

/** if CMSG_SPACE is not defined on this platform, a workaround:
 * the aligned data length plus the aligned header size */
#ifndef CMSG_SPACE
#  ifdef _CMSG_HDR_ALIGN
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+_CMSG_HDR_ALIGN(sizeof(struct cmsghdr)))
#  else
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+CMSG_ALIGN(sizeof(struct cmsghdr)))
#  endif
#endif
111 
/** Timeout for writing a query over TCP, in milliseconds */
#define TCP_QUERY_TIMEOUT 120000
/** Smallest TCP timeout that is really used, in milliseconds, whatever
 * value is advertised */
#define TCP_QUERY_TIMEOUT_MINIMUM 200

/** number of UDP reads to perform per read indication from select;
 * only one if NONBLOCKING_IS_BROKEN is defined */
#ifdef NONBLOCKING_IS_BROKEN
#define NUM_UDP_PER_SELECT 1
#else
#define NUM_UDP_PER_SELECT 100
#endif

/** time in milliseconds to wait for a write to unblock; the packet is
 * dropped after that */
#define SEND_BLOCKED_WAIT_TIMEOUT 200
/** maximum number of times to wait for a write to unblock; the packet is
 * dropped after that */
#define SEND_BLOCKED_MAX_RETRY 5

/** Give SO_TIMESTAMP* a value when the system headers did not define
 * them, to keep the timestamping code cleaner */
#if !defined(SO_TIMESTAMP)
#define SO_TIMESTAMP 29
#endif
#if !defined(SO_TIMESTAMPNS)
#define SO_TIMESTAMPNS 35
#endif
#if !defined(SO_TIMESTAMPING)
#define SO_TIMESTAMPING 37
#endif
/**
 * Internal event structure, it keeps the ub_event information for one
 * event. It can also be part of other structures (list, tree).
 */
struct internal_event {
	/** the comm base */
	struct comm_base *base;
	/** the ub_event event */
	struct ub_event *ev;
};
150 
/**
 * Internal base structure. Every thread has a base of its own, and with
 * that its own events.
 */
struct internal_base {
	/** the ub_event event_base */
	struct ub_event_base *base;
	/** time in seconds; the seconds time pointer points here */
	time_t secs;
	/** the current time as a timeval */
	struct timeval now;
	/** event that is used for the slow_accept timeouts */
	struct ub_event *slow_accept;
	/** nonzero if slow_accept is enabled */
	int slow_accept_enabled;
	/** time of the last log, for slow logging of file descriptor
	 * errors */
	time_t last_slow_log;
	/** time of the last log, for slow logging of write wait failures */
	time_t last_writewait_log;
};
170 
171 /**
172  * Internal timer structure, to store timer event in.
173  */
174 struct internal_timer {
175 	/** the super struct from which derived */
176 	struct comm_timer super;
177 	/** the comm base */
178 	struct comm_base* base;
179 	/** ub_event event type */
180 	struct ub_event* ev;
181 	/** is timer enabled */
182 	uint8_t enabled;
183 };
184 
/**
 * Internal signal structure, the signal event is stored in it.
 * The structures are chained into a list.
 */
struct internal_signal {
	/** the ub_event event */
	struct ub_event *ev;
	/** the next element in the signal list */
	struct internal_signal *next;
};
194 
/**
 * Create a tcp handler with a parent.
 * This is only the forward declaration of the file-local function, so
 * that it can be called by code that comes before its definition.
 */
static struct comm_point* comm_point_create_tcp_handler(
	struct comm_base *base, struct comm_point* parent, size_t bufsize,
	struct sldns_buffer* spoolbuf, comm_point_callback_type* callback,
	void* callback_arg, struct unbound_socket* socket);
200 
201 /* -------- End of local definitions -------- */
202 
203 struct comm_base*
204 comm_base_create(int sigs)
205 {
206 	struct comm_base* b = (struct comm_base*)calloc(1,
207 		sizeof(struct comm_base));
208 	const char *evnm="event", *evsys="", *evmethod="";
209 
210 	if(!b)
211 		return NULL;
212 	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
213 	if(!b->eb) {
214 		free(b);
215 		return NULL;
216 	}
217 	b->eb->base = ub_default_event_base(sigs, &b->eb->secs, &b->eb->now);
218 	if(!b->eb->base) {
219 		free(b->eb);
220 		free(b);
221 		return NULL;
222 	}
223 	ub_comm_base_now(b);
224 	ub_get_event_sys(b->eb->base, &evnm, &evsys, &evmethod);
225 	verbose(VERB_ALGO, "%s %s uses %s method.", evnm, evsys, evmethod);
226 	return b;
227 }
228 
229 struct comm_base*
230 comm_base_create_event(struct ub_event_base* base)
231 {
232 	struct comm_base* b = (struct comm_base*)calloc(1,
233 		sizeof(struct comm_base));
234 	if(!b)
235 		return NULL;
236 	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
237 	if(!b->eb) {
238 		free(b);
239 		return NULL;
240 	}
241 	b->eb->base = base;
242 	ub_comm_base_now(b);
243 	return b;
244 }
245 
246 void
247 comm_base_delete(struct comm_base* b)
248 {
249 	if(!b)
250 		return;
251 	if(b->eb->slow_accept_enabled) {
252 		if(ub_event_del(b->eb->slow_accept) != 0) {
253 			log_err("could not event_del slow_accept");
254 		}
255 		ub_event_free(b->eb->slow_accept);
256 	}
257 	ub_event_base_free(b->eb->base);
258 	b->eb->base = NULL;
259 	free(b->eb);
260 	free(b);
261 }
262 
263 void
264 comm_base_delete_no_base(struct comm_base* b)
265 {
266 	if(!b)
267 		return;
268 	if(b->eb->slow_accept_enabled) {
269 		if(ub_event_del(b->eb->slow_accept) != 0) {
270 			log_err("could not event_del slow_accept");
271 		}
272 		ub_event_free(b->eb->slow_accept);
273 	}
274 	b->eb->base = NULL;
275 	free(b->eb);
276 	free(b);
277 }
278 
279 void
280 comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv)
281 {
282 	*tt = &b->eb->secs;
283 	*tv = &b->eb->now;
284 }
285 
286 void
287 comm_base_dispatch(struct comm_base* b)
288 {
289 	int retval;
290 	retval = ub_event_base_dispatch(b->eb->base);
291 	if(retval < 0) {
292 		fatal_exit("event_dispatch returned error %d, "
293 			"errno is %s", retval, strerror(errno));
294 	}
295 }
296 
297 void comm_base_exit(struct comm_base* b)
298 {
299 	if(ub_event_base_loopexit(b->eb->base) != 0) {
300 		log_err("Could not loopexit");
301 	}
302 }
303 
304 void comm_base_set_slow_accept_handlers(struct comm_base* b,
305 	void (*stop_acc)(void*), void (*start_acc)(void*), void* arg)
306 {
307 	b->stop_accept = stop_acc;
308 	b->start_accept = start_acc;
309 	b->cb_arg = arg;
310 }
311 
312 struct ub_event_base* comm_base_internal(struct comm_base* b)
313 {
314 	return b->eb->base;
315 }
316 
317 struct ub_event* comm_point_internal(struct comm_point* c)
318 {
319 	return c->ev->ev;
320 }
321 
322 /** see if errno for udp has to be logged or not uses globals */
323 static int
324 udp_send_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
325 {
326 	/* do not log transient errors (unless high verbosity) */
327 #if defined(ENETUNREACH) || defined(EHOSTDOWN) || defined(EHOSTUNREACH) || defined(ENETDOWN)
328 	switch(errno) {
329 #  ifdef ENETUNREACH
330 		case ENETUNREACH:
331 #  endif
332 #  ifdef EHOSTDOWN
333 		case EHOSTDOWN:
334 #  endif
335 #  ifdef EHOSTUNREACH
336 		case EHOSTUNREACH:
337 #  endif
338 #  ifdef ENETDOWN
339 		case ENETDOWN:
340 #  endif
341 		case EPERM:
342 		case EACCES:
343 			if(verbosity < VERB_ALGO)
344 				return 0;
345 			break;
346 		default:
347 			break;
348 	}
349 #endif
350 	/* permission denied is gotten for every send if the
351 	 * network is disconnected (on some OS), squelch it */
352 	if( ((errno == EPERM)
353 #  ifdef EADDRNOTAVAIL
354 		/* 'Cannot assign requested address' also when disconnected */
355 		|| (errno == EADDRNOTAVAIL)
356 #  endif
357 		) && verbosity < VERB_ALGO)
358 		return 0;
359 #  ifdef EADDRINUSE
360 	/* If SO_REUSEADDR is set, we could try to connect to the same server
361 	 * from the same source port twice. */
362 	if(errno == EADDRINUSE && verbosity < VERB_DETAIL)
363 		return 0;
364 #  endif
365 	/* squelch errors where people deploy AAAA ::ffff:bla for
366 	 * authority servers, which we try for intranets. */
367 	if(errno == EINVAL && addr_is_ip4mapped(
368 		(struct sockaddr_storage*)addr, addrlen) &&
369 		verbosity < VERB_DETAIL)
370 		return 0;
371 	/* SO_BROADCAST sockopt can give access to 255.255.255.255,
372 	 * but a dns cache does not need it. */
373 	if(errno == EACCES && addr_is_broadcast(
374 		(struct sockaddr_storage*)addr, addrlen) &&
375 		verbosity < VERB_DETAIL)
376 		return 0;
377 #  ifdef ENOTCONN
378 	/* For 0.0.0.0, ::0 targets it can return that socket is not connected.
379 	 * This can be ignored, and the address skipped. It remains
380 	 * possible to send there for completeness in configuration. */
381 	if(errno == ENOTCONN && addr_is_any(
382 		(struct sockaddr_storage*)addr, addrlen) &&
383 		verbosity < VERB_DETAIL)
384 		return 0;
385 #  endif
386 	return 1;
387 }
388 
/** see if the errno of a failed tcp connect has to be logged; the answer
 * is that of udp_send_errno_needs_log for the same address */
int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
{
	int needs_log = udp_send_errno_needs_log(addr, addrlen);

	return needs_log;
}
393 
394 /* send a UDP reply */
395 int
396 comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
397 	struct sockaddr* addr, socklen_t addrlen, int is_connected)
398 {
399 	ssize_t sent;
400 	log_assert(c->fd != -1);
401 #ifdef UNBOUND_DEBUG
402 	if(sldns_buffer_remaining(packet) == 0)
403 		log_err("error: send empty UDP packet");
404 #endif
405 	log_assert(addr && addrlen > 0);
406 	if(!is_connected) {
407 		sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
408 			sldns_buffer_remaining(packet), 0,
409 			addr, addrlen);
410 	} else {
411 		sent = send(c->fd, (void*)sldns_buffer_begin(packet),
412 			sldns_buffer_remaining(packet), 0);
413 	}
414 	if(sent == -1) {
415 		/* try again and block, waiting for IO to complete,
416 		 * we want to send the answer, and we will wait for
417 		 * the ethernet interface buffer to have space. */
418 #ifndef USE_WINSOCK
419 		if(errno == EAGAIN || errno == EINTR ||
420 #  ifdef EWOULDBLOCK
421 			errno == EWOULDBLOCK ||
422 #  endif
423 			errno == ENOBUFS) {
424 #else
425 		if(WSAGetLastError() == WSAEINPROGRESS ||
426 			WSAGetLastError() == WSAEINTR ||
427 			WSAGetLastError() == WSAENOBUFS ||
428 			WSAGetLastError() == WSAEWOULDBLOCK) {
429 #endif
430 			int retries = 0;
431 			/* if we set the fd blocking, other threads suddenly
432 			 * have a blocking fd that they operate on */
433 			while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && (
434 #ifndef USE_WINSOCK
435 				errno == EAGAIN || errno == EINTR ||
436 #  ifdef EWOULDBLOCK
437 				errno == EWOULDBLOCK ||
438 #  endif
439 				errno == ENOBUFS
440 #else
441 				WSAGetLastError() == WSAEINPROGRESS ||
442 				WSAGetLastError() == WSAEINTR ||
443 				WSAGetLastError() == WSAENOBUFS ||
444 				WSAGetLastError() == WSAEWOULDBLOCK
445 #endif
446 			)) {
447 #if defined(HAVE_POLL) || defined(USE_WINSOCK)
448 				int send_nobufs = (
449 #ifndef USE_WINSOCK
450 					errno == ENOBUFS
451 #else
452 					WSAGetLastError() == WSAENOBUFS
453 #endif
454 				);
455 				struct pollfd p;
456 				int pret;
457 				memset(&p, 0, sizeof(p));
458 				p.fd = c->fd;
459 				p.events = POLLOUT
460 #ifndef USE_WINSOCK
461 					| POLLERR | POLLHUP
462 #endif
463 					;
464 #  ifndef USE_WINSOCK
465 				pret = poll(&p, 1, SEND_BLOCKED_WAIT_TIMEOUT);
466 #  else
467 				pret = WSAPoll(&p, 1,
468 					SEND_BLOCKED_WAIT_TIMEOUT);
469 #  endif
470 				if(pret == 0) {
471 					/* timer expired */
472 					struct comm_base* b = c->ev->base;
473 					if(b->eb->last_writewait_log+SLOW_LOG_TIME <=
474 						b->eb->secs) {
475 						b->eb->last_writewait_log = b->eb->secs;
476 						verbose(VERB_OPS, "send udp blocked "
477 							"for long, dropping packet.");
478 					}
479 					return 0;
480 				} else if(pret < 0 &&
481 #ifndef USE_WINSOCK
482 					errno != EAGAIN && errno != EINTR &&
483 #  ifdef EWOULDBLOCK
484 					errno != EWOULDBLOCK &&
485 #  endif
486 					errno != ENOMEM && errno != ENOBUFS
487 #else
488 					WSAGetLastError() != WSAEINPROGRESS &&
489 					WSAGetLastError() != WSAEINTR &&
490 					WSAGetLastError() != WSAENOBUFS &&
491 					WSAGetLastError() != WSAEWOULDBLOCK
492 #endif
493 					) {
494 					log_err("poll udp out failed: %s",
495 						sock_strerror(errno));
496 					return 0;
497 				} else if((pret < 0 &&
498 #ifndef USE_WINSOCK
499 					( errno == ENOBUFS  /* Maybe some systems */
500 					|| errno == ENOMEM  /* Linux */
501 					|| errno == EAGAIN)  /* Macos, solaris, openbsd */
502 #else
503 					WSAGetLastError() == WSAENOBUFS
504 #endif
505 					) || (send_nobufs && retries > 0)) {
506 					/* ENOBUFS/ENOMEM/EAGAIN, and poll
507 					 * returned without
508 					 * a timeout. Or the retried send call
509 					 * returned ENOBUFS/ENOMEM/EAGAIN.
510 					 * It is good to wait a bit for the
511 					 * error to clear. */
512 					/* The timeout is 20*(2^(retries+1)),
513 					 * it increases exponentially, starting
514 					 * at 40 msec. After 5 tries, 1240 msec
515 					 * have passed in total, when poll
516 					 * returned the error, and 1200 msec
517 					 * when send returned the errors. */
518 #ifndef USE_WINSOCK
519 					pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
520 #else
521 					Sleep((SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
522 					pret = 0;
523 #endif
524 					if(pret < 0
525 #ifndef USE_WINSOCK
526 						&& errno != EAGAIN && errno != EINTR &&
527 #  ifdef EWOULDBLOCK
528 						errno != EWOULDBLOCK &&
529 #  endif
530 						errno != ENOMEM && errno != ENOBUFS
531 #else
532 						/* Sleep does not error */
533 #endif
534 					) {
535 						log_err("poll udp out timer failed: %s",
536 							sock_strerror(errno));
537 					}
538 				}
539 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */
540 				retries++;
541 				if (!is_connected) {
542 					sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
543 						sldns_buffer_remaining(packet), 0,
544 						addr, addrlen);
545 				} else {
546 					sent = send(c->fd, (void*)sldns_buffer_begin(packet),
547 						sldns_buffer_remaining(packet), 0);
548 				}
549 			}
550 		}
551 	}
552 	if(sent == -1) {
553 		if(!udp_send_errno_needs_log(addr, addrlen))
554 			return 0;
555 		if (!is_connected) {
556 			verbose(VERB_OPS, "sendto failed: %s", sock_strerror(errno));
557 		} else {
558 			verbose(VERB_OPS, "send failed: %s", sock_strerror(errno));
559 		}
560 		if(addr)
561 			log_addr(VERB_OPS, "remote address is",
562 				(struct sockaddr_storage*)addr, addrlen);
563 		return 0;
564 	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
565 		log_err("sent %d in place of %d bytes",
566 			(int)sent, (int)sldns_buffer_remaining(packet));
567 		return 0;
568 	}
569 	return 1;
570 }
571 
572 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && (defined(HAVE_RECVMSG) || defined(HAVE_SENDMSG))
573 /** print debug ancillary info */
574 static void p_ancil(const char* str, struct comm_reply* r)
575 {
576 	if(r->srctype != 4 && r->srctype != 6) {
577 		log_info("%s: unknown srctype %d", str, r->srctype);
578 		return;
579 	}
580 
581 	if(r->srctype == 6) {
582 #ifdef IPV6_PKTINFO
583 		char buf[1024];
584 		if(inet_ntop(AF_INET6, &r->pktinfo.v6info.ipi6_addr,
585 			buf, (socklen_t)sizeof(buf)) == 0) {
586 			(void)strlcpy(buf, "(inet_ntop error)", sizeof(buf));
587 		}
588 		buf[sizeof(buf)-1]=0;
589 		log_info("%s: %s %d", str, buf, r->pktinfo.v6info.ipi6_ifindex);
590 #endif
591 	} else if(r->srctype == 4) {
592 #ifdef IP_PKTINFO
593 		char buf1[1024], buf2[1024];
594 		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_addr,
595 			buf1, (socklen_t)sizeof(buf1)) == 0) {
596 			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
597 		}
598 		buf1[sizeof(buf1)-1]=0;
599 #ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST
600 		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_spec_dst,
601 			buf2, (socklen_t)sizeof(buf2)) == 0) {
602 			(void)strlcpy(buf2, "(inet_ntop error)", sizeof(buf2));
603 		}
604 		buf2[sizeof(buf2)-1]=0;
605 #else
606 		buf2[0]=0;
607 #endif
608 		log_info("%s: %d %s %s", str, r->pktinfo.v4info.ipi_ifindex,
609 			buf1, buf2);
610 #elif defined(IP_RECVDSTADDR)
611 		char buf1[1024];
612 		if(inet_ntop(AF_INET, &r->pktinfo.v4addr,
613 			buf1, (socklen_t)sizeof(buf1)) == 0) {
614 			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
615 		}
616 		buf1[sizeof(buf1)-1]=0;
617 		log_info("%s: %s", str, buf1);
618 #endif /* IP_PKTINFO or PI_RECVDSTDADDR */
619 	}
620 }
621 #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG||HAVE_SENDMSG */
622 
623 /** send a UDP reply over specified interface*/
624 static int
625 comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
626 	struct sockaddr* addr, socklen_t addrlen, struct comm_reply* r)
627 {
628 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_SENDMSG)
629 	ssize_t sent;
630 	struct msghdr msg;
631 	struct iovec iov[1];
632 	union {
633 		struct cmsghdr hdr;
634 		char buf[256];
635 	} control;
636 #ifndef S_SPLINT_S
637 	struct cmsghdr *cmsg;
638 #endif /* S_SPLINT_S */
639 
640 	log_assert(c->fd != -1);
641 #ifdef UNBOUND_DEBUG
642 	if(sldns_buffer_remaining(packet) == 0)
643 		log_err("error: send empty UDP packet");
644 #endif
645 	log_assert(addr && addrlen > 0);
646 
647 	msg.msg_name = addr;
648 	msg.msg_namelen = addrlen;
649 	iov[0].iov_base = sldns_buffer_begin(packet);
650 	iov[0].iov_len = sldns_buffer_remaining(packet);
651 	msg.msg_iov = iov;
652 	msg.msg_iovlen = 1;
653 	msg.msg_control = control.buf;
654 #ifndef S_SPLINT_S
655 	msg.msg_controllen = sizeof(control.buf);
656 #endif /* S_SPLINT_S */
657 	msg.msg_flags = 0;
658 
659 #ifndef S_SPLINT_S
660 	cmsg = CMSG_FIRSTHDR(&msg);
661 	if(r->srctype == 4) {
662 #ifdef IP_PKTINFO
663 		void* cmsg_data;
664 		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo));
665 		log_assert(msg.msg_controllen <= sizeof(control.buf));
666 		cmsg->cmsg_level = IPPROTO_IP;
667 		cmsg->cmsg_type = IP_PKTINFO;
668 		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4info,
669 			sizeof(struct in_pktinfo));
670 		/* unset the ifindex to not bypass the routing tables */
671 		cmsg_data = CMSG_DATA(cmsg);
672 		((struct in_pktinfo *) cmsg_data)->ipi_ifindex = 0;
673 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
674 		/* zero the padding bytes inserted by the CMSG_LEN */
675 		if(sizeof(struct in_pktinfo) < cmsg->cmsg_len)
676 			memset(((uint8_t*)(CMSG_DATA(cmsg))) +
677 				sizeof(struct in_pktinfo), 0, cmsg->cmsg_len
678 				- sizeof(struct in_pktinfo));
679 #elif defined(IP_SENDSRCADDR)
680 		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr));
681 		log_assert(msg.msg_controllen <= sizeof(control.buf));
682 		cmsg->cmsg_level = IPPROTO_IP;
683 		cmsg->cmsg_type = IP_SENDSRCADDR;
684 		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4addr,
685 			sizeof(struct in_addr));
686 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
687 		/* zero the padding bytes inserted by the CMSG_LEN */
688 		if(sizeof(struct in_addr) < cmsg->cmsg_len)
689 			memset(((uint8_t*)(CMSG_DATA(cmsg))) +
690 				sizeof(struct in_addr), 0, cmsg->cmsg_len
691 				- sizeof(struct in_addr));
692 #else
693 		verbose(VERB_ALGO, "no IP_PKTINFO or IP_SENDSRCADDR");
694 		msg.msg_control = NULL;
695 #endif /* IP_PKTINFO or IP_SENDSRCADDR */
696 	} else if(r->srctype == 6) {
697 		void* cmsg_data;
698 		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
699 		log_assert(msg.msg_controllen <= sizeof(control.buf));
700 		cmsg->cmsg_level = IPPROTO_IPV6;
701 		cmsg->cmsg_type = IPV6_PKTINFO;
702 		memmove(CMSG_DATA(cmsg), &r->pktinfo.v6info,
703 			sizeof(struct in6_pktinfo));
704 		/* unset the ifindex to not bypass the routing tables */
705 		cmsg_data = CMSG_DATA(cmsg);
706 		((struct in6_pktinfo *) cmsg_data)->ipi6_ifindex = 0;
707 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
708 		/* zero the padding bytes inserted by the CMSG_LEN */
709 		if(sizeof(struct in6_pktinfo) < cmsg->cmsg_len)
710 			memset(((uint8_t*)(CMSG_DATA(cmsg))) +
711 				sizeof(struct in6_pktinfo), 0, cmsg->cmsg_len
712 				- sizeof(struct in6_pktinfo));
713 	} else {
714 		/* try to pass all 0 to use default route */
715 		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
716 		log_assert(msg.msg_controllen <= sizeof(control.buf));
717 		cmsg->cmsg_level = IPPROTO_IPV6;
718 		cmsg->cmsg_type = IPV6_PKTINFO;
719 		memset(CMSG_DATA(cmsg), 0, sizeof(struct in6_pktinfo));
720 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
721 		/* zero the padding bytes inserted by the CMSG_LEN */
722 		if(sizeof(struct in6_pktinfo) < cmsg->cmsg_len)
723 			memset(((uint8_t*)(CMSG_DATA(cmsg))) +
724 				sizeof(struct in6_pktinfo), 0, cmsg->cmsg_len
725 				- sizeof(struct in6_pktinfo));
726 	}
727 #endif /* S_SPLINT_S */
728 	if(verbosity >= VERB_ALGO && r->srctype != 0)
729 		p_ancil("send_udp over interface", r);
730 	sent = sendmsg(c->fd, &msg, 0);
731 	if(sent == -1) {
732 		/* try again and block, waiting for IO to complete,
733 		 * we want to send the answer, and we will wait for
734 		 * the ethernet interface buffer to have space. */
735 #ifndef USE_WINSOCK
736 		if(errno == EAGAIN || errno == EINTR ||
737 #  ifdef EWOULDBLOCK
738 			errno == EWOULDBLOCK ||
739 #  endif
740 			errno == ENOBUFS) {
741 #else
742 		if(WSAGetLastError() == WSAEINPROGRESS ||
743 			WSAGetLastError() == WSAEINTR ||
744 			WSAGetLastError() == WSAENOBUFS ||
745 			WSAGetLastError() == WSAEWOULDBLOCK) {
746 #endif
747 			int retries = 0;
748 			while(sent == -1 && retries < SEND_BLOCKED_MAX_RETRY && (
749 #ifndef USE_WINSOCK
750 				errno == EAGAIN || errno == EINTR ||
751 #  ifdef EWOULDBLOCK
752 				errno == EWOULDBLOCK ||
753 #  endif
754 				errno == ENOBUFS
755 #else
756 				WSAGetLastError() == WSAEINPROGRESS ||
757 				WSAGetLastError() == WSAEINTR ||
758 				WSAGetLastError() == WSAENOBUFS ||
759 				WSAGetLastError() == WSAEWOULDBLOCK
760 #endif
761 			)) {
762 #if defined(HAVE_POLL) || defined(USE_WINSOCK)
763 				int send_nobufs = (
764 #ifndef USE_WINSOCK
765 					errno == ENOBUFS
766 #else
767 					WSAGetLastError() == WSAENOBUFS
768 #endif
769 				);
770 				struct pollfd p;
771 				int pret;
772 				memset(&p, 0, sizeof(p));
773 				p.fd = c->fd;
774 				p.events = POLLOUT
775 #ifndef USE_WINSOCK
776 					| POLLERR | POLLHUP
777 #endif
778 					;
779 #  ifndef USE_WINSOCK
780 				pret = poll(&p, 1, SEND_BLOCKED_WAIT_TIMEOUT);
781 #  else
782 				pret = WSAPoll(&p, 1,
783 					SEND_BLOCKED_WAIT_TIMEOUT);
784 #  endif
785 				if(pret == 0) {
786 					/* timer expired */
787 					struct comm_base* b = c->ev->base;
788 					if(b->eb->last_writewait_log+SLOW_LOG_TIME <=
789 						b->eb->secs) {
790 						b->eb->last_writewait_log = b->eb->secs;
791 						verbose(VERB_OPS, "send udp blocked "
792 							"for long, dropping packet.");
793 					}
794 					return 0;
795 				} else if(pret < 0 &&
796 #ifndef USE_WINSOCK
797 					errno != EAGAIN && errno != EINTR &&
798 #  ifdef EWOULDBLOCK
799 					errno != EWOULDBLOCK &&
800 #  endif
801 					errno != ENOMEM && errno != ENOBUFS
802 #else
803 					WSAGetLastError() != WSAEINPROGRESS &&
804 					WSAGetLastError() != WSAEINTR &&
805 					WSAGetLastError() != WSAENOBUFS &&
806 					WSAGetLastError() != WSAEWOULDBLOCK
807 #endif
808 					) {
809 					log_err("poll udp out failed: %s",
810 						sock_strerror(errno));
811 					return 0;
812 				} else if((pret < 0 &&
813 #ifndef USE_WINSOCK
814 					( errno == ENOBUFS  /* Maybe some systems */
815 					|| errno == ENOMEM  /* Linux */
816 					|| errno == EAGAIN)  /* Macos, solaris, openbsd */
817 #else
818 					WSAGetLastError() == WSAENOBUFS
819 #endif
820 					) || (send_nobufs && retries > 0)) {
821 					/* ENOBUFS/ENOMEM/EAGAIN, and poll
822 					 * returned without
823 					 * a timeout. Or the retried send call
824 					 * returned ENOBUFS/ENOMEM/EAGAIN.
825 					 * It is good to wait a bit for the
826 					 * error to clear. */
827 					/* The timeout is 20*(2^(retries+1)),
828 					 * it increases exponentially, starting
829 					 * at 40 msec. After 5 tries, 1240 msec
830 					 * have passed in total, when poll
831 					 * returned the error, and 1200 msec
832 					 * when send returned the errors. */
833 #ifndef USE_WINSOCK
834 					pret = poll(NULL, 0, (SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
835 #else
836 					Sleep((SEND_BLOCKED_WAIT_TIMEOUT/10)<<(retries+1));
837 					pret = 0;
838 #endif
839 					if(pret < 0
840 #ifndef USE_WINSOCK
841 						&& errno != EAGAIN && errno != EINTR &&
842 #  ifdef EWOULDBLOCK
843 						errno != EWOULDBLOCK &&
844 #  endif
845 						errno != ENOMEM && errno != ENOBUFS
846 #else  /* USE_WINSOCK */
847 						/* Sleep does not error */
848 #endif
849 					) {
850 						log_err("poll udp out timer failed: %s",
851 							sock_strerror(errno));
852 					}
853 				}
854 #endif /* defined(HAVE_POLL) || defined(USE_WINSOCK) */
855 				retries++;
856 				sent = sendmsg(c->fd, &msg, 0);
857 			}
858 		}
859 	}
860 	if(sent == -1) {
861 		if(!udp_send_errno_needs_log(addr, addrlen))
862 			return 0;
863 		verbose(VERB_OPS, "sendmsg failed: %s", strerror(errno));
864 		log_addr(VERB_OPS, "remote address is",
865 			(struct sockaddr_storage*)addr, addrlen);
866 #ifdef __NetBSD__
867 		/* netbsd 7 has IP_PKTINFO for recv but not send */
868 		if(errno == EINVAL && r->srctype == 4)
869 			log_err("sendmsg: No support for sendmsg(IP_PKTINFO). "
870 				"Please disable interface-automatic");
871 #endif
872 		return 0;
873 	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
874 		log_err("sent %d in place of %d bytes",
875 			(int)sent, (int)sldns_buffer_remaining(packet));
876 		return 0;
877 	}
878 	return 1;
879 #else
880 	(void)c;
881 	(void)packet;
882 	(void)addr;
883 	(void)addrlen;
884 	(void)r;
885 	log_err("sendmsg: IPV6_PKTINFO not supported");
886 	return 0;
887 #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_SENDMSG */
888 }
889 
890 /** return true is UDP receive error needs to be logged */
891 static int udp_recv_needs_log(int err)
892 {
893 	switch(err) {
894 	case EACCES: /* some hosts send ICMP 'Permission Denied' */
895 #ifndef USE_WINSOCK
896 	case ECONNREFUSED:
897 #  ifdef ENETUNREACH
898 	case ENETUNREACH:
899 #  endif
900 #  ifdef EHOSTDOWN
901 	case EHOSTDOWN:
902 #  endif
903 #  ifdef EHOSTUNREACH
904 	case EHOSTUNREACH:
905 #  endif
906 #  ifdef ENETDOWN
907 	case ENETDOWN:
908 #  endif
909 #else /* USE_WINSOCK */
910 	case WSAECONNREFUSED:
911 	case WSAENETUNREACH:
912 	case WSAEHOSTDOWN:
913 	case WSAEHOSTUNREACH:
914 	case WSAENETDOWN:
915 #endif
916 		if(verbosity >= VERB_ALGO)
917 			return 1;
918 		return 0;
919 	default:
920 		break;
921 	}
922 	return 1;
923 }
924 
925 /** Parses the PROXYv2 header from buf and updates the comm_reply struct.
926  *  Returns 1 on success, 0 on failure. */
927 static int consume_pp2_header(struct sldns_buffer* buf, struct comm_reply* rep,
928 	int stream) {
929 	size_t size;
930 	struct pp2_header *header;
931 	int err = pp2_read_header(sldns_buffer_begin(buf),
932 		sldns_buffer_remaining(buf));
933 	if(err) return 0;
934 	header = (struct pp2_header*)sldns_buffer_begin(buf);
935 	size = PP2_HEADER_SIZE + ntohs(header->len);
936 	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
937 		/* A connection from the proxy itself.
938 		 * No need to do anything with addresses. */
939 		goto done;
940 	}
941 	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
942 		/* Unspecified family and protocol. This could be used for
943 		 * health checks by proxies.
944 		 * No need to do anything with addresses. */
945 		goto done;
946 	}
947 	/* Read the proxied address */
948 	switch(header->fam_prot) {
949 		case PP2_INET_STREAM:
950 		case PP2_INET_DGRAM:
951 			{
952 			struct sockaddr_in* addr =
953 				(struct sockaddr_in*)&rep->client_addr;
954 			addr->sin_family = AF_INET;
955 			addr->sin_addr.s_addr = header->addr.addr4.src_addr;
956 			addr->sin_port = header->addr.addr4.src_port;
957 			rep->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
958 			}
959 			/* Ignore the destination address; it should be us. */
960 			break;
961 		case PP2_INET6_STREAM:
962 		case PP2_INET6_DGRAM:
963 			{
964 			struct sockaddr_in6* addr =
965 				(struct sockaddr_in6*)&rep->client_addr;
966 			memset(addr, 0, sizeof(*addr));
967 			addr->sin6_family = AF_INET6;
968 			memcpy(&addr->sin6_addr,
969 				header->addr.addr6.src_addr, 16);
970 			addr->sin6_port = header->addr.addr6.src_port;
971 			rep->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
972 			}
973 			/* Ignore the destination address; it should be us. */
974 			break;
975 		default:
976 			log_err("proxy_protocol: unsupported family and "
977 				"protocol 0x%x", (int)header->fam_prot);
978 			return 0;
979 	}
980 	rep->is_proxied = 1;
981 done:
982 	if(!stream) {
983 		/* We are reading a whole packet;
984 		 * Move the rest of the data to overwrite the PROXYv2 header */
985 		/* XXX can we do better to avoid memmove? */
986 		memmove(header, ((char*)header)+size,
987 			sldns_buffer_limit(buf)-size);
988 		sldns_buffer_set_limit(buf, sldns_buffer_limit(buf)-size);
989 	}
990 	return 1;
991 }
992 
993 #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
994 void
995 comm_point_udp_ancil_callback(int fd, short event, void* arg)
996 {
997 	struct comm_reply rep;
998 	struct msghdr msg;
999 	struct iovec iov[1];
1000 	ssize_t rcv;
1001 	union {
1002 		struct cmsghdr hdr;
1003 		char buf[256];
1004 	} ancil;
1005 	int i;
1006 #ifndef S_SPLINT_S
1007 	struct cmsghdr* cmsg;
1008 #endif /* S_SPLINT_S */
1009 #ifdef HAVE_LINUX_NET_TSTAMP_H
1010 	struct timespec *ts;
1011 #endif /* HAVE_LINUX_NET_TSTAMP_H */
1012 
1013 	rep.c = (struct comm_point*)arg;
1014 	log_assert(rep.c->type == comm_udp);
1015 
1016 	if(!(event&UB_EV_READ))
1017 		return;
1018 	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
1019 	ub_comm_base_now(rep.c->ev->base);
1020 	for(i=0; i<NUM_UDP_PER_SELECT; i++) {
1021 		sldns_buffer_clear(rep.c->buffer);
1022 		timeval_clear(&rep.c->recv_tv);
1023 		rep.remote_addrlen = (socklen_t)sizeof(rep.remote_addr);
1024 		log_assert(fd != -1);
1025 		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
1026 		msg.msg_name = &rep.remote_addr;
1027 		msg.msg_namelen = (socklen_t)sizeof(rep.remote_addr);
1028 		iov[0].iov_base = sldns_buffer_begin(rep.c->buffer);
1029 		iov[0].iov_len = sldns_buffer_remaining(rep.c->buffer);
1030 		msg.msg_iov = iov;
1031 		msg.msg_iovlen = 1;
1032 		msg.msg_control = ancil.buf;
1033 #ifndef S_SPLINT_S
1034 		msg.msg_controllen = sizeof(ancil.buf);
1035 #endif /* S_SPLINT_S */
1036 		msg.msg_flags = 0;
1037 		rcv = recvmsg(fd, &msg, MSG_DONTWAIT);
1038 		if(rcv == -1) {
1039 			if(errno != EAGAIN && errno != EINTR
1040 				&& udp_recv_needs_log(errno)) {
1041 				log_err("recvmsg failed: %s", strerror(errno));
1042 			}
1043 			return;
1044 		}
1045 		rep.remote_addrlen = msg.msg_namelen;
1046 		sldns_buffer_skip(rep.c->buffer, rcv);
1047 		sldns_buffer_flip(rep.c->buffer);
1048 		rep.srctype = 0;
1049 		rep.is_proxied = 0;
1050 #ifndef S_SPLINT_S
1051 		for(cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
1052 			cmsg = CMSG_NXTHDR(&msg, cmsg)) {
1053 			if( cmsg->cmsg_level == IPPROTO_IPV6 &&
1054 				cmsg->cmsg_type == IPV6_PKTINFO) {
1055 				rep.srctype = 6;
1056 				memmove(&rep.pktinfo.v6info, CMSG_DATA(cmsg),
1057 					sizeof(struct in6_pktinfo));
1058 				break;
1059 #ifdef IP_PKTINFO
1060 			} else if( cmsg->cmsg_level == IPPROTO_IP &&
1061 				cmsg->cmsg_type == IP_PKTINFO) {
1062 				rep.srctype = 4;
1063 				memmove(&rep.pktinfo.v4info, CMSG_DATA(cmsg),
1064 					sizeof(struct in_pktinfo));
1065 				break;
1066 #elif defined(IP_RECVDSTADDR)
1067 			} else if( cmsg->cmsg_level == IPPROTO_IP &&
1068 				cmsg->cmsg_type == IP_RECVDSTADDR) {
1069 				rep.srctype = 4;
1070 				memmove(&rep.pktinfo.v4addr, CMSG_DATA(cmsg),
1071 					sizeof(struct in_addr));
1072 				break;
1073 #endif /* IP_PKTINFO or IP_RECVDSTADDR */
1074 #ifdef HAVE_LINUX_NET_TSTAMP_H
1075 			} else if( cmsg->cmsg_level == SOL_SOCKET &&
1076 				cmsg->cmsg_type == SO_TIMESTAMPNS) {
1077 				ts = (struct timespec *)CMSG_DATA(cmsg);
1078 				TIMESPEC_TO_TIMEVAL(&rep.c->recv_tv, ts);
1079 			} else if( cmsg->cmsg_level == SOL_SOCKET &&
1080 				cmsg->cmsg_type == SO_TIMESTAMPING) {
1081 				ts = (struct timespec *)CMSG_DATA(cmsg);
1082 				TIMESPEC_TO_TIMEVAL(&rep.c->recv_tv, ts);
1083 			} else if( cmsg->cmsg_level == SOL_SOCKET &&
1084 				cmsg->cmsg_type == SO_TIMESTAMP) {
1085 				memmove(&rep.c->recv_tv, CMSG_DATA(cmsg), sizeof(struct timeval));
1086 #endif /* HAVE_LINUX_NET_TSTAMP_H */
1087 			}
1088 		}
1089 
1090 		if(verbosity >= VERB_ALGO && rep.srctype != 0)
1091 			p_ancil("receive_udp on interface", &rep);
1092 #endif /* S_SPLINT_S */
1093 
1094 		if(rep.c->pp2_enabled && !consume_pp2_header(rep.c->buffer,
1095 			&rep, 0)) {
1096 			log_err("proxy_protocol: could not consume PROXYv2 header");
1097 			return;
1098 		}
1099 		if(!rep.is_proxied) {
1100 			rep.client_addrlen = rep.remote_addrlen;
1101 			memmove(&rep.client_addr, &rep.remote_addr,
1102 				rep.remote_addrlen);
1103 		}
1104 
1105 		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
1106 		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
1107 			/* send back immediate reply */
1108 			struct sldns_buffer *buffer;
1109 #ifdef USE_DNSCRYPT
1110 			buffer = rep.c->dnscrypt_buffer;
1111 #else
1112 			buffer = rep.c->buffer;
1113 #endif
1114 			(void)comm_point_send_udp_msg_if(rep.c, buffer,
1115 				(struct sockaddr*)&rep.remote_addr,
1116 				rep.remote_addrlen, &rep);
1117 		}
1118 		if(!rep.c || rep.c->fd == -1) /* commpoint closed */
1119 			break;
1120 	}
1121 }
1122 #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG */
1123 
1124 void
1125 comm_point_udp_callback(int fd, short event, void* arg)
1126 {
1127 	struct comm_reply rep;
1128 	ssize_t rcv;
1129 	int i;
1130 	struct sldns_buffer *buffer;
1131 
1132 	rep.c = (struct comm_point*)arg;
1133 	log_assert(rep.c->type == comm_udp);
1134 
1135 	if(!(event&UB_EV_READ))
1136 		return;
1137 	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
1138 	ub_comm_base_now(rep.c->ev->base);
1139 	for(i=0; i<NUM_UDP_PER_SELECT; i++) {
1140 		sldns_buffer_clear(rep.c->buffer);
1141 		rep.remote_addrlen = (socklen_t)sizeof(rep.remote_addr);
1142 		log_assert(fd != -1);
1143 		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
1144 		rcv = recvfrom(fd, (void*)sldns_buffer_begin(rep.c->buffer),
1145 			sldns_buffer_remaining(rep.c->buffer), MSG_DONTWAIT,
1146 			(struct sockaddr*)&rep.remote_addr, &rep.remote_addrlen);
1147 		if(rcv == -1) {
1148 #ifndef USE_WINSOCK
1149 			if(errno != EAGAIN && errno != EINTR
1150 				&& udp_recv_needs_log(errno))
1151 				log_err("recvfrom %d failed: %s",
1152 					fd, strerror(errno));
1153 #else
1154 			if(WSAGetLastError() != WSAEINPROGRESS &&
1155 				WSAGetLastError() != WSAECONNRESET &&
1156 				WSAGetLastError()!= WSAEWOULDBLOCK &&
1157 				udp_recv_needs_log(WSAGetLastError()))
1158 				log_err("recvfrom failed: %s",
1159 					wsa_strerror(WSAGetLastError()));
1160 #endif
1161 			return;
1162 		}
1163 		sldns_buffer_skip(rep.c->buffer, rcv);
1164 		sldns_buffer_flip(rep.c->buffer);
1165 		rep.srctype = 0;
1166 		rep.is_proxied = 0;
1167 
1168 		if(rep.c->pp2_enabled && !consume_pp2_header(rep.c->buffer,
1169 			&rep, 0)) {
1170 			log_err("proxy_protocol: could not consume PROXYv2 header");
1171 			return;
1172 		}
1173 		if(!rep.is_proxied) {
1174 			rep.client_addrlen = rep.remote_addrlen;
1175 			memmove(&rep.client_addr, &rep.remote_addr,
1176 				rep.remote_addrlen);
1177 		}
1178 
1179 		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
1180 		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
1181 			/* send back immediate reply */
1182 #ifdef USE_DNSCRYPT
1183 			buffer = rep.c->dnscrypt_buffer;
1184 #else
1185 			buffer = rep.c->buffer;
1186 #endif
1187 			(void)comm_point_send_udp_msg(rep.c, buffer,
1188 				(struct sockaddr*)&rep.remote_addr,
1189 				rep.remote_addrlen, 0);
1190 		}
1191 		if(!rep.c || rep.c->fd != fd) /* commpoint closed to -1 or reused for
1192 		another UDP port. Note rep.c cannot be reused with TCP fd. */
1193 			break;
1194 	}
1195 }
1196 
1197 #ifdef HAVE_NGTCP2
1198 void
1199 doq_pkt_addr_init(struct doq_pkt_addr* paddr)
1200 {
1201 	paddr->addrlen = (socklen_t)sizeof(paddr->addr);
1202 	paddr->localaddrlen = (socklen_t)sizeof(paddr->localaddr);
1203 	paddr->ifindex = 0;
1204 }
1205 
/**
 * Set the ecn value on the socket for the transmission.
 * The value is written with setsockopt: IPV6_TCLASS for AF_INET6 and
 * IP_TOS for every other family. A failure is logged and otherwise ignored.
 */
static void
doq_set_ecn(int fd, int family, uint32_t ecn)
{
	unsigned int tos = ecn;
	if(family != AF_INET6) {
		if(setsockopt(fd, IPPROTO_IP, IP_TOS, &tos,
			(socklen_t)sizeof(tos)) == -1) {
			log_err("setsockopt(.. IP_TOS ..): %s",
				strerror(errno));
		}
		return;
	}
	if(setsockopt(fd, IPPROTO_IPV6, IPV6_TCLASS, &tos,
		(socklen_t)sizeof(tos)) == -1) {
		log_err("setsockopt(.. IPV6_TCLASS ..): %s",
			strerror(errno));
	}
}
1225 
1226 /** set the local address in the control ancillary data */
1227 static void
1228 doq_set_localaddr_cmsg(struct msghdr* msg, size_t control_size,
1229 	struct doq_addr_storage* localaddr, socklen_t localaddrlen,
1230 	int ifindex)
1231 {
1232 #ifndef S_SPLINT_S
1233 	struct cmsghdr* cmsg;
1234 #endif /* S_SPLINT_S */
1235 #ifndef S_SPLINT_S
1236 	cmsg = CMSG_FIRSTHDR(msg);
1237 	if(localaddr->sockaddr.in.sin_family == AF_INET) {
1238 #ifdef IP_PKTINFO
1239 		struct sockaddr_in* sa = (struct sockaddr_in*)localaddr;
1240 		struct in_pktinfo v4info;
1241 		log_assert(localaddrlen >= sizeof(struct sockaddr_in));
1242 		msg->msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo));
1243 		memset(msg->msg_control, 0, msg->msg_controllen);
1244 		log_assert(msg->msg_controllen <= control_size);
1245 		cmsg->cmsg_level = IPPROTO_IP;
1246 		cmsg->cmsg_type = IP_PKTINFO;
1247 		memset(&v4info, 0, sizeof(v4info));
1248 #  ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST
1249 		memmove(&v4info.ipi_spec_dst, &sa->sin_addr,
1250 			sizeof(struct in_addr));
1251 #  else
1252 		memmove(&v4info.ipi_addr, &sa->sin_addr,
1253 			sizeof(struct in_addr));
1254 #  endif
1255 		v4info.ipi_ifindex = ifindex;
1256 		memmove(CMSG_DATA(cmsg), &v4info, sizeof(struct in_pktinfo));
1257 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
1258 #elif defined(IP_SENDSRCADDR)
1259 		struct sockaddr_in* sa= (struct sockaddr_in*)localaddr;
1260 		log_assert(localaddrlen >= sizeof(struct sockaddr_in));
1261 		msg->msg_controllen = CMSG_SPACE(sizeof(struct in_addr));
1262 		memset(msg->msg_control, 0, msg->msg_controllen);
1263 		log_assert(msg->msg_controllen <= control_size);
1264 		cmsg->cmsg_level = IPPROTO_IP;
1265 		cmsg->cmsg_type = IP_SENDSRCADDR;
1266 		memmove(CMSG_DATA(cmsg),  &sa->sin_addr,
1267 			sizeof(struct in_addr));
1268 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
1269 #endif
1270 	} else {
1271 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)localaddr;
1272 		struct in6_pktinfo v6info;
1273 		log_assert(localaddrlen >= sizeof(struct sockaddr_in6));
1274 		msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
1275 		memset(msg->msg_control, 0, msg->msg_controllen);
1276 		log_assert(msg->msg_controllen <= control_size);
1277 		cmsg->cmsg_level = IPPROTO_IPV6;
1278 		cmsg->cmsg_type = IPV6_PKTINFO;
1279 		memset(&v6info, 0, sizeof(v6info));
1280 		memmove(&v6info.ipi6_addr, &sa6->sin6_addr,
1281 			sizeof(struct in6_addr));
1282 		v6info.ipi6_ifindex = ifindex;
1283 		memmove(CMSG_DATA(cmsg), &v6info, sizeof(struct in6_pktinfo));
1284 		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
1285 	}
1286 #endif /* S_SPLINT_S */
1287 	/* Ignore unused variables, if no assertions are compiled. */
1288 	(void)localaddrlen;
1289 	(void)control_size;
1290 }
1291 
1292 /** write address and port into strings */
1293 static int
1294 doq_print_addr_port(struct doq_addr_storage* addr, socklen_t addrlen,
1295 	char* host, size_t hostlen, char* port, size_t portlen)
1296 {
1297 	if(addr->sockaddr.in.sin_family == AF_INET) {
1298 		struct sockaddr_in* sa = (struct sockaddr_in*)addr;
1299 		log_assert(addrlen >= sizeof(*sa));
1300 		if(inet_ntop(sa->sin_family, &sa->sin_addr, host,
1301 			(socklen_t)hostlen) == 0) {
1302 			log_hex("inet_ntop error: address", &sa->sin_addr,
1303 				sizeof(sa->sin_addr));
1304 			return 0;
1305 		}
1306 		snprintf(port, portlen, "%u", (unsigned)ntohs(sa->sin_port));
1307 	} else if(addr->sockaddr.in.sin_family == AF_INET6) {
1308 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)addr;
1309 		log_assert(addrlen >= sizeof(*sa6));
1310 		if(inet_ntop(sa6->sin6_family, &sa6->sin6_addr, host,
1311 			(socklen_t)hostlen) == 0) {
1312 			log_hex("inet_ntop error: address", &sa6->sin6_addr,
1313 				sizeof(sa6->sin6_addr));
1314 			return 0;
1315 		}
1316 		snprintf(port, portlen, "%u", (unsigned)ntohs(sa6->sin6_port));
1317 	}
1318 	return 1;
1319 }
1320 
1321 /** doq store the blocked packet when write has blocked */
1322 static void
1323 doq_store_blocked_pkt(struct comm_point* c, struct doq_pkt_addr* paddr,
1324 	uint32_t ecn)
1325 {
1326 	if(c->doq_socket->have_blocked_pkt)
1327 		return; /* should not happen that we write when there is
1328 		already a blocked write, but if so, drop it. */
1329 	if(sldns_buffer_limit(c->doq_socket->pkt_buf) >
1330 		sldns_buffer_capacity(c->doq_socket->blocked_pkt))
1331 		return; /* impossibly large, drop packet. impossible because
1332 		pkt_buf and blocked_pkt are the same size. */
1333 	c->doq_socket->have_blocked_pkt = 1;
1334 	c->doq_socket->blocked_pkt_pi.ecn = ecn;
1335 	memcpy(c->doq_socket->blocked_paddr, paddr,
1336 		sizeof(*c->doq_socket->blocked_paddr));
1337 	sldns_buffer_clear(c->doq_socket->blocked_pkt);
1338 	sldns_buffer_write(c->doq_socket->blocked_pkt,
1339 		sldns_buffer_begin(c->doq_socket->pkt_buf),
1340 		sldns_buffer_limit(c->doq_socket->pkt_buf));
1341 	sldns_buffer_flip(c->doq_socket->blocked_pkt);
1342 }
1343 
1344 void
1345 doq_send_pkt(struct comm_point* c, struct doq_pkt_addr* paddr, uint32_t ecn)
1346 {
1347 	struct msghdr msg;
1348 	struct iovec iov[1];
1349 	union {
1350 		struct cmsghdr hdr;
1351 		char buf[256];
1352 	} control;
1353 	ssize_t ret;
1354 	iov[0].iov_base = sldns_buffer_begin(c->doq_socket->pkt_buf);
1355 	iov[0].iov_len = sldns_buffer_limit(c->doq_socket->pkt_buf);
1356 	memset(&msg, 0, sizeof(msg));
1357 	msg.msg_name = (void*)&paddr->addr;
1358 	msg.msg_namelen = paddr->addrlen;
1359 	msg.msg_iov = iov;
1360 	msg.msg_iovlen = 1;
1361 	msg.msg_control = control.buf;
1362 #ifndef S_SPLINT_S
1363 	msg.msg_controllen = sizeof(control.buf);
1364 #endif /* S_SPLINT_S */
1365 	msg.msg_flags = 0;
1366 
1367 	doq_set_localaddr_cmsg(&msg, sizeof(control.buf), &paddr->localaddr,
1368 		paddr->localaddrlen, paddr->ifindex);
1369 	doq_set_ecn(c->fd, paddr->addr.sockaddr.in.sin_family, ecn);
1370 
1371 	for(;;) {
1372 		ret = sendmsg(c->fd, &msg, MSG_DONTWAIT);
1373 		if(ret == -1 && errno == EINTR)
1374 			continue;
1375 		break;
1376 	}
1377 	if(ret == -1) {
1378 #ifndef USE_WINSOCK
1379 		if(errno == EAGAIN ||
1380 #  ifdef EWOULDBLOCK
1381 			errno == EWOULDBLOCK ||
1382 #  endif
1383 			errno == ENOBUFS)
1384 #else
1385 		if(WSAGetLastError() == WSAEINPROGRESS ||
1386 			WSAGetLastError() == WSAENOBUFS ||
1387 			WSAGetLastError() == WSAEWOULDBLOCK)
1388 #endif
1389 		{
1390 			/* udp send has blocked */
1391 			doq_store_blocked_pkt(c, paddr, ecn);
1392 			return;
1393 		}
1394 		if(!udp_send_errno_needs_log((void*)&paddr->addr,
1395 			paddr->addrlen))
1396 			return;
1397 		if(verbosity >= VERB_OPS) {
1398 			char host[256], port[32];
1399 			if(doq_print_addr_port(&paddr->addr, paddr->addrlen,
1400 				host, sizeof(host), port, sizeof(port))) {
1401 				verbose(VERB_OPS, "doq sendmsg to %s %s "
1402 					"failed: %s", host, port,
1403 					strerror(errno));
1404 			} else {
1405 				verbose(VERB_OPS, "doq sendmsg failed: %s",
1406 					strerror(errno));
1407 			}
1408 		}
1409 		return;
1410 	} else if(ret != (ssize_t)sldns_buffer_limit(c->doq_socket->pkt_buf)) {
1411 		char host[256], port[32];
1412 		if(doq_print_addr_port(&paddr->addr, paddr->addrlen, host,
1413 			sizeof(host), port, sizeof(port))) {
1414 			log_err("doq sendmsg to %s %s failed: "
1415 				"sent %d in place of %d bytes",
1416 				host, port, (int)ret,
1417 				(int)sldns_buffer_limit(c->doq_socket->pkt_buf));
1418 		} else {
1419 			log_err("doq sendmsg failed: "
1420 				"sent %d in place of %d bytes",
1421 				(int)ret, (int)sldns_buffer_limit(c->doq_socket->pkt_buf));
1422 		}
1423 		return;
1424 	}
1425 }
1426 
1427 /** fetch port number */
1428 static int
1429 doq_sockaddr_get_port(struct doq_addr_storage* addr)
1430 {
1431 	if(addr->sockaddr.in.sin_family == AF_INET) {
1432 		struct sockaddr_in* sa = (struct sockaddr_in*)addr;
1433 		return ntohs(sa->sin_port);
1434 	} else if(addr->sockaddr.in.sin_family == AF_INET6) {
1435 		struct sockaddr_in6* sa6 = (struct sockaddr_in6*)addr;
1436 		return ntohs(sa6->sin6_port);
1437 	}
1438 	return 0;
1439 }
1440 
1441 /** get local address from ancillary data headers */
1442 static int
1443 doq_get_localaddr_cmsg(struct comm_point* c, struct doq_pkt_addr* paddr,
1444 	int* pkt_continue, struct msghdr* msg)
1445 {
1446 #ifndef S_SPLINT_S
1447 	struct cmsghdr* cmsg;
1448 #endif /* S_SPLINT_S */
1449 
1450 	memset(&paddr->localaddr, 0, sizeof(paddr->localaddr));
1451 #ifndef S_SPLINT_S
1452 	for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
1453 		cmsg = CMSG_NXTHDR(msg, cmsg)) {
1454 		if( cmsg->cmsg_level == IPPROTO_IPV6 &&
1455 			cmsg->cmsg_type == IPV6_PKTINFO) {
1456 			struct in6_pktinfo* v6info =
1457 				(struct in6_pktinfo*)CMSG_DATA(cmsg);
1458 			struct sockaddr_in6* sa= (struct sockaddr_in6*)
1459 				&paddr->localaddr;
1460 			struct sockaddr_in6* rema = (struct sockaddr_in6*)
1461 				&paddr->addr;
1462 			if(rema->sin6_family != AF_INET6) {
1463 				log_err("doq cmsg family mismatch cmsg is ip6");
1464 				*pkt_continue = 1;
1465 				return 0;
1466 			}
1467 			sa->sin6_family = AF_INET6;
1468 			sa->sin6_port = htons(doq_sockaddr_get_port(
1469 				(void*)c->socket->addr));
1470 			paddr->ifindex = v6info->ipi6_ifindex;
1471 			memmove(&sa->sin6_addr, &v6info->ipi6_addr,
1472 				sizeof(struct in6_addr));
1473 			paddr->localaddrlen = sizeof(struct sockaddr_in6);
1474 			break;
1475 #ifdef IP_PKTINFO
1476 		} else if( cmsg->cmsg_level == IPPROTO_IP &&
1477 			cmsg->cmsg_type == IP_PKTINFO) {
1478 			struct in_pktinfo* v4info =
1479 				(struct in_pktinfo*)CMSG_DATA(cmsg);
1480 			struct sockaddr_in* sa= (struct sockaddr_in*)
1481 				&paddr->localaddr;
1482 			struct sockaddr_in* rema = (struct sockaddr_in*)
1483 				&paddr->addr;
1484 			if(rema->sin_family != AF_INET) {
1485 				log_err("doq cmsg family mismatch cmsg is ip4");
1486 				*pkt_continue = 1;
1487 				return 0;
1488 			}
1489 			sa->sin_family = AF_INET;
1490 			sa->sin_port = htons(doq_sockaddr_get_port(
1491 				(void*)c->socket->addr));
1492 			paddr->ifindex = v4info->ipi_ifindex;
1493 			memmove(&sa->sin_addr, &v4info->ipi_addr,
1494 				sizeof(struct in_addr));
1495 			paddr->localaddrlen = sizeof(struct sockaddr_in);
1496 			break;
1497 #elif defined(IP_RECVDSTADDR)
1498 		} else if( cmsg->cmsg_level == IPPROTO_IP &&
1499 			cmsg->cmsg_type == IP_RECVDSTADDR) {
1500 			struct sockaddr_in* sa= (struct sockaddr_in*)
1501 				&paddr->localaddr;
1502 			struct sockaddr_in* rema = (struct sockaddr_in*)
1503 				&paddr->addr;
1504 			if(rema->sin_family != AF_INET) {
1505 				log_err("doq cmsg family mismatch cmsg is ip4");
1506 				*pkt_continue = 1;
1507 				return 0;
1508 			}
1509 			sa->sin_family = AF_INET;
1510 			sa->sin_port = htons(doq_sockaddr_get_port(
1511 				(void*)c->socket->addr));
1512 			paddr->ifindex = 0;
1513 			memmove(&sa.sin_addr, CMSG_DATA(cmsg),
1514 				sizeof(struct in_addr));
1515 			paddr->localaddrlen = sizeof(struct sockaddr_in);
1516 			break;
1517 #endif /* IP_PKTINFO or IP_RECVDSTADDR */
1518 		}
1519 	}
1520 #endif /* S_SPLINT_S */
1521 
1522 return 1;
1523 }
1524 
/**
 * Get packet ecn information from the ancillary data.
 * For AF_INET6 the IPV6_TCLASS control message is used, for other families
 * the IP_TOS control message.
 * @param msg: the received message header with the control data.
 * @param family: address family of the packet.
 * @return the value from the first matching control message with a nonzero
 *	cmsg_len, in the range 0..255, or 0 if there is no such message.
 */
static uint32_t
msghdr_get_ecn(struct msghdr* msg, int family)
{
#ifndef S_SPLINT_S
	struct cmsghdr* cmsg;
	if(family == AF_INET6) {
		for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
			cmsg = CMSG_NXTHDR(msg, cmsg)) {
			if(cmsg->cmsg_level != IPPROTO_IPV6 ||
				cmsg->cmsg_type != IPV6_TCLASS ||
				cmsg->cmsg_len == 0)
				continue;
			if(cmsg->cmsg_len >= CMSG_LEN(sizeof(int))) {
				/* The traffic class is passed as an int.
				 * Copy the whole int, because its first
				 * byte is not the low byte on big-endian
				 * hosts. */
				int tclass = 0;
				memcpy(&tclass, CMSG_DATA(cmsg),
					sizeof(tclass));
				return (uint32_t)(tclass & 0xff);
			}
			/* shorter than an int, use the first data byte */
			return *(uint8_t*)CMSG_DATA(cmsg);
		}
		return 0;
	}
	for(cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
		cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if(cmsg->cmsg_level == IPPROTO_IP &&
			cmsg->cmsg_type == IP_TOS &&
			cmsg->cmsg_len != 0) {
			/* the first data byte is used as the TOS value */
			uint8_t* ecn = (uint8_t*)CMSG_DATA(cmsg);
			return *ecn;
		}
	}
#endif /* S_SPLINT_S */
	return 0;
}
1555 
1556 /** receive packet for DoQ on UDP. get ancillary data for addresses,
1557  * return false if failed and the callback can stop receiving UDP packets
1558  * if pkt_continue is false. */
1559 static int
1560 doq_recv(struct comm_point* c, struct doq_pkt_addr* paddr, int* pkt_continue,
1561 	struct ngtcp2_pkt_info* pi)
1562 {
1563 	struct msghdr msg;
1564 	struct iovec iov[1];
1565 	ssize_t rcv;
1566 	union {
1567 		struct cmsghdr hdr;
1568 		char buf[256];
1569 	} ancil;
1570 
1571 	msg.msg_name = &paddr->addr;
1572 	msg.msg_namelen = (socklen_t)sizeof(paddr->addr);
1573 	iov[0].iov_base = sldns_buffer_begin(c->doq_socket->pkt_buf);
1574 	iov[0].iov_len = sldns_buffer_remaining(c->doq_socket->pkt_buf);
1575 	msg.msg_iov = iov;
1576 	msg.msg_iovlen = 1;
1577 	msg.msg_control = ancil.buf;
1578 #ifndef S_SPLINT_S
1579 	msg.msg_controllen = sizeof(ancil.buf);
1580 #endif /* S_SPLINT_S */
1581 	msg.msg_flags = 0;
1582 
1583 	rcv = recvmsg(c->fd, &msg, MSG_DONTWAIT);
1584 	if(rcv == -1) {
1585 		if(errno != EAGAIN && errno != EINTR
1586 			&& udp_recv_needs_log(errno)) {
1587 			log_err("recvmsg failed for doq: %s", strerror(errno));
1588 		}
1589 		*pkt_continue = 0;
1590 		return 0;
1591 	}
1592 
1593 	paddr->addrlen = msg.msg_namelen;
1594 	sldns_buffer_skip(c->doq_socket->pkt_buf, rcv);
1595 	sldns_buffer_flip(c->doq_socket->pkt_buf);
1596 	if(!doq_get_localaddr_cmsg(c, paddr, pkt_continue, &msg))
1597 		return 0;
1598 	pi->ecn = msghdr_get_ecn(&msg, paddr->addr.sockaddr.in.sin_family);
1599 	return 1;
1600 }
1601 
1602 /** send the version negotiation for doq. scid and dcid are flipped around
1603  * to send back to the client. */
1604 static void
1605 doq_send_version_negotiation(struct comm_point* c, struct doq_pkt_addr* paddr,
1606 	const uint8_t* dcid, size_t dcidlen, const uint8_t* scid,
1607 	size_t scidlen)
1608 {
1609 	uint32_t versions[2];
1610 	size_t versions_len = 0;
1611 	ngtcp2_ssize ret;
1612 	uint8_t unused_random;
1613 
1614 	/* fill the array with supported versions */
1615 	versions[0] = NGTCP2_PROTO_VER_V1;
1616 	versions_len = 1;
1617 	unused_random = ub_random_max(c->doq_socket->rnd, 256);
1618 	sldns_buffer_clear(c->doq_socket->pkt_buf);
1619 	ret = ngtcp2_pkt_write_version_negotiation(
1620 		sldns_buffer_begin(c->doq_socket->pkt_buf),
1621 		sldns_buffer_capacity(c->doq_socket->pkt_buf), unused_random,
1622 		dcid, dcidlen, scid, scidlen, versions, versions_len);
1623 	if(ret < 0) {
1624 		log_err("ngtcp2_pkt_write_version_negotiation failed: %s",
1625 			ngtcp2_strerror(ret));
1626 		return;
1627 	}
1628 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
1629 	sldns_buffer_flip(c->doq_socket->pkt_buf);
1630 	doq_send_pkt(c, paddr, 0);
1631 }
1632 
1633 /** Find the doq_conn object by remote address and dcid */
1634 static struct doq_conn*
1635 doq_conn_find(struct doq_table* table, struct doq_addr_storage* addr,
1636 	socklen_t addrlen, struct doq_addr_storage* localaddr,
1637 	socklen_t localaddrlen, int ifindex, const uint8_t* dcid,
1638 	size_t dcidlen)
1639 {
1640 	struct rbnode_type* node;
1641 	struct doq_conn key;
1642 	memset(&key.node, 0, sizeof(key.node));
1643 	key.node.key = &key;
1644 	memmove(&key.key.paddr.addr, addr, addrlen);
1645 	key.key.paddr.addrlen = addrlen;
1646 	memmove(&key.key.paddr.localaddr, localaddr, localaddrlen);
1647 	key.key.paddr.localaddrlen = localaddrlen;
1648 	key.key.paddr.ifindex = ifindex;
1649 	key.key.dcid = (void*)dcid;
1650 	key.key.dcidlen = dcidlen;
1651 	node = rbtree_search(table->conn_tree, &key);
1652 	if(node)
1653 		return (struct doq_conn*)node->key;
1654 	return NULL;
1655 }
1656 
1657 /** find the doq_con by the connection id */
1658 static struct doq_conn*
1659 doq_conn_find_by_id(struct doq_table* table, const uint8_t* dcid,
1660 	size_t dcidlen)
1661 {
1662 	struct doq_conid* conid;
1663 	lock_rw_rdlock(&table->conid_lock);
1664 	conid = doq_conid_find(table, dcid, dcidlen);
1665 	if(conid) {
1666 		/* make a copy of the key */
1667 		struct doq_conn* conn;
1668 		struct doq_conn_key key = conid->key;
1669 		uint8_t cid[NGTCP2_MAX_CIDLEN];
1670 		log_assert(conid->key.dcidlen <= NGTCP2_MAX_CIDLEN);
1671 		memcpy(cid, conid->key.dcid, conid->key.dcidlen);
1672 		key.dcid = cid;
1673 		lock_rw_unlock(&table->conid_lock);
1674 
1675 		/* now that the conid lock is released, look up the conn */
1676 		lock_rw_rdlock(&table->lock);
1677 		conn = doq_conn_find(table, &key.paddr.addr,
1678 			key.paddr.addrlen, &key.paddr.localaddr,
1679 			key.paddr.localaddrlen, key.paddr.ifindex, key.dcid,
1680 			key.dcidlen);
1681 		if(!conn) {
1682 			/* The connection got deleted between the conid lookup
1683 			 * and the connection lock grab, it no longer exists,
1684 			 * so return null. */
1685 			lock_rw_unlock(&table->lock);
1686 			return NULL;
1687 		}
1688 		lock_basic_lock(&conn->lock);
1689 		if(conn->is_deleted) {
1690 			lock_rw_unlock(&table->lock);
1691 			lock_basic_unlock(&conn->lock);
1692 			return NULL;
1693 		}
1694 		lock_rw_unlock(&table->lock);
1695 		return conn;
1696 	}
1697 	lock_rw_unlock(&table->conid_lock);
1698 	return NULL;
1699 }
1700 
1701 /** Find the doq_conn, by addr or by connection id */
1702 static struct doq_conn*
1703 doq_conn_find_by_addr_or_cid(struct doq_table* table,
1704 	struct doq_pkt_addr* paddr, const uint8_t* dcid, size_t dcidlen)
1705 {
1706 	struct doq_conn* conn;
1707 	lock_rw_rdlock(&table->lock);
1708 	conn = doq_conn_find(table, &paddr->addr, paddr->addrlen,
1709 		&paddr->localaddr, paddr->localaddrlen, paddr->ifindex,
1710 		dcid, dcidlen);
1711 	if(conn && conn->is_deleted) {
1712 		conn = NULL;
1713 	}
1714 	if(conn) {
1715 		lock_basic_lock(&conn->lock);
1716 		lock_rw_unlock(&table->lock);
1717 		verbose(VERB_ALGO, "doq: found connection by address, dcid");
1718 	} else {
1719 		lock_rw_unlock(&table->lock);
1720 		conn = doq_conn_find_by_id(table, dcid, dcidlen);
1721 		if(conn) {
1722 			verbose(VERB_ALGO, "doq: found connection by dcid");
1723 		}
1724 	}
1725 	return conn;
1726 }
1727 
1728 /** decode doq packet header, false on handled or failure, true to continue
1729  * to process the packet */
1730 static int
1731 doq_decode_pkt_header_negotiate(struct comm_point* c,
1732 	struct doq_pkt_addr* paddr, struct doq_conn** conn)
1733 {
1734 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1735 	struct ngtcp2_version_cid vc;
1736 #else
1737 	uint32_t version;
1738 	const uint8_t *dcid, *scid;
1739 	size_t dcidlen, scidlen;
1740 #endif
1741 	int rv;
1742 
1743 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1744 	rv = ngtcp2_pkt_decode_version_cid(&vc,
1745 		sldns_buffer_begin(c->doq_socket->pkt_buf),
1746 		sldns_buffer_limit(c->doq_socket->pkt_buf),
1747 		c->doq_socket->sv_scidlen);
1748 #else
1749 	rv = ngtcp2_pkt_decode_version_cid(&version, &dcid, &dcidlen,
1750 		&scid, &scidlen, sldns_buffer_begin(c->doq_socket->pkt_buf),
1751 		sldns_buffer_limit(c->doq_socket->pkt_buf), c->doq_socket->sv_scidlen);
1752 #endif
1753 	if(rv != 0) {
1754 		if(rv == NGTCP2_ERR_VERSION_NEGOTIATION) {
1755 			/* send the version negotiation */
1756 			doq_send_version_negotiation(c, paddr,
1757 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1758 			vc.scid, vc.scidlen, vc.dcid, vc.dcidlen
1759 #else
1760 			scid, scidlen, dcid, dcidlen
1761 #endif
1762 			);
1763 			return 0;
1764 		}
1765 		verbose(VERB_ALGO, "doq: could not decode version "
1766 			"and CID from QUIC packet header: %s",
1767 			ngtcp2_strerror(rv));
1768 		return 0;
1769 	}
1770 
1771 	if(verbosity >= VERB_ALGO) {
1772 		verbose(VERB_ALGO, "ngtcp2_pkt_decode_version_cid packet has "
1773 			"QUIC protocol version %u", (unsigned)
1774 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1775 			vc.
1776 #endif
1777 			version
1778 			);
1779 		log_hex("dcid",
1780 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1781 			(void*)vc.dcid, vc.dcidlen
1782 #else
1783 			(void*)dcid, dcidlen
1784 #endif
1785 			);
1786 		log_hex("scid",
1787 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1788 			(void*)vc.scid, vc.scidlen
1789 #else
1790 			(void*)scid, scidlen
1791 #endif
1792 			);
1793 	}
1794 	*conn = doq_conn_find_by_addr_or_cid(c->doq_socket->table, paddr,
1795 #ifdef HAVE_STRUCT_NGTCP2_VERSION_CID
1796 		vc.dcid, vc.dcidlen
1797 #else
1798 		dcid, dcidlen
1799 #endif
1800 		);
1801 	if(*conn)
1802 		(*conn)->doq_socket = c->doq_socket;
1803 	return 1;
1804 }
1805 
/** fill cid structure with random data; datalen is capped at the size of
 * the local 32 byte buffer */
static void doq_cid_randfill(struct ngtcp2_cid* cid, size_t datalen,
	struct ub_randstate* rnd)
{
	uint8_t rnd_bytes[32];
	size_t fill = datalen;
	if(fill > sizeof(rnd_bytes))
		fill = sizeof(rnd_bytes);
	doq_fill_rand(rnd, rnd_bytes, fill);
	ngtcp2_cid_init(cid, rnd_bytes, fill);
}
1816 
1817 /** send retry packet for doq connection. */
1818 static void
1819 doq_send_retry(struct comm_point* c, struct doq_pkt_addr* paddr,
1820 	struct ngtcp2_pkt_hd* hd)
1821 {
1822 	char host[256], port[32];
1823 	struct ngtcp2_cid scid;
1824 	uint8_t token[NGTCP2_CRYPTO_MAX_RETRY_TOKENLEN];
1825 	ngtcp2_tstamp ts;
1826 	ngtcp2_ssize tokenlen, ret;
1827 
1828 	if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host,
1829 		sizeof(host), port, sizeof(port))) {
1830 		log_err("doq_send_retry failed");
1831 		return;
1832 	}
1833 	verbose(VERB_ALGO, "doq: sending retry packet to %s %s", host, port);
1834 
1835 	/* the server chosen source connection ID */
1836 	scid.datalen = c->doq_socket->sv_scidlen;
1837 	doq_cid_randfill(&scid, scid.datalen, c->doq_socket->rnd);
1838 
1839 	ts = doq_get_timestamp_nanosec();
1840 
1841 	tokenlen = ngtcp2_crypto_generate_retry_token(token,
1842 		c->doq_socket->static_secret, c->doq_socket->static_secret_len,
1843 		hd->version, (void*)&paddr->addr, paddr->addrlen, &scid,
1844 		&hd->dcid, ts);
1845 	if(tokenlen < 0) {
1846 		log_err("ngtcp2_crypto_generate_retry_token failed: %s",
1847 			ngtcp2_strerror(tokenlen));
1848 		return;
1849 	}
1850 
1851 	sldns_buffer_clear(c->doq_socket->pkt_buf);
1852 	ret = ngtcp2_crypto_write_retry(sldns_buffer_begin(c->doq_socket->pkt_buf),
1853 		sldns_buffer_capacity(c->doq_socket->pkt_buf), hd->version,
1854 		&hd->scid, &scid, &hd->dcid, token, tokenlen);
1855 	if(ret < 0) {
1856 		log_err("ngtcp2_crypto_write_retry failed: %s",
1857 			ngtcp2_strerror(ret));
1858 		return;
1859 	}
1860 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
1861 	sldns_buffer_flip(c->doq_socket->pkt_buf);
1862 	doq_send_pkt(c, paddr, 0);
1863 }
1864 
1865 /** doq send stateless connection close */
1866 static void
1867 doq_send_stateless_connection_close(struct comm_point* c,
1868 	struct doq_pkt_addr* paddr, struct ngtcp2_pkt_hd* hd,
1869 	uint64_t error_code)
1870 {
1871 	ngtcp2_ssize ret;
1872 	sldns_buffer_clear(c->doq_socket->pkt_buf);
1873 	ret = ngtcp2_crypto_write_connection_close(
1874 		sldns_buffer_begin(c->doq_socket->pkt_buf),
1875 		sldns_buffer_capacity(c->doq_socket->pkt_buf), hd->version, &hd->scid,
1876 		&hd->dcid, error_code, NULL, 0);
1877 	if(ret < 0) {
1878 		log_err("ngtcp2_crypto_write_connection_close failed: %s",
1879 			ngtcp2_strerror(ret));
1880 		return;
1881 	}
1882 	sldns_buffer_set_position(c->doq_socket->pkt_buf, ret);
1883 	sldns_buffer_flip(c->doq_socket->pkt_buf);
1884 	doq_send_pkt(c, paddr, 0);
1885 }
1886 
1887 /** doq verify retry token, false on failure */
1888 static int
1889 doq_verify_retry_token(struct comm_point* c, struct doq_pkt_addr* paddr,
1890 	struct ngtcp2_cid* ocid, struct ngtcp2_pkt_hd* hd)
1891 {
1892 	char host[256], port[32];
1893 	ngtcp2_tstamp ts;
1894 	if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host,
1895 		sizeof(host), port, sizeof(port))) {
1896 		log_err("doq_verify_retry_token failed");
1897 		return 0;
1898 	}
1899 	ts = doq_get_timestamp_nanosec();
1900 	verbose(VERB_ALGO, "doq: verifying retry token from %s %s", host,
1901 		port);
1902 	if(ngtcp2_crypto_verify_retry_token(ocid,
1903 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
1904 		hd->token, hd->tokenlen,
1905 #else
1906 		hd->token.base, hd->token.len,
1907 #endif
1908 		c->doq_socket->static_secret,
1909 		c->doq_socket->static_secret_len, hd->version,
1910 		(void*)&paddr->addr, paddr->addrlen, &hd->dcid,
1911 		10*NGTCP2_SECONDS, ts) != 0) {
1912 		verbose(VERB_ALGO, "doq: could not verify retry token "
1913 			"from %s %s", host, port);
1914 		return 0;
1915 	}
1916 	verbose(VERB_ALGO, "doq: verified retry token from %s %s", host, port);
1917 	return 1;
1918 }
1919 
1920 /** doq verify token, false on failure */
1921 static int
1922 doq_verify_token(struct comm_point* c, struct doq_pkt_addr* paddr,
1923 	struct ngtcp2_pkt_hd* hd)
1924 {
1925 	char host[256], port[32];
1926 	ngtcp2_tstamp ts;
1927 	if(!doq_print_addr_port(&paddr->addr, paddr->addrlen, host,
1928 		sizeof(host), port, sizeof(port))) {
1929 		log_err("doq_verify_token failed");
1930 		return 0;
1931 	}
1932 	ts = doq_get_timestamp_nanosec();
1933 	verbose(VERB_ALGO, "doq: verifying token from %s %s", host, port);
1934 	if(ngtcp2_crypto_verify_regular_token(
1935 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
1936 		hd->token, hd->tokenlen,
1937 #else
1938 		hd->token.base, hd->token.len,
1939 #endif
1940 		c->doq_socket->static_secret, c->doq_socket->static_secret_len,
1941 		(void*)&paddr->addr, paddr->addrlen, 3600*NGTCP2_SECONDS,
1942 		ts) != 0) {
1943 		verbose(VERB_ALGO, "doq: could not verify token from %s %s",
1944 			host, port);
1945 		return 0;
1946 	}
1947 	verbose(VERB_ALGO, "doq: verified token from %s %s", host, port);
1948 	return 1;
1949 }
1950 
1951 /** delete and remove from the lookup tree the doq_conn connection */
1952 static void
1953 doq_delete_connection(struct comm_point* c, struct doq_conn* conn)
1954 {
1955 	struct doq_conn copy;
1956 	uint8_t cid[NGTCP2_MAX_CIDLEN];
1957 	rbnode_type* node;
1958 	if(!conn)
1959 		return;
1960 	/* Copy the key and set it deleted. */
1961 	conn->is_deleted = 1;
1962 	doq_conn_write_disable(conn);
1963 	copy.key = conn->key;
1964 	log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN);
1965 	memcpy(cid, conn->key.dcid, conn->key.dcidlen);
1966 	copy.key.dcid = cid;
1967 	copy.node.key = &copy;
1968 	lock_basic_unlock(&conn->lock);
1969 
1970 	/* Now get the table lock to delete it from the tree */
1971 	lock_rw_wrlock(&c->doq_socket->table->lock);
1972 	node = rbtree_delete(c->doq_socket->table->conn_tree, copy.node.key);
1973 	if(node) {
1974 		conn = (struct doq_conn*)node->key;
1975 		lock_basic_lock(&conn->lock);
1976 		doq_conn_write_list_remove(c->doq_socket->table, conn);
1977 		if(conn->timer.timer_in_list) {
1978 			/* Remove timer from list first, because finding the
1979 			 * rbnode element of the setlist of same timeouts
1980 			 * needs tree lookup. Edit the tree structure after
1981 			 * that lookup. */
1982 			doq_timer_list_remove(c->doq_socket->table,
1983 				&conn->timer);
1984 		}
1985 		if(conn->timer.timer_in_tree)
1986 			doq_timer_tree_remove(c->doq_socket->table,
1987 				&conn->timer);
1988 	}
1989 	lock_rw_unlock(&c->doq_socket->table->lock);
1990 	if(node) {
1991 		lock_basic_unlock(&conn->lock);
1992 		doq_table_quic_size_subtract(c->doq_socket->table,
1993 			sizeof(*conn)+conn->key.dcidlen);
1994 		doq_conn_delete(conn, c->doq_socket->table);
1995 	}
1996 }
1997 
1998 /** create and setup a new doq connection, to a new destination, or with
1999  * a new dcid. It has a new set of streams. It is inserted in the lookup tree.
2000  * Returns NULL on failure. */
2001 static struct doq_conn*
2002 doq_setup_new_conn(struct comm_point* c, struct doq_pkt_addr* paddr,
2003 	struct ngtcp2_pkt_hd* hd, struct ngtcp2_cid* ocid)
2004 {
2005 	struct doq_conn* conn;
2006 	if(!doq_table_quic_size_available(c->doq_socket->table,
2007 		c->doq_socket->cfg, sizeof(*conn)+hd->dcid.datalen
2008 		+ sizeof(struct doq_stream)
2009 		+ 100 /* estimated input query */
2010 		+ 1200 /* estimated output query */)) {
2011 		verbose(VERB_ALGO, "doq: no mem available for new connection");
2012 		doq_send_stateless_connection_close(c, paddr, hd,
2013 			NGTCP2_CONNECTION_REFUSED);
2014 		return NULL;
2015 	}
2016 	conn = doq_conn_create(c, paddr, hd->dcid.data, hd->dcid.datalen,
2017 		hd->version);
2018 	if(!conn) {
2019 		log_err("doq: could not allocate doq_conn");
2020 		return NULL;
2021 	}
2022 	lock_rw_wrlock(&c->doq_socket->table->lock);
2023 	lock_basic_lock(&conn->lock);
2024 	if(!rbtree_insert(c->doq_socket->table->conn_tree, &conn->node)) {
2025 		lock_rw_unlock(&c->doq_socket->table->lock);
2026 		log_err("doq: duplicate connection");
2027 		/* conn has no entry in writelist, and no timer yet. */
2028 		lock_basic_unlock(&conn->lock);
2029 		doq_conn_delete(conn, c->doq_socket->table);
2030 		return NULL;
2031 	}
2032 	lock_rw_unlock(&c->doq_socket->table->lock);
2033 	doq_table_quic_size_add(c->doq_socket->table,
2034 		sizeof(*conn)+conn->key.dcidlen);
2035 	verbose(VERB_ALGO, "doq: created new connection");
2036 
2037 	/* the scid and dcid switch meaning from the accepted client
2038 	 * connection to the server connection. The 'source' and 'destination'
2039 	 * meaning is reversed. */
2040 	if(!doq_conn_setup(conn, hd->scid.data, hd->scid.datalen,
2041 		(ocid?ocid->data:NULL), (ocid?ocid->datalen:0),
2042 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
2043 		hd->token, hd->tokenlen
2044 #else
2045 		hd->token.base, hd->token.len
2046 #endif
2047 		)) {
2048 		log_err("doq: could not set up connection");
2049 		doq_delete_connection(c, conn);
2050 		return NULL;
2051 	}
2052 	return conn;
2053 }
2054 
2055 /** perform doq address validation */
2056 static int
2057 doq_address_validation(struct comm_point* c, struct doq_pkt_addr* paddr,
2058 	struct ngtcp2_pkt_hd* hd, struct ngtcp2_cid* ocid,
2059 	struct ngtcp2_cid** pocid)
2060 {
2061 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
2062 	const uint8_t* token = hd->token;
2063 	size_t tokenlen = hd->tokenlen;
2064 #else
2065 	const uint8_t* token = hd->token.base;
2066 	size_t tokenlen = hd->token.len;
2067 #endif
2068 	verbose(VERB_ALGO, "doq stateless address validation");
2069 
2070 	if(tokenlen == 0 || token == NULL) {
2071 		doq_send_retry(c, paddr, hd);
2072 		return 0;
2073 	}
2074 	if(token[0] != NGTCP2_CRYPTO_TOKEN_MAGIC_RETRY &&
2075 		hd->dcid.datalen < NGTCP2_MIN_INITIAL_DCIDLEN) {
2076 		doq_send_stateless_connection_close(c, paddr, hd,
2077 			NGTCP2_INVALID_TOKEN);
2078 		return 0;
2079 	}
2080 	if(token[0] == NGTCP2_CRYPTO_TOKEN_MAGIC_RETRY) {
2081 		if(!doq_verify_retry_token(c, paddr, ocid, hd)) {
2082 			doq_send_stateless_connection_close(c, paddr, hd,
2083 				NGTCP2_INVALID_TOKEN);
2084 			return 0;
2085 		}
2086 		*pocid = ocid;
2087 	} else if(token[0] == NGTCP2_CRYPTO_TOKEN_MAGIC_REGULAR) {
2088 		if(!doq_verify_token(c, paddr, hd)) {
2089 			doq_send_retry(c, paddr, hd);
2090 			return 0;
2091 		}
2092 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
2093 		hd->token = NULL;
2094 		hd->tokenlen = 0;
2095 #else
2096 		hd->token.base = NULL;
2097 		hd->token.len = 0;
2098 #endif
2099 	} else {
2100 		verbose(VERB_ALGO, "doq address validation: unrecognised "
2101 			"token in hd.token.base with magic byte 0x%2.2x",
2102 			(int)token[0]);
2103 		if(c->doq_socket->validate_addr) {
2104 			doq_send_retry(c, paddr, hd);
2105 			return 0;
2106 		}
2107 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
2108 		hd->token = NULL;
2109 		hd->tokenlen = 0;
2110 #else
2111 		hd->token.base = NULL;
2112 		hd->token.len = 0;
2113 #endif
2114 	}
2115 	return 1;
2116 }
2117 
2118 /** the doq accept, returns false if no further processing of content */
2119 static int
2120 doq_accept(struct comm_point* c, struct doq_pkt_addr* paddr,
2121 	struct doq_conn** conn, struct ngtcp2_pkt_info* pi)
2122 {
2123 	int rv;
2124 	struct ngtcp2_pkt_hd hd;
2125 	struct ngtcp2_cid ocid, *pocid=NULL;
2126 	int err_retry;
2127 	memset(&hd, 0, sizeof(hd));
2128 	rv = ngtcp2_accept(&hd, sldns_buffer_begin(c->doq_socket->pkt_buf),
2129 		sldns_buffer_limit(c->doq_socket->pkt_buf));
2130 	if(rv != 0) {
2131 		if(rv == NGTCP2_ERR_RETRY) {
2132 			doq_send_retry(c, paddr, &hd);
2133 			return 0;
2134 		}
2135 		log_err("doq: initial packet failed, ngtcp2_accept failed: %s",
2136 			ngtcp2_strerror(rv));
2137 		return 0;
2138 	}
2139 	if(c->doq_socket->validate_addr ||
2140 #ifdef HAVE_STRUCT_NGTCP2_PKT_HD_TOKENLEN
2141 		hd.tokenlen
2142 #else
2143 		hd.token.len
2144 #endif
2145 		) {
2146 		if(!doq_address_validation(c, paddr, &hd, &ocid, &pocid))
2147 			return 0;
2148 	}
2149 	*conn = doq_setup_new_conn(c, paddr, &hd, pocid);
2150 	if(!*conn)
2151 		return 0;
2152 	(*conn)->doq_socket = c->doq_socket;
2153 	if(!doq_conn_recv(c, paddr, *conn, pi, &err_retry, NULL)) {
2154 		if(err_retry)
2155 			doq_send_retry(c, paddr, &hd);
2156 		doq_delete_connection(c, *conn);
2157 		*conn = NULL;
2158 		return 0;
2159 	}
2160 	return 1;
2161 }
2162 
2163 /** doq pickup a timer to wait for for the worker. If any timer exists. */
2164 static void
2165 doq_pickup_timer(struct comm_point* c)
2166 {
2167 	struct doq_timer* t;
2168 	struct timeval tv;
2169 	int have_time = 0;
2170 	memset(&tv, 0, sizeof(tv));
2171 
2172 	lock_rw_wrlock(&c->doq_socket->table->lock);
2173 	RBTREE_FOR(t, struct doq_timer*, c->doq_socket->table->timer_tree) {
2174 		if(t->worker_doq_socket == NULL ||
2175 			t->worker_doq_socket == c->doq_socket) {
2176 			/* pick up this element */
2177 			t->worker_doq_socket = c->doq_socket;
2178 			have_time = 1;
2179 			memcpy(&tv, &t->time, sizeof(tv));
2180 			break;
2181 		}
2182 	}
2183 	lock_rw_unlock(&c->doq_socket->table->lock);
2184 
2185 	if(have_time) {
2186 		struct timeval rel;
2187 		timeval_subtract(&rel, &tv, c->doq_socket->now_tv);
2188 		comm_timer_set(c->doq_socket->timer, &rel);
2189 		memcpy(&c->doq_socket->marked_time, &tv,
2190 			sizeof(c->doq_socket->marked_time));
2191 		verbose(VERB_ALGO, "doq pickup timer at %d.%6.6d in %d.%6.6d",
2192 			(int)tv.tv_sec, (int)tv.tv_usec, (int)rel.tv_sec,
2193 			(int)rel.tv_usec);
2194 	} else {
2195 		if(comm_timer_is_set(c->doq_socket->timer))
2196 			comm_timer_disable(c->doq_socket->timer);
2197 		memset(&c->doq_socket->marked_time, 0,
2198 			sizeof(c->doq_socket->marked_time));
2199 		verbose(VERB_ALGO, "doq timer disabled");
2200 	}
2201 }
2202 
2203 /** doq done with connection, release locks and setup timer and write */
2204 static void
2205 doq_done_setup_timer_and_write(struct comm_point* c, struct doq_conn* conn)
2206 {
2207 	struct doq_conn copy;
2208 	uint8_t cid[NGTCP2_MAX_CIDLEN];
2209 	rbnode_type* node;
2210 	struct timeval new_tv;
2211 	int write_change = 0, timer_change = 0;
2212 
2213 	/* No longer in callbacks, so the pointer to doq_socket is back
2214 	 * to NULL. */
2215 	conn->doq_socket = NULL;
2216 
2217 	if(doq_conn_check_timer(conn, &new_tv))
2218 		timer_change = 1;
2219 	if( (conn->write_interest && !conn->on_write_list) ||
2220 		(!conn->write_interest && conn->on_write_list))
2221 		write_change = 1;
2222 
2223 	if(!timer_change && !write_change) {
2224 		/* Nothing to do. */
2225 		lock_basic_unlock(&conn->lock);
2226 		return;
2227 	}
2228 
2229 	/* The table lock is needed to change the write list and timer tree.
2230 	 * So the connection lock is release and then the connection is
2231 	 * looked up again. */
2232 	copy.key = conn->key;
2233 	log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN);
2234 	memcpy(cid, conn->key.dcid, conn->key.dcidlen);
2235 	copy.key.dcid = cid;
2236 	copy.node.key = &copy;
2237 	lock_basic_unlock(&conn->lock);
2238 
2239 	lock_rw_wrlock(&c->doq_socket->table->lock);
2240 	node = rbtree_search(c->doq_socket->table->conn_tree, copy.node.key);
2241 	if(!node) {
2242 		lock_rw_unlock(&c->doq_socket->table->lock);
2243 		/* Must have been deleted in the mean time. */
2244 		return;
2245 	}
2246 	conn = (struct doq_conn*)node->key;
2247 	lock_basic_lock(&conn->lock);
2248 	if(conn->is_deleted) {
2249 		/* It is deleted now. */
2250 		lock_rw_unlock(&c->doq_socket->table->lock);
2251 		lock_basic_unlock(&conn->lock);
2252 		return;
2253 	}
2254 
2255 	if(write_change) {
2256 		/* Edit the write lists, we are holding the table.lock and can
2257 		 * edit the list first,last and also prev,next and on_list
2258 		 * elements in the doq_conn structures. */
2259 		doq_conn_set_write_list(c->doq_socket->table, conn);
2260 	}
2261 	if(timer_change) {
2262 		doq_timer_set(c->doq_socket->table, &conn->timer,
2263 			c->doq_socket, &new_tv);
2264 	}
2265 	lock_rw_unlock(&c->doq_socket->table->lock);
2266 	lock_basic_unlock(&conn->lock);
2267 }
2268 
2269 /** doq done with connection callbacks, release locks and setup write */
2270 static void
2271 doq_done_with_conn_cb(struct comm_point* c, struct doq_conn* conn)
2272 {
2273 	struct doq_conn copy;
2274 	uint8_t cid[NGTCP2_MAX_CIDLEN];
2275 	rbnode_type* node;
2276 
2277 	/* no longer in callbacks, so the pointer to doq_socket is back
2278 	 * to NULL. */
2279 	conn->doq_socket = NULL;
2280 
2281 	if( (conn->write_interest && conn->on_write_list) ||
2282 		(!conn->write_interest && !conn->on_write_list)) {
2283 		/* The connection already has the required write list
2284 		 * status. */
2285 		lock_basic_unlock(&conn->lock);
2286 		return;
2287 	}
2288 
2289 	/* To edit the write list of connections we have to hold the table
2290 	 * lock, so we release the connection and then look it up again. */
2291 	copy.key = conn->key;
2292 	log_assert(conn->key.dcidlen <= NGTCP2_MAX_CIDLEN);
2293 	memcpy(cid, conn->key.dcid, conn->key.dcidlen);
2294 	copy.key.dcid = cid;
2295 	copy.node.key = &copy;
2296 	lock_basic_unlock(&conn->lock);
2297 
2298 	lock_rw_wrlock(&c->doq_socket->table->lock);
2299 	node = rbtree_search(c->doq_socket->table->conn_tree, copy.node.key);
2300 	if(!node) {
2301 		lock_rw_unlock(&c->doq_socket->table->lock);
2302 		/* must have been deleted in the mean time */
2303 		return;
2304 	}
2305 	conn = (struct doq_conn*)node->key;
2306 	lock_basic_lock(&conn->lock);
2307 	if(conn->is_deleted) {
2308 		/* it is deleted now. */
2309 		lock_rw_unlock(&c->doq_socket->table->lock);
2310 		lock_basic_unlock(&conn->lock);
2311 		return;
2312 	}
2313 
2314 	/* edit the write lists, we are holding the table.lock and can
2315 	 * edit the list first,last and also prev,next and on_list elements
2316 	 * in the doq_conn structures. */
2317 	doq_conn_set_write_list(c->doq_socket->table, conn);
2318 	lock_rw_unlock(&c->doq_socket->table->lock);
2319 	lock_basic_unlock(&conn->lock);
2320 }
2321 
2322 /** doq count the length of the write list */
2323 static size_t
2324 doq_write_list_length(struct comm_point* c)
2325 {
2326 	size_t count = 0;
2327 	struct doq_conn* conn;
2328 	lock_rw_rdlock(&c->doq_socket->table->lock);
2329 	conn = c->doq_socket->table->write_list_first;
2330 	while(conn) {
2331 		count++;
2332 		conn = conn->write_next;
2333 	}
2334 	lock_rw_unlock(&c->doq_socket->table->lock);
2335 	return count;
2336 }
2337 
2338 /** doq pop the first element from the write list to have write events */
2339 static struct doq_conn*
2340 doq_pop_write_conn(struct comm_point* c)
2341 {
2342 	struct doq_conn* conn;
2343 	lock_rw_wrlock(&c->doq_socket->table->lock);
2344 	conn = doq_table_pop_first(c->doq_socket->table);
2345 	while(conn && conn->is_deleted) {
2346 		lock_basic_unlock(&conn->lock);
2347 		conn = doq_table_pop_first(c->doq_socket->table);
2348 	}
2349 	lock_rw_unlock(&c->doq_socket->table->lock);
2350 	if(conn)
2351 		conn->doq_socket = c->doq_socket;
2352 	return conn;
2353 }
2354 
/**
 * doq the connection is done with write callbacks, release it.
 * @param c: comm point with the doq socket.
 * @param conn: the connection.
 * @param delete_it: if true the connection is deleted, otherwise its
 *	timer and write list status are set up.
 */
static void
doq_done_with_write_cb(struct comm_point* c, struct doq_conn* conn,
	int delete_it)
{
	if(!delete_it)
		doq_done_setup_timer_and_write(c, conn);
	else	doq_delete_connection(c, conn);
}
2366 
2367 /** see if the doq socket wants to write packets */
2368 static int
2369 doq_socket_want_write(struct comm_point* c)
2370 {
2371 	int want_write = 0;
2372 	if(c->doq_socket->have_blocked_pkt)
2373 		return 1;
2374 	lock_rw_rdlock(&c->doq_socket->table->lock);
2375 	if(c->doq_socket->table->write_list_first)
2376 		want_write = 1;
2377 	lock_rw_unlock(&c->doq_socket->table->lock);
2378 	return want_write;
2379 }
2380 
2381 /** enable write event for the doq server socket fd */
2382 static void
2383 doq_socket_write_enable(struct comm_point* c)
2384 {
2385 	verbose(VERB_ALGO, "doq socket want write");
2386 	if(c->doq_socket->event_has_write)
2387 		return;
2388 	comm_point_listen_for_rw(c, 1, 1);
2389 	c->doq_socket->event_has_write = 1;
2390 }
2391 
2392 /** disable write event for the doq server socket fd */
2393 static void
2394 doq_socket_write_disable(struct comm_point* c)
2395 {
2396 	verbose(VERB_ALGO, "doq socket want no write");
2397 	if(!c->doq_socket->event_has_write)
2398 		return;
2399 	comm_point_listen_for_rw(c, 1, 0);
2400 	c->doq_socket->event_has_write = 0;
2401 }
2402 
2403 /** write blocked packet, if possible. returns false if failed, again. */
2404 static int
2405 doq_write_blocked_pkt(struct comm_point* c)
2406 {
2407 	struct doq_pkt_addr paddr;
2408 	if(!c->doq_socket->have_blocked_pkt)
2409 		return 1;
2410 	c->doq_socket->have_blocked_pkt = 0;
2411 	if(sldns_buffer_limit(c->doq_socket->blocked_pkt) >
2412 		sldns_buffer_remaining(c->doq_socket->pkt_buf))
2413 		return 1; /* impossibly large, drop it.
2414 		impossible since pkt_buf is same size as blocked_pkt buf. */
2415 	sldns_buffer_clear(c->doq_socket->pkt_buf);
2416 	sldns_buffer_write(c->doq_socket->pkt_buf,
2417 		sldns_buffer_begin(c->doq_socket->blocked_pkt),
2418 		sldns_buffer_limit(c->doq_socket->blocked_pkt));
2419 	sldns_buffer_flip(c->doq_socket->pkt_buf);
2420 	memcpy(&paddr, c->doq_socket->blocked_paddr, sizeof(paddr));
2421 	doq_send_pkt(c, &paddr, c->doq_socket->blocked_pkt_pi.ecn);
2422 	if(c->doq_socket->have_blocked_pkt)
2423 		return 0;
2424 	return 1;
2425 }
2426 
2427 /** doq find a timer that timeouted and return the conn, locked. */
2428 static struct doq_conn*
2429 doq_timer_timeout_conn(struct doq_server_socket* doq_socket)
2430 {
2431 	struct doq_conn* conn = NULL;
2432 	struct rbnode_type* node;
2433 	lock_rw_wrlock(&doq_socket->table->lock);
2434 	node = rbtree_first(doq_socket->table->timer_tree);
2435 	if(node && node != RBTREE_NULL) {
2436 		struct doq_timer* t = (struct doq_timer*)node;
2437 		conn = t->conn;
2438 
2439 		/* If now < timer then no further timeouts in tree. */
2440 		if(timeval_smaller(doq_socket->now_tv, &t->time)) {
2441 			lock_rw_unlock(&doq_socket->table->lock);
2442 			return NULL;
2443 		}
2444 
2445 		lock_basic_lock(&conn->lock);
2446 		conn->doq_socket = doq_socket;
2447 
2448 		/* Now that the timer is fired, remove it. */
2449 		doq_timer_unset(doq_socket->table, t);
2450 		lock_rw_unlock(&doq_socket->table->lock);
2451 		return conn;
2452 	}
2453 	lock_rw_unlock(&doq_socket->table->lock);
2454 	return NULL;
2455 }
2456 
2457 /** doq timer erase the marker that said which timer the worker uses. */
2458 static void
2459 doq_timer_erase_marker(struct doq_server_socket* doq_socket)
2460 {
2461 	struct doq_timer* t;
2462 	lock_rw_wrlock(&doq_socket->table->lock);
2463 	t = doq_timer_find_time(doq_socket->table, &doq_socket->marked_time);
2464 	if(t && t->worker_doq_socket == doq_socket)
2465 		t->worker_doq_socket = NULL;
2466 	lock_rw_unlock(&doq_socket->table->lock);
2467 	memset(&doq_socket->marked_time, 0, sizeof(doq_socket->marked_time));
2468 }
2469 
2470 void
2471 doq_timer_cb(void* arg)
2472 {
2473 	struct doq_server_socket* doq_socket = (struct doq_server_socket*)arg;
2474 	struct doq_conn* conn;
2475 	verbose(VERB_ALGO, "doq timer callback");
2476 
2477 	doq_timer_erase_marker(doq_socket);
2478 
2479 	while((conn = doq_timer_timeout_conn(doq_socket)) != NULL) {
2480 		if(conn->is_deleted ||
2481 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
2482 			ngtcp2_conn_in_closing_period(conn->conn) ||
2483 #else
2484 			ngtcp2_conn_is_in_closing_period(conn->conn) ||
2485 #endif
2486 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
2487 			ngtcp2_conn_in_draining_period(conn->conn)
2488 #else
2489 			ngtcp2_conn_is_in_draining_period(conn->conn)
2490 #endif
2491 			) {
2492 			if(verbosity >= VERB_ALGO) {
2493 				char remotestr[256];
2494 				addr_to_str((void*)&conn->key.paddr.addr,
2495 					conn->key.paddr.addrlen, remotestr,
2496 					sizeof(remotestr));
2497 				verbose(VERB_ALGO, "doq conn %s is deleted "
2498 					"after timeout", remotestr);
2499 			}
2500 			doq_delete_connection(doq_socket->cp, conn);
2501 			continue;
2502 		}
2503 		if(!doq_conn_handle_timeout(conn))
2504 			doq_delete_connection(doq_socket->cp, conn);
2505 		else doq_done_setup_timer_and_write(doq_socket->cp, conn);
2506 	}
2507 
2508 	if(doq_socket_want_write(doq_socket->cp))
2509 		doq_socket_write_enable(doq_socket->cp);
2510 	else doq_socket_write_disable(doq_socket->cp);
2511 	doq_pickup_timer(doq_socket->cp);
2512 }
2513 
2514 void
2515 comm_point_doq_callback(int fd, short event, void* arg)
2516 {
2517 	struct comm_point* c;
2518 	struct doq_pkt_addr paddr;
2519 	int i, pkt_continue, err_drop;
2520 	struct doq_conn* conn;
2521 	struct ngtcp2_pkt_info pi;
2522 	size_t count, num_len;
2523 
2524 	c = (struct comm_point*)arg;
2525 	log_assert(c->type == comm_doq);
2526 
2527 	log_assert(c && c->doq_socket->pkt_buf && c->fd == fd);
2528 	ub_comm_base_now(c->ev->base);
2529 
2530 	/* see if there is a blocked packet, and send that if possible.
2531 	 * do not attempt to read yet, even if possible, that would just
2532 	 * push more answers in reply to those read packets onto the list
2533 	 * of written replies. First attempt to clear the write content out.
2534 	 * That keeps the memory usage from bloating up. */
2535 	if(c->doq_socket->have_blocked_pkt) {
2536 		if(!doq_write_blocked_pkt(c)) {
2537 			/* this write has also blocked, attempt to write
2538 			 * later. Make sure the event listens to write
2539 			 * events. */
2540 			if(!c->doq_socket->event_has_write)
2541 				doq_socket_write_enable(c);
2542 			doq_pickup_timer(c);
2543 			return;
2544 		}
2545 	}
2546 
2547 	/* see if there is write interest */
2548 	count = 0;
2549 	num_len = doq_write_list_length(c);
2550 	while((conn = doq_pop_write_conn(c)) != NULL) {
2551 		if(conn->is_deleted ||
2552 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
2553 			ngtcp2_conn_in_closing_period(conn->conn) ||
2554 #else
2555 			ngtcp2_conn_is_in_closing_period(conn->conn) ||
2556 #endif
2557 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
2558 			ngtcp2_conn_in_draining_period(conn->conn)
2559 #else
2560 			ngtcp2_conn_is_in_draining_period(conn->conn)
2561 #endif
2562 			) {
2563 			conn->doq_socket = NULL;
2564 			lock_basic_unlock(&conn->lock);
2565 			if(c->doq_socket->have_blocked_pkt) {
2566 				if(!c->doq_socket->event_has_write)
2567 					doq_socket_write_enable(c);
2568 				doq_pickup_timer(c);
2569 				return;
2570 			}
2571 			if(++count > num_len*2)
2572 				break;
2573 			continue;
2574 		}
2575 		if(verbosity >= VERB_ALGO) {
2576 			char remotestr[256];
2577 			addr_to_str((void*)&conn->key.paddr.addr,
2578 				conn->key.paddr.addrlen, remotestr,
2579 				sizeof(remotestr));
2580 			verbose(VERB_ALGO, "doq write connection %s %d",
2581 				remotestr, doq_sockaddr_get_port(
2582 				&conn->key.paddr.addr));
2583 		}
2584 		if(doq_conn_write_streams(c, conn, &err_drop))
2585 			err_drop = 0;
2586 		doq_done_with_write_cb(c, conn, err_drop);
2587 		if(c->doq_socket->have_blocked_pkt) {
2588 			if(!c->doq_socket->event_has_write)
2589 				doq_socket_write_enable(c);
2590 			doq_pickup_timer(c);
2591 			return;
2592 		}
2593 		/* Stop overly long write lists that are created
2594 		 * while we are processing. Do those next time there
2595 		 * is a write callback. Stops long loops, and keeps
2596 		 * fair for other events. */
2597 		if(++count > num_len*2)
2598 			break;
2599 	}
2600 
2601 	/* check for data to read */
2602 	if((event&UB_EV_READ)!=0)
2603 	  for(i=0; i<NUM_UDP_PER_SELECT; i++) {
2604 		/* there may be a blocked write packet and if so, stop
2605 		 * reading because the reply cannot get written. The
2606 		 * blocked packet could be written during the conn_recv
2607 		 * handling of replies, or for a connection close. */
2608 		if(c->doq_socket->have_blocked_pkt) {
2609 			if(!c->doq_socket->event_has_write)
2610 				doq_socket_write_enable(c);
2611 			doq_pickup_timer(c);
2612 			return;
2613 		}
2614 		sldns_buffer_clear(c->doq_socket->pkt_buf);
2615 		doq_pkt_addr_init(&paddr);
2616 		log_assert(fd != -1);
2617 		log_assert(sldns_buffer_remaining(c->doq_socket->pkt_buf) > 0);
2618 		if(!doq_recv(c, &paddr, &pkt_continue, &pi)) {
2619 			if(pkt_continue)
2620 				continue;
2621 			break;
2622 		}
2623 
2624 		/* handle incoming packet from remote addr to localaddr */
2625 		if(verbosity >= VERB_ALGO) {
2626 			char remotestr[256], localstr[256];
2627 			addr_to_str((void*)&paddr.addr, paddr.addrlen,
2628 				remotestr, sizeof(remotestr));
2629 			addr_to_str((void*)&paddr.localaddr,
2630 				paddr.localaddrlen, localstr,
2631 				sizeof(localstr));
2632 			log_info("incoming doq packet from %s port %d on "
2633 				"%s port %d ifindex %d",
2634 				remotestr, doq_sockaddr_get_port(&paddr.addr),
2635 				localstr,
2636 				doq_sockaddr_get_port(&paddr.localaddr),
2637 				paddr.ifindex);
2638 			log_info("doq_recv length %d ecn 0x%x",
2639 				(int)sldns_buffer_limit(c->doq_socket->pkt_buf),
2640 				(int)pi.ecn);
2641 		}
2642 
2643 		if(sldns_buffer_limit(c->doq_socket->pkt_buf) == 0)
2644 			continue;
2645 
2646 		conn = NULL;
2647 		if(!doq_decode_pkt_header_negotiate(c, &paddr, &conn))
2648 			continue;
2649 		if(!conn) {
2650 			if(!doq_accept(c, &paddr, &conn, &pi))
2651 				continue;
2652 			if(!doq_conn_write_streams(c, conn, NULL)) {
2653 				doq_delete_connection(c, conn);
2654 				continue;
2655 			}
2656 			doq_done_setup_timer_and_write(c, conn);
2657 			continue;
2658 		}
2659 		if(
2660 #ifdef HAVE_NGTCP2_CONN_IN_CLOSING_PERIOD
2661 			ngtcp2_conn_in_closing_period(conn->conn)
2662 #else
2663 			ngtcp2_conn_is_in_closing_period(conn->conn)
2664 #endif
2665 			) {
2666 			if(!doq_conn_send_close(c, conn)) {
2667 				doq_delete_connection(c, conn);
2668 			} else {
2669 				doq_done_setup_timer_and_write(c, conn);
2670 			}
2671 			continue;
2672 		}
2673 		if(
2674 #ifdef HAVE_NGTCP2_CONN_IN_DRAINING_PERIOD
2675 			ngtcp2_conn_in_draining_period(conn->conn)
2676 #else
2677 			ngtcp2_conn_is_in_draining_period(conn->conn)
2678 #endif
2679 			) {
2680 			doq_done_setup_timer_and_write(c, conn);
2681 			continue;
2682 		}
2683 		if(!doq_conn_recv(c, &paddr, conn, &pi, NULL, &err_drop)) {
2684 			/* The receive failed, and if it also failed to send
2685 			 * a close, drop the connection. That means it is not
2686 			 * in the closing period. */
2687 			if(err_drop) {
2688 				doq_delete_connection(c, conn);
2689 			} else {
2690 				doq_done_setup_timer_and_write(c, conn);
2691 			}
2692 			continue;
2693 		}
2694 		if(!doq_conn_write_streams(c, conn, &err_drop)) {
2695 			if(err_drop) {
2696 				doq_delete_connection(c, conn);
2697 			} else {
2698 				doq_done_setup_timer_and_write(c, conn);
2699 			}
2700 			continue;
2701 		}
2702 		doq_done_setup_timer_and_write(c, conn);
2703 	}
2704 
2705 	/* see if we want to have more write events */
2706 	verbose(VERB_ALGO, "doq check write enable");
2707 	if(doq_socket_want_write(c))
2708 		doq_socket_write_enable(c);
2709 	else doq_socket_write_disable(c);
2710 	doq_pickup_timer(c);
2711 }
2712 
2713 /** create new doq server socket structure */
2714 static struct doq_server_socket*
2715 doq_server_socket_create(struct doq_table* table, struct ub_randstate* rnd,
2716 	const void* quic_sslctx, struct comm_point* c, struct comm_base* base,
2717 	struct config_file* cfg)
2718 {
2719 	size_t doq_buffer_size = 4096; /* bytes buffer size, for one packet. */
2720 	struct doq_server_socket* doq_socket;
2721 	doq_socket = calloc(1, sizeof(*doq_socket));
2722 	if(!doq_socket) {
2723 		return NULL;
2724 	}
2725 	doq_socket->table = table;
2726 	doq_socket->rnd = rnd;
2727 	doq_socket->validate_addr = 1;
2728 	/* the doq_socket has its own copy of the static secret, as
2729 	 * well as other config values, so that they do not need table.lock */
2730 	doq_socket->static_secret_len = table->static_secret_len;
2731 	doq_socket->static_secret = memdup(table->static_secret,
2732 		table->static_secret_len);
2733 	if(!doq_socket->static_secret) {
2734 		free(doq_socket);
2735 		return NULL;
2736 	}
2737 	doq_socket->ctx = (SSL_CTX*)quic_sslctx;
2738 	doq_socket->idle_timeout = table->idle_timeout;
2739 	doq_socket->sv_scidlen = table->sv_scidlen;
2740 	doq_socket->cp = c;
2741 	doq_socket->pkt_buf = sldns_buffer_new(doq_buffer_size);
2742 	if(!doq_socket->pkt_buf) {
2743 		free(doq_socket->static_secret);
2744 		free(doq_socket);
2745 		return NULL;
2746 	}
2747 	doq_socket->blocked_pkt = sldns_buffer_new(
2748 		sldns_buffer_capacity(doq_socket->pkt_buf));
2749 	if(!doq_socket->pkt_buf) {
2750 		free(doq_socket->static_secret);
2751 		sldns_buffer_free(doq_socket->pkt_buf);
2752 		free(doq_socket);
2753 		return NULL;
2754 	}
2755 	doq_socket->blocked_paddr = calloc(1,
2756 		sizeof(*doq_socket->blocked_paddr));
2757 	if(!doq_socket->blocked_paddr) {
2758 		free(doq_socket->static_secret);
2759 		sldns_buffer_free(doq_socket->pkt_buf);
2760 		sldns_buffer_free(doq_socket->blocked_pkt);
2761 		free(doq_socket);
2762 		return NULL;
2763 	}
2764 	doq_socket->timer = comm_timer_create(base, doq_timer_cb, doq_socket);
2765 	if(!doq_socket->timer) {
2766 		free(doq_socket->static_secret);
2767 		sldns_buffer_free(doq_socket->pkt_buf);
2768 		sldns_buffer_free(doq_socket->blocked_pkt);
2769 		free(doq_socket->blocked_paddr);
2770 		free(doq_socket);
2771 		return NULL;
2772 	}
2773 	memset(&doq_socket->marked_time, 0, sizeof(doq_socket->marked_time));
2774 	comm_base_timept(base, &doq_socket->now_tt, &doq_socket->now_tv);
2775 	doq_socket->cfg = cfg;
2776 	return doq_socket;
2777 }
2778 
2779 /** delete doq server socket structure */
2780 static void
2781 doq_server_socket_delete(struct doq_server_socket* doq_socket)
2782 {
2783 	if(!doq_socket)
2784 		return;
2785 	free(doq_socket->static_secret);
2786 #ifndef HAVE_NGTCP2_CRYPTO_QUICTLS_CONFIGURE_SERVER_CONTEXT
2787 	free(doq_socket->quic_method);
2788 #endif
2789 	sldns_buffer_free(doq_socket->pkt_buf);
2790 	sldns_buffer_free(doq_socket->blocked_pkt);
2791 	free(doq_socket->blocked_paddr);
2792 	comm_timer_delete(doq_socket->timer);
2793 	free(doq_socket);
2794 }
2795 
2796 /** find repinfo in the doq table */
2797 static struct doq_conn*
2798 doq_lookup_repinfo(struct doq_table* table, struct comm_reply* repinfo)
2799 {
2800 	struct doq_conn* conn;
2801 	struct doq_conn_key key;
2802 	doq_conn_key_from_repinfo(&key, repinfo);
2803 	lock_rw_rdlock(&table->lock);
2804 	conn = doq_conn_find(table, &key.paddr.addr,
2805 		key.paddr.addrlen, &key.paddr.localaddr,
2806 		key.paddr.localaddrlen, key.paddr.ifindex, key.dcid,
2807 		key.dcidlen);
2808 	if(conn) {
2809 		lock_basic_lock(&conn->lock);
2810 		lock_rw_unlock(&table->lock);
2811 		return conn;
2812 	}
2813 	lock_rw_unlock(&table->lock);
2814 	return NULL;
2815 }
2816 
2817 /** doq find connection and stream. From inside callbacks from worker. */
2818 static int
2819 doq_lookup_conn_stream(struct comm_reply* repinfo, struct comm_point* c,
2820 	struct doq_conn** conn, struct doq_stream** stream)
2821 {
2822 	log_assert(c->doq_socket);
2823 	if(c->doq_socket->current_conn) {
2824 		*conn = c->doq_socket->current_conn;
2825 	} else {
2826 		*conn = doq_lookup_repinfo(c->doq_socket->table, repinfo);
2827 		if((*conn) && (*conn)->is_deleted) {
2828 			lock_basic_unlock(&(*conn)->lock);
2829 			*conn = NULL;
2830 		}
2831 		if(*conn) {
2832 			(*conn)->doq_socket = c->doq_socket;
2833 		}
2834 	}
2835 	if(!*conn) {
2836 		*stream = NULL;
2837 		return 0;
2838 	}
2839 	*stream = doq_stream_find(*conn, repinfo->doq_streamid);
2840 	if(!*stream) {
2841 		if(!c->doq_socket->current_conn) {
2842 			/* Not inside callbacks, we have our own lock on conn.
2843 			 * Release it. */
2844 			lock_basic_unlock(&(*conn)->lock);
2845 		}
2846 		return 0;
2847 	}
2848 	if((*stream)->is_closed) {
2849 		/* stream is closed, ignore reply or drop */
2850 		if(!c->doq_socket->current_conn) {
2851 			/* Not inside callbacks, we have our own lock on conn.
2852 			 * Release it. */
2853 			lock_basic_unlock(&(*conn)->lock);
2854 		}
2855 		return 0;
2856 	}
2857 	return 1;
2858 }
2859 
2860 /** doq send a reply from a comm reply */
2861 static void
2862 doq_socket_send_reply(struct comm_reply* repinfo)
2863 {
2864 	struct doq_conn* conn;
2865 	struct doq_stream* stream;
2866 	log_assert(repinfo->c->type == comm_doq);
2867 	if(!doq_lookup_conn_stream(repinfo, repinfo->c, &conn, &stream)) {
2868 		verbose(VERB_ALGO, "doq: send_reply but %s is gone",
2869 			(conn?"stream":"connection"));
2870 		/* No stream, it may have been closed. */
2871 		/* Drop the reply, it cannot be sent. */
2872 		return;
2873 	}
2874 	if(!doq_stream_send_reply(conn, stream, repinfo->c->buffer))
2875 		doq_stream_close(conn, stream, 1);
2876 	if(!repinfo->c->doq_socket->current_conn) {
2877 		/* Not inside callbacks, we have our own lock on conn.
2878 		 * Release it. */
2879 		doq_done_with_conn_cb(repinfo->c, conn);
2880 		/* since we sent a reply, or closed it, the assumption is
2881 		 * that there is something to write, so enable write event.
2882 		 * It waits until the write event happens to write the
2883 		 * streams with answers, this allows some answers to be
2884 		 * answered before the event loop reaches the doq fd, in
2885 		 * repinfo->c->fd, and that collates answers. That would
2886 		 * not happen if we write doq packets right now. */
2887 		doq_socket_write_enable(repinfo->c);
2888 	}
2889 }
2890 
2891 /** doq drop a reply from a comm reply */
2892 static void
2893 doq_socket_drop_reply(struct comm_reply* repinfo)
2894 {
2895 	struct doq_conn* conn;
2896 	struct doq_stream* stream;
2897 	log_assert(repinfo->c->type == comm_doq);
2898 	if(!doq_lookup_conn_stream(repinfo, repinfo->c, &conn, &stream)) {
2899 		verbose(VERB_ALGO, "doq: drop_reply but %s is gone",
2900 			(conn?"stream":"connection"));
2901 		/* The connection or stream is already gone. */
2902 		return;
2903 	}
2904 	doq_stream_close(conn, stream, 1);
2905 	if(!repinfo->c->doq_socket->current_conn) {
2906 		/* Not inside callbacks, we have our own lock on conn.
2907 		 * Release it. */
2908 		doq_done_with_conn_cb(repinfo->c, conn);
2909 		doq_socket_write_enable(repinfo->c);
2910 	}
2911 }
2912 #endif /* HAVE_NGTCP2 */
2913 
2914 int adjusted_tcp_timeout(struct comm_point* c)
2915 {
2916 	if(c->tcp_timeout_msec < TCP_QUERY_TIMEOUT_MINIMUM)
2917 		return TCP_QUERY_TIMEOUT_MINIMUM;
2918 	return c->tcp_timeout_msec;
2919 }
2920 
2921 /** Use a new tcp handler for new query fd, set to read query */
2922 static void
2923 setup_tcp_handler(struct comm_point* c, int fd, int cur, int max)
2924 {
2925 	int handler_usage;
2926 	log_assert(c->type == comm_tcp || c->type == comm_http);
2927 	log_assert(c->fd == -1);
2928 	sldns_buffer_clear(c->buffer);
2929 #ifdef USE_DNSCRYPT
2930 	if (c->dnscrypt)
2931 		sldns_buffer_clear(c->dnscrypt_buffer);
2932 #endif
2933 	c->tcp_is_reading = 1;
2934 	c->tcp_byte_count = 0;
2935 	c->tcp_keepalive = 0;
2936 	/* if more than half the tcp handlers are in use, use a shorter
2937 	 * timeout for this TCP connection, we need to make space for
2938 	 * other connections to be able to get attention */
2939 	/* If > 50% TCP handler structures in use, set timeout to 1/100th
2940 	 * 	configured value.
2941 	 * If > 65%TCP handler structures in use, set to 1/500th configured
2942 	 * 	value.
2943 	 * If > 80% TCP handler structures in use, set to 0.
2944 	 *
2945 	 * If the timeout to use falls below 200 milliseconds, an actual
2946 	 * timeout of 200ms is used.
2947 	 */
2948 	handler_usage = (cur * 100) / max;
2949 	if(handler_usage > 50 && handler_usage <= 65)
2950 		c->tcp_timeout_msec /= 100;
2951 	else if (handler_usage > 65 && handler_usage <= 80)
2952 		c->tcp_timeout_msec /= 500;
2953 	else if (handler_usage > 80)
2954 		c->tcp_timeout_msec = 0;
2955 	comm_point_start_listening(c, fd, adjusted_tcp_timeout(c));
2956 }
2957 
2958 void comm_base_handle_slow_accept(int ATTR_UNUSED(fd),
2959 	short ATTR_UNUSED(event), void* arg)
2960 {
2961 	struct comm_base* b = (struct comm_base*)arg;
2962 	/* timeout for the slow accept, re-enable accepts again */
2963 	if(b->start_accept) {
2964 		verbose(VERB_ALGO, "wait is over, slow accept disabled");
2965 		fptr_ok(fptr_whitelist_start_accept(b->start_accept));
2966 		(*b->start_accept)(b->cb_arg);
2967 		b->eb->slow_accept_enabled = 0;
2968 	}
2969 }
2970 
2971 int comm_point_perform_accept(struct comm_point* c,
2972 	struct sockaddr_storage* addr, socklen_t* addrlen)
2973 {
2974 	int new_fd;
2975 	*addrlen = (socklen_t)sizeof(*addr);
2976 #ifndef HAVE_ACCEPT4
2977 	new_fd = accept(c->fd, (struct sockaddr*)addr, addrlen);
2978 #else
2979 	/* SOCK_NONBLOCK saves extra calls to fcntl for the same result */
2980 	new_fd = accept4(c->fd, (struct sockaddr*)addr, addrlen, SOCK_NONBLOCK);
2981 #endif
2982 	if(new_fd == -1) {
2983 #ifndef USE_WINSOCK
2984 		/* EINTR is signal interrupt. others are closed connection. */
2985 		if(	errno == EINTR || errno == EAGAIN
2986 #ifdef EWOULDBLOCK
2987 			|| errno == EWOULDBLOCK
2988 #endif
2989 #ifdef ECONNABORTED
2990 			|| errno == ECONNABORTED
2991 #endif
2992 #ifdef EPROTO
2993 			|| errno == EPROTO
2994 #endif /* EPROTO */
2995 			)
2996 			return -1;
2997 #if defined(ENFILE) && defined(EMFILE)
2998 		if(errno == ENFILE || errno == EMFILE) {
2999 			/* out of file descriptors, likely outside of our
3000 			 * control. stop accept() calls for some time */
3001 			if(c->ev->base->stop_accept) {
3002 				struct comm_base* b = c->ev->base;
3003 				struct timeval tv;
3004 				verbose(VERB_ALGO, "out of file descriptors: "
3005 					"slow accept");
3006 				ub_comm_base_now(b);
3007 				if(b->eb->last_slow_log+SLOW_LOG_TIME <=
3008 					b->eb->secs) {
3009 					b->eb->last_slow_log = b->eb->secs;
3010 					verbose(VERB_OPS, "accept failed, "
3011 						"slow down accept for %d "
3012 						"msec: %s",
3013 						NETEVENT_SLOW_ACCEPT_TIME,
3014 						sock_strerror(errno));
3015 				}
3016 				b->eb->slow_accept_enabled = 1;
3017 				fptr_ok(fptr_whitelist_stop_accept(
3018 					b->stop_accept));
3019 				(*b->stop_accept)(b->cb_arg);
3020 				/* set timeout, no mallocs */
3021 				tv.tv_sec = NETEVENT_SLOW_ACCEPT_TIME/1000;
3022 				tv.tv_usec = (NETEVENT_SLOW_ACCEPT_TIME%1000)*1000;
3023 				b->eb->slow_accept = ub_event_new(b->eb->base,
3024 					-1, UB_EV_TIMEOUT,
3025 					comm_base_handle_slow_accept, b);
3026 				if(b->eb->slow_accept == NULL) {
3027 					/* we do not want to log here, because
3028 					 * that would spam the logfiles.
3029 					 * error: "event_base_set failed." */
3030 				}
3031 				else if(ub_event_add(b->eb->slow_accept, &tv)
3032 					!= 0) {
3033 					/* we do not want to log here,
3034 					 * error: "event_add failed." */
3035 				}
3036 			} else {
3037 				log_err("accept, with no slow down, "
3038 					"failed: %s", sock_strerror(errno));
3039 			}
3040 			return -1;
3041 		}
3042 #endif
3043 #else /* USE_WINSOCK */
3044 		if(WSAGetLastError() == WSAEINPROGRESS ||
3045 			WSAGetLastError() == WSAECONNRESET)
3046 			return -1;
3047 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
3048 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
3049 			return -1;
3050 		}
3051 #endif
3052 		log_err_addr("accept failed", sock_strerror(errno), addr,
3053 			*addrlen);
3054 		return -1;
3055 	}
3056 	if(c->tcp_conn_limit && c->type == comm_tcp_accept) {
3057 		c->tcl_addr = tcl_addr_lookup(c->tcp_conn_limit, addr, *addrlen);
3058 		if(!tcl_new_connection(c->tcl_addr)) {
3059 			if(verbosity >= 3)
3060 				log_err_addr("accept rejected",
3061 				"connection limit exceeded", addr, *addrlen);
3062 			sock_close(new_fd);
3063 			return -1;
3064 		}
3065 	}
3066 #ifndef HAVE_ACCEPT4
3067 	fd_set_nonblock(new_fd);
3068 #endif
3069 	return new_fd;
3070 }
3071 
3072 #ifdef USE_WINSOCK
3073 static long win_bio_cb(BIO *b, int oper, const char* ATTR_UNUSED(argp),
3074 #ifdef HAVE_BIO_SET_CALLBACK_EX
3075 	size_t ATTR_UNUSED(len),
3076 #endif
3077         int ATTR_UNUSED(argi), long argl,
3078 #ifndef HAVE_BIO_SET_CALLBACK_EX
3079 	long retvalue
3080 #else
3081 	int retvalue, size_t* ATTR_UNUSED(processed)
3082 #endif
3083 	)
3084 {
3085 	int wsa_err = WSAGetLastError(); /* store errcode before it is gone */
3086 	verbose(VERB_ALGO, "bio_cb %d, %s %s %s", oper,
3087 		(oper&BIO_CB_RETURN)?"return":"before",
3088 		(oper&BIO_CB_READ)?"read":((oper&BIO_CB_WRITE)?"write":"other"),
3089 		wsa_err==WSAEWOULDBLOCK?"wsawb":"");
3090 	/* on windows, check if previous operation caused EWOULDBLOCK */
3091 	if( (oper == (BIO_CB_READ|BIO_CB_RETURN) && argl == 0) ||
3092 		(oper == (BIO_CB_GETS|BIO_CB_RETURN) && argl == 0)) {
3093 		if(wsa_err == WSAEWOULDBLOCK)
3094 			ub_winsock_tcp_wouldblock((struct ub_event*)
3095 				BIO_get_callback_arg(b), UB_EV_READ);
3096 	}
3097 	if( (oper == (BIO_CB_WRITE|BIO_CB_RETURN) && argl == 0) ||
3098 		(oper == (BIO_CB_PUTS|BIO_CB_RETURN) && argl == 0)) {
3099 		if(wsa_err == WSAEWOULDBLOCK)
3100 			ub_winsock_tcp_wouldblock((struct ub_event*)
3101 				BIO_get_callback_arg(b), UB_EV_WRITE);
3102 	}
3103 	/* return original return value */
3104 	return retvalue;
3105 }
3106 
3107 /** set win bio callbacks for nonblocking operations */
3108 void
3109 comm_point_tcp_win_bio_cb(struct comm_point* c, void* thessl)
3110 {
3111 	SSL* ssl = (SSL*)thessl;
3112 	/* set them both just in case, but usually they are the same BIO */
3113 #ifdef HAVE_BIO_SET_CALLBACK_EX
3114 	BIO_set_callback_ex(SSL_get_rbio(ssl), &win_bio_cb);
3115 #else
3116 	BIO_set_callback(SSL_get_rbio(ssl), &win_bio_cb);
3117 #endif
3118 	BIO_set_callback_arg(SSL_get_rbio(ssl), (char*)c->ev->ev);
3119 #ifdef HAVE_BIO_SET_CALLBACK_EX
3120 	BIO_set_callback_ex(SSL_get_wbio(ssl), &win_bio_cb);
3121 #else
3122 	BIO_set_callback(SSL_get_wbio(ssl), &win_bio_cb);
3123 #endif
3124 	BIO_set_callback_arg(SSL_get_wbio(ssl), (char*)c->ev->ev);
3125 }
3126 #endif
3127 
3128 #ifdef HAVE_NGHTTP2
3129 /** Create http2 session server.  Per connection, after TCP accepted.*/
3130 static int http2_session_server_create(struct http2_session* h2_session)
3131 {
3132 	log_assert(h2_session->callbacks);
3133 	h2_session->is_drop = 0;
3134 	if(nghttp2_session_server_new(&h2_session->session,
3135 			h2_session->callbacks,
3136 		h2_session) == NGHTTP2_ERR_NOMEM) {
3137 		log_err("failed to create nghttp2 session server");
3138 		return 0;
3139 	}
3140 
3141 	return 1;
3142 }
3143 
3144 /** Submit http2 setting to session. Once per session. */
3145 static int http2_submit_settings(struct http2_session* h2_session)
3146 {
3147 	int ret;
3148 	nghttp2_settings_entry settings[1] = {
3149 		{NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS,
3150 		 h2_session->c->http2_max_streams}};
3151 
3152 	ret = nghttp2_submit_settings(h2_session->session, NGHTTP2_FLAG_NONE,
3153 		settings, 1);
3154 	if(ret) {
3155 		verbose(VERB_QUERY, "http2: submit_settings failed, "
3156 			"error: %s", nghttp2_strerror(ret));
3157 		return 0;
3158 	}
3159 	return 1;
3160 }
3161 #endif /* HAVE_NGHTTP2 */
3162 
3163 #ifdef HAVE_NGHTTP2
3164 /** Delete http2 stream. After session delete or stream close callback */
3165 static void http2_stream_delete(struct http2_session* h2_session,
3166 	struct http2_stream* h2_stream)
3167 {
3168 	if(h2_stream->mesh_state) {
3169 		mesh_state_remove_reply(h2_stream->mesh, h2_stream->mesh_state,
3170 			h2_session->c);
3171 		h2_stream->mesh_state = NULL;
3172 	}
3173 	http2_req_stream_clear(h2_stream);
3174 	free(h2_stream);
3175 }
3176 #endif /* HAVE_NGHTTP2 */
3177 
/**
 * delete http2 session server. After closing connection.
 * The nghttp2 session and all the streams in the stream list are
 * deleted, and the drop flags and the stream pointer of the comm point
 * are reset. The h2_session structure itself is not freed.
 * Without nghttp2 support this function does nothing.
 * @param h2_session: the http2 session.
 */
static void http2_session_server_delete(struct http2_session* h2_session)
{
#ifdef HAVE_NGHTTP2
	struct http2_stream* cur, *following;
	nghttp2_session_del(h2_session->session); /* NULL input is fine */
	h2_session->session = NULL;

	cur = h2_session->first_stream;
	while(cur) {
		/* pick up the next pointer before the stream is freed */
		following = cur->next;
		http2_stream_delete(h2_session, cur);
		cur = following;
	}
	h2_session->first_stream = NULL;

	h2_session->is_drop = 0;
	h2_session->postpone_drop = 0;
	h2_session->c->h2_stream = NULL;
#endif
	(void)h2_session;
}
3197 
3198 void
3199 comm_point_tcp_accept_callback(int fd, short event, void* arg)
3200 {
3201 	struct comm_point* c = (struct comm_point*)arg, *c_hdl;
3202 	int new_fd;
3203 	log_assert(c->type == comm_tcp_accept);
3204 	if(!(event & UB_EV_READ)) {
3205 		log_info("ignoring tcp accept event %d", (int)event);
3206 		return;
3207 	}
3208 	ub_comm_base_now(c->ev->base);
3209 	/* find free tcp handler. */
3210 	if(!c->tcp_free) {
3211 		log_warn("accepted too many tcp, connections full");
3212 		return;
3213 	}
3214 	/* accept incoming connection. */
3215 	c_hdl = c->tcp_free;
3216 	/* clear leftover flags from previous use, and then set the
3217 	 * correct event base for the event structure for libevent */
3218 	ub_event_free(c_hdl->ev->ev);
3219 	c_hdl->ev->ev = NULL;
3220 	if((c_hdl->type == comm_tcp && c_hdl->tcp_req_info) ||
3221 		c_hdl->type == comm_local || c_hdl->type == comm_raw)
3222 		c_hdl->tcp_do_toggle_rw = 0;
3223 	else	c_hdl->tcp_do_toggle_rw = 1;
3224 
3225 	if(c_hdl->type == comm_http) {
3226 #ifdef HAVE_NGHTTP2
3227 		if(!c_hdl->h2_session ||
3228 			!http2_session_server_create(c_hdl->h2_session)) {
3229 			log_warn("failed to create nghttp2");
3230 			return;
3231 		}
3232 		if(!c_hdl->h2_session ||
3233 			!http2_submit_settings(c_hdl->h2_session)) {
3234 			log_warn("failed to submit http2 settings");
3235 			if(c_hdl->h2_session)
3236 				http2_session_server_delete(c_hdl->h2_session);
3237 			return;
3238 		}
3239 		if(!c->ssl) {
3240 			c_hdl->tcp_do_toggle_rw = 0;
3241 			c_hdl->use_h2 = 1;
3242 		}
3243 #endif
3244 		c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1,
3245 			UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT,
3246 			comm_point_http_handle_callback, c_hdl);
3247 	} else {
3248 		c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1,
3249 			UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT,
3250 			comm_point_tcp_handle_callback, c_hdl);
3251 	}
3252 	if(!c_hdl->ev->ev) {
3253 		log_warn("could not ub_event_new, dropped tcp");
3254 #ifdef HAVE_NGHTTP2
3255 		if(c_hdl->type == comm_http && c_hdl->h2_session)
3256 			http2_session_server_delete(c_hdl->h2_session);
3257 #endif
3258 		return;
3259 	}
3260 	log_assert(fd != -1);
3261 	(void)fd;
3262 	new_fd = comm_point_perform_accept(c, &c_hdl->repinfo.remote_addr,
3263 		&c_hdl->repinfo.remote_addrlen);
3264 	if(new_fd == -1) {
3265 #ifdef HAVE_NGHTTP2
3266 		if(c_hdl->type == comm_http && c_hdl->h2_session)
3267 			http2_session_server_delete(c_hdl->h2_session);
3268 #endif
3269 		return;
3270 	}
3271 	/* Copy remote_address to client_address.
3272 	 * Simplest way/time for streams to do that. */
3273 	c_hdl->repinfo.client_addrlen = c_hdl->repinfo.remote_addrlen;
3274 	memmove(&c_hdl->repinfo.client_addr,
3275 		&c_hdl->repinfo.remote_addr,
3276 		c_hdl->repinfo.remote_addrlen);
3277 	if(c->ssl) {
3278 		c_hdl->ssl = incoming_ssl_fd(c->ssl, new_fd);
3279 		if(!c_hdl->ssl) {
3280 			c_hdl->fd = new_fd;
3281 			comm_point_close(c_hdl);
3282 			return;
3283 		}
3284 		c_hdl->ssl_shake_state = comm_ssl_shake_read;
3285 #ifdef USE_WINSOCK
3286 		comm_point_tcp_win_bio_cb(c_hdl, c_hdl->ssl);
3287 #endif
3288 	}
3289 
3290 	/* grab the tcp handler buffers */
3291 	c->cur_tcp_count++;
3292 	c->tcp_free = c_hdl->tcp_free;
3293 	c_hdl->tcp_free = NULL;
3294 	if(!c->tcp_free) {
3295 		/* stop accepting incoming queries for now. */
3296 		comm_point_stop_listening(c);
3297 	}
3298 	setup_tcp_handler(c_hdl, new_fd, c->cur_tcp_count, c->max_tcp_count);
3299 }
3300 
3301 /** Make tcp handler free for next assignment */
3302 static void
3303 reclaim_tcp_handler(struct comm_point* c)
3304 {
3305 	log_assert(c->type == comm_tcp);
3306 	if(c->ssl) {
3307 #ifdef HAVE_SSL
3308 		SSL_shutdown(c->ssl);
3309 		SSL_free(c->ssl);
3310 		c->ssl = NULL;
3311 #endif
3312 	}
3313 	comm_point_close(c);
3314 	if(c->tcp_parent) {
3315 		if(c != c->tcp_parent->tcp_free) {
3316 			c->tcp_parent->cur_tcp_count--;
3317 			c->tcp_free = c->tcp_parent->tcp_free;
3318 			c->tcp_parent->tcp_free = c;
3319 		}
3320 		if(!c->tcp_free) {
3321 			/* re-enable listening on accept socket */
3322 			comm_point_start_listening(c->tcp_parent, -1, -1);
3323 		}
3324 	}
3325 	c->tcp_more_read_again = NULL;
3326 	c->tcp_more_write_again = NULL;
3327 	c->tcp_byte_count = 0;
3328 	c->pp2_header_state = pp2_header_none;
3329 	sldns_buffer_clear(c->buffer);
3330 }
3331 
3332 /** do the callback when writing is done */
3333 static void
3334 tcp_callback_writer(struct comm_point* c)
3335 {
3336 	log_assert(c->type == comm_tcp);
3337 	if(!c->tcp_write_and_read) {
3338 		sldns_buffer_clear(c->buffer);
3339 		c->tcp_byte_count = 0;
3340 	}
3341 	if(c->tcp_do_toggle_rw)
3342 		c->tcp_is_reading = 1;
3343 	/* switch from listening(write) to listening(read) */
3344 	if(c->tcp_req_info) {
3345 		tcp_req_info_handle_writedone(c->tcp_req_info);
3346 	} else {
3347 		comm_point_stop_listening(c);
3348 		if(c->tcp_write_and_read) {
3349 			fptr_ok(fptr_whitelist_comm_point(c->callback));
3350 			if( (*c->callback)(c, c->cb_arg, NETEVENT_PKT_WRITTEN,
3351 				&c->repinfo) ) {
3352 				comm_point_start_listening(c, -1,
3353 					adjusted_tcp_timeout(c));
3354 			}
3355 		} else {
3356 			comm_point_start_listening(c, -1,
3357 					adjusted_tcp_timeout(c));
3358 		}
3359 	}
3360 }
3361 
3362 /** do the callback when reading is done */
3363 static void
3364 tcp_callback_reader(struct comm_point* c)
3365 {
3366 	log_assert(c->type == comm_tcp || c->type == comm_local);
3367 	sldns_buffer_flip(c->buffer);
3368 	if(c->tcp_do_toggle_rw)
3369 		c->tcp_is_reading = 0;
3370 	c->tcp_byte_count = 0;
3371 	if(c->tcp_req_info) {
3372 		tcp_req_info_handle_readdone(c->tcp_req_info);
3373 	} else {
3374 		if(c->type == comm_tcp)
3375 			comm_point_stop_listening(c);
3376 		fptr_ok(fptr_whitelist_comm_point(c->callback));
3377 		if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
3378 			comm_point_start_listening(c, -1,
3379 					adjusted_tcp_timeout(c));
3380 		}
3381 	}
3382 }
3383 
3384 #ifdef HAVE_SSL
3385 /** true if the ssl handshake error has to be squelched from the logs */
3386 int
3387 squelch_err_ssl_handshake(unsigned long err)
3388 {
3389 	if(verbosity >= VERB_QUERY)
3390 		return 0; /* only squelch on low verbosity */
3391 	if(ERR_GET_LIB(err) == ERR_LIB_SSL &&
3392 		(ERR_GET_REASON(err) == SSL_R_HTTPS_PROXY_REQUEST ||
3393 		 ERR_GET_REASON(err) == SSL_R_HTTP_REQUEST ||
3394 		 ERR_GET_REASON(err) == SSL_R_WRONG_VERSION_NUMBER ||
3395 		 ERR_GET_REASON(err) == SSL_R_SSLV3_ALERT_BAD_CERTIFICATE
3396 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
3397 		 || ERR_GET_REASON(err) == SSL_R_NO_SHARED_CIPHER
3398 #endif
3399 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
3400 		 || ERR_GET_REASON(err) == SSL_R_UNKNOWN_PROTOCOL
3401 		 || ERR_GET_REASON(err) == SSL_R_UNSUPPORTED_PROTOCOL
3402 #  ifdef SSL_R_VERSION_TOO_LOW
3403 		 || ERR_GET_REASON(err) == SSL_R_VERSION_TOO_LOW
3404 #  endif
3405 #endif
3406 		))
3407 		return 1;
3408 	return 0;
3409 }
3410 #endif /* HAVE_SSL */
3411 
/**
 * continue ssl handshake.
 * Returns right away, after restoring the event direction, if the
 * comm point was waiting in one of the hs_read or hs_write states.
 * Otherwise SSL_do_handshake is called. If that wants more read or
 * write, the state and the events are set for it. When the handshake
 * is complete, the peer certificate is checked if the verify peer flag
 * is set, http2 use is checked for http comm points, and the events
 * are set for reading or writing.
 * @param c: comm point with the ssl object.
 * @return 0 on failure or if the connection is closed. 1 if it is ok
 *	to continue; the handshake is complete when ssl_shake_state is
 *	comm_ssl_shake_none afterwards.
 */
#ifdef HAVE_SSL
static int
ssl_handshake(struct comm_point* c)
{
	int r;

	switch(c->ssl_shake_state) {
	case comm_ssl_shake_hs_read:
		/* read condition satisfied back to writing */
		comm_point_listen_for_rw(c, 0, 1);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	case comm_ssl_shake_hs_write:
		/* write condition satisfied, back to reading */
		comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	default:
		break;
	}

	ERR_clear_error();
	r = SSL_do_handshake(c->ssl);
	if(r != 1) {
		unsigned long err;
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_WANT_READ) {
			/* only change the events if not yet reading */
			if(c->ssl_shake_state != comm_ssl_shake_read) {
				c->ssl_shake_state = comm_ssl_shake_read;
				comm_point_listen_for_rw(c, 1, 0);
			}
			return 1;
		}
		if(want == SSL_ERROR_WANT_WRITE) {
			/* only change the events if not yet writing */
			if(c->ssl_shake_state != comm_ssl_shake_write) {
				c->ssl_shake_state = comm_ssl_shake_write;
				comm_point_listen_for_rw(c, 0, 1);
			}
			return 1;
		}
		if(r == 0)
			return 0; /* closed */
		if(want == SSL_ERROR_SYSCALL) {
			/* SYSCALL and errno==0 means closed uncleanly */
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
#ifdef ECONNRESET
			if(errno == ECONNRESET && verbosity < 2)
				return 0; /* silence reset by peer */
#endif
			/* silence connect failures that show up because
			 * after connect this is the first system call
			 * that accesses the socket */
			if(!tcp_connect_errno_needs_log(
				(struct sockaddr*)&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen))
				return 0;
			if(errno != 0)
				log_err("SSL_handshake syscall: %s",
					strerror(errno));
			return 0;
		}
		/* another error, log it unless it has to be squelched */
		err = ERR_get_error();
		if(!squelch_err_ssl_handshake(err)) {
			long vr;
			log_crypto_err_io_code("ssl handshake failed",
				want, err);
			vr = SSL_get_verify_result(c->ssl);
			if(vr != 0)
				log_err("ssl handshake cert error: %s",
					X509_verify_cert_error_string(vr));
			log_addr(VERB_OPS, "ssl handshake failed",
				&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
		}
		return 0;
	}

	/* this is where peer verification could take place */
	if((SSL_get_verify_mode(c->ssl)&SSL_VERIFY_PEER)) {
		/* verification */
		int verified = (SSL_get_verify_result(c->ssl) == X509_V_OK);
#ifdef HAVE_SSL_GET1_PEER_CERTIFICATE
		X509* cert = SSL_get1_peer_certificate(c->ssl);
#else
		X509* cert = SSL_get_peer_certificate(c->ssl);
#endif
		if(!verified) {
			if(cert) {
				log_cert(VERB_ALGO, "peer certificate", cert);
				X509_free(cert);
			}
			log_addr(VERB_ALGO, "SSL connection failed: "
				"failed to authenticate",
				&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
			return 0;
		}
		if(!cert) {
			log_addr(VERB_ALGO, "SSL connection failed: "
				"no certificate",
				&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
			return 0;
		}
		log_cert(VERB_ALGO, "peer certificate", cert);
#ifdef HAVE_SSL_GET0_PEERNAME
		if(SSL_get0_peername(c->ssl)) {
			char buf[255];
			snprintf(buf, sizeof(buf), "SSL connection "
				"to %s authenticated",
				SSL_get0_peername(c->ssl));
			log_addr(VERB_ALGO, buf, &c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
		} else
#endif
		{
			log_addr(VERB_ALGO, "SSL connection "
				"authenticated", &c->repinfo.remote_addr,
				c->repinfo.remote_addrlen);
		}
		X509_free(cert);
	} else {
		/* unauthenticated, the verify peer flag was not set
		 * in c->ssl when the ssl object was created from ssl_ctx */
		log_addr(VERB_ALGO, "SSL connection", &c->repinfo.remote_addr,
			c->repinfo.remote_addrlen);
	}

#ifdef HAVE_SSL_GET0_ALPN_SELECTED
	/* check if http2 use is negotiated */
	if(c->type == comm_http && c->h2_session) {
		const unsigned char *alpn;
		unsigned int alpnlen = 0;
		SSL_get0_alpn_selected(c->ssl, &alpn, &alpnlen);
		if(alpnlen != 2 || memcmp("h2", alpn, 2) != 0) {
			verbose(VERB_ALGO, "client doesn't support HTTP/2");
			return 0;
		}
		/* connection upgraded to HTTP2 */
		c->tcp_do_toggle_rw = 0;
		c->use_h2 = 1;
	}
#endif

	/* setup listen rw correctly */
	if(!c->tcp_is_reading) {
		comm_point_listen_for_rw(c, 0, 1);
	} else if(c->ssl_shake_state != comm_ssl_shake_read) {
		comm_point_listen_for_rw(c, 1, 0);
	}
	c->ssl_shake_state = comm_ssl_shake_none;
	return 1;
}
#endif /* HAVE_SSL */
3571 
3572 /** ssl read callback on TCP */
3573 static int
3574 ssl_handle_read(struct comm_point* c)
3575 {
3576 #ifdef HAVE_SSL
3577 	int r;
3578 	if(c->ssl_shake_state != comm_ssl_shake_none) {
3579 		if(!ssl_handshake(c))
3580 			return 0;
3581 		if(c->ssl_shake_state != comm_ssl_shake_none)
3582 			return 1;
3583 	}
3584 	if(c->pp2_enabled && c->pp2_header_state != pp2_header_done) {
3585 		struct pp2_header* header = NULL;
3586 		size_t want_read_size = 0;
3587 		size_t current_read_size = 0;
3588 		if(c->pp2_header_state == pp2_header_none) {
3589 			want_read_size = PP2_HEADER_SIZE;
3590 			if(sldns_buffer_remaining(c->buffer)<want_read_size) {
3591 				log_err_addr("proxy_protocol: not enough "
3592 					"buffer size to read PROXYv2 header", "",
3593 					&c->repinfo.remote_addr,
3594 					c->repinfo.remote_addrlen);
3595 				return 0;
3596 			}
3597 			verbose(VERB_ALGO, "proxy_protocol: reading fixed "
3598 				"part of PROXYv2 header (len %lu)",
3599 				(unsigned long)want_read_size);
3600 			current_read_size = want_read_size;
3601 			if(c->tcp_byte_count < current_read_size) {
3602 				ERR_clear_error();
3603 				if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(
3604 					c->buffer, c->tcp_byte_count),
3605 					current_read_size -
3606 					c->tcp_byte_count)) <= 0) {
3607 					int want = SSL_get_error(c->ssl, r);
3608 					if(want == SSL_ERROR_ZERO_RETURN) {
3609 						if(c->tcp_req_info)
3610 							return tcp_req_info_handle_read_close(c->tcp_req_info);
3611 						return 0; /* shutdown, closed */
3612 					} else if(want == SSL_ERROR_WANT_READ) {
3613 #ifdef USE_WINSOCK
3614 						ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
3615 #endif
3616 						return 1; /* read more later */
3617 					} else if(want == SSL_ERROR_WANT_WRITE) {
3618 						c->ssl_shake_state = comm_ssl_shake_hs_write;
3619 						comm_point_listen_for_rw(c, 0, 1);
3620 						return 1;
3621 					} else if(want == SSL_ERROR_SYSCALL) {
3622 #ifdef ECONNRESET
3623 						if(errno == ECONNRESET && verbosity < 2)
3624 							return 0; /* silence reset by peer */
3625 #endif
3626 						if(errno != 0)
3627 							log_err("SSL_read syscall: %s",
3628 								strerror(errno));
3629 						return 0;
3630 					}
3631 					log_crypto_err_io("could not SSL_read",
3632 						want);
3633 					return 0;
3634 				}
3635 				c->tcp_byte_count += r;
3636 				sldns_buffer_skip(c->buffer, r);
3637 				if(c->tcp_byte_count != current_read_size) return 1;
3638 				c->pp2_header_state = pp2_header_init;
3639 			}
3640 		}
3641 		if(c->pp2_header_state == pp2_header_init) {
3642 			int err;
3643 			err = pp2_read_header(
3644 				sldns_buffer_begin(c->buffer),
3645 				sldns_buffer_limit(c->buffer));
3646 			if(err) {
3647 				log_err("proxy_protocol: could not parse "
3648 					"PROXYv2 header (%s)",
3649 					pp_lookup_error(err));
3650 				return 0;
3651 			}
3652 			header = (struct pp2_header*)sldns_buffer_begin(c->buffer);
3653 			want_read_size = ntohs(header->len);
3654 			if(sldns_buffer_limit(c->buffer) <
3655 				PP2_HEADER_SIZE + want_read_size) {
3656 				log_err_addr("proxy_protocol: not enough "
3657 					"buffer size to read PROXYv2 header", "",
3658 					&c->repinfo.remote_addr,
3659 					c->repinfo.remote_addrlen);
3660 				return 0;
3661 			}
3662 			verbose(VERB_ALGO, "proxy_protocol: reading variable "
3663 				"part of PROXYv2 header (len %lu)",
3664 				(unsigned long)want_read_size);
3665 			current_read_size = PP2_HEADER_SIZE + want_read_size;
3666 			if(want_read_size == 0) {
3667 				/* nothing more to read; header is complete */
3668 				c->pp2_header_state = pp2_header_done;
3669 			} else if(c->tcp_byte_count < current_read_size) {
3670 				ERR_clear_error();
3671 				if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(
3672 					c->buffer, c->tcp_byte_count),
3673 					current_read_size -
3674 					c->tcp_byte_count)) <= 0) {
3675 					int want = SSL_get_error(c->ssl, r);
3676 					if(want == SSL_ERROR_ZERO_RETURN) {
3677 						if(c->tcp_req_info)
3678 							return tcp_req_info_handle_read_close(c->tcp_req_info);
3679 						return 0; /* shutdown, closed */
3680 					} else if(want == SSL_ERROR_WANT_READ) {
3681 #ifdef USE_WINSOCK
3682 						ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
3683 #endif
3684 						return 1; /* read more later */
3685 					} else if(want == SSL_ERROR_WANT_WRITE) {
3686 						c->ssl_shake_state = comm_ssl_shake_hs_write;
3687 						comm_point_listen_for_rw(c, 0, 1);
3688 						return 1;
3689 					} else if(want == SSL_ERROR_SYSCALL) {
3690 #ifdef ECONNRESET
3691 						if(errno == ECONNRESET && verbosity < 2)
3692 							return 0; /* silence reset by peer */
3693 #endif
3694 						if(errno != 0)
3695 							log_err("SSL_read syscall: %s",
3696 								strerror(errno));
3697 						return 0;
3698 					}
3699 					log_crypto_err_io("could not SSL_read",
3700 						want);
3701 					return 0;
3702 				}
3703 				c->tcp_byte_count += r;
3704 				sldns_buffer_skip(c->buffer, r);
3705 				if(c->tcp_byte_count != current_read_size) return 1;
3706 				c->pp2_header_state = pp2_header_done;
3707 			}
3708 		}
3709 		if(c->pp2_header_state != pp2_header_done || !header) {
3710 			log_err_addr("proxy_protocol: wrong state for the "
3711 				"PROXYv2 header", "", &c->repinfo.remote_addr,
3712 				c->repinfo.remote_addrlen);
3713 			return 0;
3714 		}
3715 		sldns_buffer_flip(c->buffer);
3716 		if(!consume_pp2_header(c->buffer, &c->repinfo, 1)) {
3717 			log_err_addr("proxy_protocol: could not consume "
3718 				"PROXYv2 header", "", &c->repinfo.remote_addr,
3719 				c->repinfo.remote_addrlen);
3720 			return 0;
3721 		}
3722 		verbose(VERB_ALGO, "proxy_protocol: successful read of "
3723 			"PROXYv2 header");
3724 		/* Clear and reset the buffer to read the following
3725 		 * DNS packet(s). */
3726 		sldns_buffer_clear(c->buffer);
3727 		c->tcp_byte_count = 0;
3728 		return 1;
3729 	}
3730 	if(c->tcp_byte_count < sizeof(uint16_t)) {
3731 		/* read length bytes */
3732 		ERR_clear_error();
3733 		if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(c->buffer,
3734 			c->tcp_byte_count), (int)(sizeof(uint16_t) -
3735 			c->tcp_byte_count))) <= 0) {
3736 			int want = SSL_get_error(c->ssl, r);
3737 			if(want == SSL_ERROR_ZERO_RETURN) {
3738 				if(c->tcp_req_info)
3739 					return tcp_req_info_handle_read_close(c->tcp_req_info);
3740 				return 0; /* shutdown, closed */
3741 			} else if(want == SSL_ERROR_WANT_READ) {
3742 #ifdef USE_WINSOCK
3743 				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
3744 #endif
3745 				return 1; /* read more later */
3746 			} else if(want == SSL_ERROR_WANT_WRITE) {
3747 				c->ssl_shake_state = comm_ssl_shake_hs_write;
3748 				comm_point_listen_for_rw(c, 0, 1);
3749 				return 1;
3750 			} else if(want == SSL_ERROR_SYSCALL) {
3751 #ifdef ECONNRESET
3752 				if(errno == ECONNRESET && verbosity < 2)
3753 					return 0; /* silence reset by peer */
3754 #endif
3755 				if(errno != 0)
3756 					log_err("SSL_read syscall: %s",
3757 						strerror(errno));
3758 				return 0;
3759 			}
3760 			log_crypto_err_io("could not SSL_read", want);
3761 			return 0;
3762 		}
3763 		c->tcp_byte_count += r;
3764 		if(c->tcp_byte_count < sizeof(uint16_t))
3765 			return 1;
3766 		if(sldns_buffer_read_u16_at(c->buffer, 0) >
3767 			sldns_buffer_capacity(c->buffer)) {
3768 			verbose(VERB_QUERY, "ssl: dropped larger than buffer");
3769 			return 0;
3770 		}
3771 		sldns_buffer_set_limit(c->buffer,
3772 			sldns_buffer_read_u16_at(c->buffer, 0));
3773 		if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
3774 			verbose(VERB_QUERY, "ssl: dropped bogus too short.");
3775 			return 0;
3776 		}
3777 		sldns_buffer_skip(c->buffer, (ssize_t)(c->tcp_byte_count-sizeof(uint16_t)));
3778 		verbose(VERB_ALGO, "Reading ssl tcp query of length %d",
3779 			(int)sldns_buffer_limit(c->buffer));
3780 	}
3781 	if(sldns_buffer_remaining(c->buffer) > 0) {
3782 		ERR_clear_error();
3783 		r = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
3784 			(int)sldns_buffer_remaining(c->buffer));
3785 		if(r <= 0) {
3786 			int want = SSL_get_error(c->ssl, r);
3787 			if(want == SSL_ERROR_ZERO_RETURN) {
3788 				if(c->tcp_req_info)
3789 					return tcp_req_info_handle_read_close(c->tcp_req_info);
3790 				return 0; /* shutdown, closed */
3791 			} else if(want == SSL_ERROR_WANT_READ) {
3792 #ifdef USE_WINSOCK
3793 				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
3794 #endif
3795 				return 1; /* read more later */
3796 			} else if(want == SSL_ERROR_WANT_WRITE) {
3797 				c->ssl_shake_state = comm_ssl_shake_hs_write;
3798 				comm_point_listen_for_rw(c, 0, 1);
3799 				return 1;
3800 			} else if(want == SSL_ERROR_SYSCALL) {
3801 #ifdef ECONNRESET
3802 				if(errno == ECONNRESET && verbosity < 2)
3803 					return 0; /* silence reset by peer */
3804 #endif
3805 				if(errno != 0)
3806 					log_err("SSL_read syscall: %s",
3807 						strerror(errno));
3808 				return 0;
3809 			}
3810 			log_crypto_err_io("could not SSL_read", want);
3811 			return 0;
3812 		}
3813 		sldns_buffer_skip(c->buffer, (ssize_t)r);
3814 	}
3815 	if(sldns_buffer_remaining(c->buffer) <= 0) {
3816 		tcp_callback_reader(c);
3817 	}
3818 	return 1;
3819 #else
3820 	(void)c;
3821 	return 0;
3822 #endif /* HAVE_SSL */
3823 }
3824 
3825 /**
 * SSL write callback on TCP.
 * Finishes a pending handshake if there is one, then writes a 2-byte length
 * prefix (network byte order) followed by the message over c->ssl.
 * The data source depends on c->tcp_write_and_read:
 *  - set:   the message is c->tcp_write_pkt (c->tcp_write_pkt_len bytes) and
 *           progress is tracked in c->tcp_write_byte_count;
 *  - unset: the message is taken from c->buffer and progress on the length
 *           prefix is tracked in c->tcp_byte_count.
 * Both counters include the 2 bytes of the length prefix.
 * tcp_callback_writer() is called once the whole message has been written.
 * @param c: comm point with an SSL connection to write on.
 * @return: 0 on error or when the connection was closed, 1 otherwise
 *	(written, partially written, or waiting for a later event).
 *	Always 0 when compiled without HAVE_SSL.
 */
3826 static int
3827 ssl_handle_write(struct comm_point* c)
3828 {
3829 #ifdef HAVE_SSL
3830 	int r;
	/* A handshake is still in progress: continue it first. Fail if it
	 * fails, and come back on a later event if it has not finished. */
3831 	if(c->ssl_shake_state != comm_ssl_shake_none) {
3832 		if(!ssl_handshake(c))
3833 			return 0;
3834 		if(c->ssl_shake_state != comm_ssl_shake_none)
3835 			return 1;
3836 	}
3837 	/* ignore return, if fails we may simply block */
3838 	(void)SSL_set_mode(c->ssl, (long)SSL_MODE_ENABLE_PARTIAL_WRITE);
	/* Fewer than 2 bytes written so far: (part of) the length prefix is
	 * still pending. */
3839 	if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) {
3840 		uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(c->buffer));
3841 		ERR_clear_error();
3842 		if(c->tcp_write_and_read) {
3843 			if(c->tcp_write_pkt_len + 2 < LDNS_RR_BUF_SIZE) {
3844 				/* combine the tcp length and the query for
3845 				 * write, this emulates writev */
3846 				uint8_t buf[LDNS_RR_BUF_SIZE];
3847 				memmove(buf, &len, sizeof(uint16_t));
3848 				memmove(buf+sizeof(uint16_t),
3849 					c->tcp_write_pkt,
3850 					c->tcp_write_pkt_len);
				/* skip the length bytes that were already
				 * written on an earlier call */
3851 				r = SSL_write(c->ssl,
3852 					(void*)(buf+c->tcp_write_byte_count),
3853 					c->tcp_write_pkt_len + 2 -
3854 					c->tcp_write_byte_count);
3855 			} else {
				/* does not fit in the stack buffer: write
				 * only the remaining length bytes now */
3856 				r = SSL_write(c->ssl,
3857 					(void*)(((uint8_t*)&len)+c->tcp_write_byte_count),
3858 					(int)(sizeof(uint16_t)-c->tcp_write_byte_count));
3859 			}
3860 		} else if(sizeof(uint16_t)+sldns_buffer_remaining(c->buffer) <
3861 			LDNS_RR_BUF_SIZE) {
3862 			/* combine the tcp length and the query for write,
3863 			 * this emulates writev */
3864 			uint8_t buf[LDNS_RR_BUF_SIZE];
3865 			memmove(buf, &len, sizeof(uint16_t));
3866 			memmove(buf+sizeof(uint16_t),
3867 				sldns_buffer_current(c->buffer),
3868 				sldns_buffer_remaining(c->buffer));
3869 			r = SSL_write(c->ssl, (void*)(buf+c->tcp_byte_count),
3870 				(int)(sizeof(uint16_t)+
3871 				sldns_buffer_remaining(c->buffer)
3872 				- c->tcp_byte_count));
3873 		} else {
			/* does not fit in the stack buffer: write only the
			 * remaining length bytes now */
3874 			r = SSL_write(c->ssl,
3875 				(void*)(((uint8_t*)&len)+c->tcp_byte_count),
3876 				(int)(sizeof(uint16_t)-c->tcp_byte_count));
3877 		}
3878 		if(r <= 0) {
3879 			int want = SSL_get_error(c->ssl, r);
3880 			if(want == SSL_ERROR_ZERO_RETURN) {
3881 				return 0; /* closed */
3882 			} else if(want == SSL_ERROR_WANT_READ) {
				/* the write needs a read first: record that in
				 * the shake state and change the events that
				 * are listened for */
3883 				c->ssl_shake_state = comm_ssl_shake_hs_read;
3884 				comm_point_listen_for_rw(c, 1, 0);
3885 				return 1; /* wait for read condition */
3886 			} else if(want == SSL_ERROR_WANT_WRITE) {
3887 #ifdef USE_WINSOCK
3888 				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
3889 #endif
3890 				return 1; /* write more later */
3891 			} else if(want == SSL_ERROR_SYSCALL) {
3892 #ifdef EPIPE
3893 				if(errno == EPIPE && verbosity < 2)
3894 					return 0; /* silence 'broken pipe' */
3895 #endif
3896 				if(errno != 0)
3897 					log_err("SSL_write syscall: %s",
3898 						strerror(errno));
3899 				return 0;
3900 			}
3901 			log_crypto_err_io("could not SSL_write", want);
3902 			return 0;
3903 		}
		/* account for the bytes written; stop here if the length
		 * prefix itself is still incomplete */
3904 		if(c->tcp_write_and_read) {
3905 			c->tcp_write_byte_count += r;
3906 			if(c->tcp_write_byte_count < sizeof(uint16_t))
3907 				return 1;
3908 		} else {
3909 			c->tcp_byte_count += r;
3910 			if(c->tcp_byte_count < sizeof(uint16_t))
3911 				return 1;
			/* the buffer position is the number of message bytes
			 * written, without the 2 length bytes */
3912 			sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
3913 				sizeof(uint16_t));
3914 		}
		/* the combined write above may have sent everything */
3915 		if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
3916 			tcp_callback_writer(c);
3917 			return 1;
3918 		}
3919 	}
	/* The length prefix is out; message bytes remain to be written. */
3920 	log_assert(c->tcp_write_and_read || sldns_buffer_remaining(c->buffer) > 0);
3921 	log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2);
3922 	ERR_clear_error();
3923 	if(c->tcp_write_and_read) {
		/* tcp_write_byte_count includes the 2 length bytes, hence
		 * the -2 when indexing into the packet */
3924 		r = SSL_write(c->ssl, (void*)(c->tcp_write_pkt + c->tcp_write_byte_count - 2),
3925 			(int)(c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count));
3926 	} else {
3927 		r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
3928 			(int)sldns_buffer_remaining(c->buffer));
3929 	}
	/* same error handling as for the length prefix write above */
3930 	if(r <= 0) {
3931 		int want = SSL_get_error(c->ssl, r);
3932 		if(want == SSL_ERROR_ZERO_RETURN) {
3933 			return 0; /* closed */
3934 		} else if(want == SSL_ERROR_WANT_READ) {
3935 			c->ssl_shake_state = comm_ssl_shake_hs_read;
3936 			comm_point_listen_for_rw(c, 1, 0);
3937 			return 1; /* wait for read condition */
3938 		} else if(want == SSL_ERROR_WANT_WRITE) {
3939 #ifdef USE_WINSOCK
3940 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
3941 #endif
3942 			return 1; /* write more later */
3943 		} else if(want == SSL_ERROR_SYSCALL) {
3944 #ifdef EPIPE
3945 			if(errno == EPIPE && verbosity < 2)
3946 				return 0; /* silence 'broken pipe' */
3947 #endif
3948 			if(errno != 0)
3949 				log_err("SSL_write syscall: %s",
3950 					strerror(errno));
3951 			return 0;
3952 		}
3953 		log_crypto_err_io("could not SSL_write", want);
3954 		return 0;
3955 	}
3956 	if(c->tcp_write_and_read) {
3957 		c->tcp_write_byte_count += r;
3958 	} else {
3959 		sldns_buffer_skip(c->buffer, (ssize_t)r);
3960 	}
3961 
	/* everything written: hand over to the writer-done routine */
3962 	if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
3963 		tcp_callback_writer(c);
3964 	}
3965 	return 1;
3966 #else
	/* no SSL support compiled in: always report failure */
3967 	(void)c;
3968 	return 0;
3969 #endif /* HAVE_SSL */
3970 }
3971 
3972 /** handle ssl tcp connection with dns contents */
3973 static int
3974 ssl_handle_it(struct comm_point* c, int is_write)
3975 {
3976 	/* handle case where renegotiation wants read during write call
3977 	 * or write during read calls */
3978 	if(is_write && c->ssl_shake_state == comm_ssl_shake_hs_write)
3979 		return ssl_handle_read(c);
3980 	else if(!is_write && c->ssl_shake_state == comm_ssl_shake_hs_read)
3981 		return ssl_handle_write(c);
3982 	/* handle read events for read operation and write events for a
3983 	 * write operation */
3984 	else if(!is_write)
3985 		return ssl_handle_read(c);
3986 	return ssl_handle_write(c);
3987 }
3988 
3989 /**
 * Handle tcp reading callback.
 * Comm points with SSL are passed on to ssl_handle_it(). Otherwise, if a
 * PROXYv2 header is enabled and not yet processed, that header is read and
 * consumed first. After that the 2-byte length prefix and then the message
 * are read into c->buffer with recv(MSG_DONTWAIT). Once
 * sldns_buffer_remaining() is 0, tcp_callback_reader() is called.
 * @param fd: file descriptor of socket.
 * @param c: comm point to read from into buffer.
 * @param short_ok: if true, very short packets are OK (for comm_local);
 *	the LDNS_HEADER_SIZE minimum length check is skipped.
 * @return: 0 on error. When the peer closed the connection, 0 is returned,
 *	or the result of tcp_req_info_handle_read_close() if c->tcp_req_info
 *	is set. 1 if data was read or the read has to be retried later.
 */
3995 static int
3996 comm_point_tcp_handle_read(int fd, struct comm_point* c, int short_ok)
3997 {
3998 	ssize_t r;
	/* set via the recv_error_initial label; selects the extra error
	 * silencing and the "tcp initial" log text at the end */
3999 	int recv_initial = 0;
4000 	log_assert(c->type == comm_tcp || c->type == comm_local);
4001 	if(c->ssl)
4002 		return ssl_handle_it(c, 0);
	/* a read event while the comm point is not set up to read is
	 * reported as a failure */
4003 	if(!c->tcp_is_reading && !c->tcp_write_and_read)
4004 		return 0;
4005 
4006 	log_assert(fd != -1);
	/* PROXYv2 header handling. The states used here are
	 * pp2_header_none (fixed part not complete), pp2_header_init (fixed
	 * part read, variable part pending) and pp2_header_done. */
4007 	if(c->pp2_enabled && c->pp2_header_state != pp2_header_done) {
4008 		struct pp2_header* header = NULL;
4009 		size_t want_read_size = 0;
4010 		size_t current_read_size = 0;
4011 		if(c->pp2_header_state == pp2_header_none) {
4012 			want_read_size = PP2_HEADER_SIZE;
4013 			if(sldns_buffer_remaining(c->buffer)<want_read_size) {
4014 				log_err_addr("proxy_protocol: not enough "
4015 					"buffer size to read PROXYv2 header", "",
4016 					&c->repinfo.remote_addr,
4017 					c->repinfo.remote_addrlen);
4018 				return 0;
4019 			}
4020 			verbose(VERB_ALGO, "proxy_protocol: reading fixed "
4021 				"part of PROXYv2 header (len %lu)",
4022 				(unsigned long)want_read_size);
4023 			current_read_size = want_read_size;
4024 			if(c->tcp_byte_count < current_read_size) {
4025 				r = recv(fd, (void*)sldns_buffer_at(c->buffer,
4026 					c->tcp_byte_count),
4027 					current_read_size-c->tcp_byte_count, MSG_DONTWAIT);
4028 				if(r == 0) {
4029 					if(c->tcp_req_info)
4030 						return tcp_req_info_handle_read_close(c->tcp_req_info);
4031 					return 0;
4032 				} else if(r == -1) {
4033 					goto recv_error_initial;
4034 				}
4035 				c->tcp_byte_count += r;
4036 				sldns_buffer_skip(c->buffer, r);
				/* fixed part incomplete: wait for more data */
4037 				if(c->tcp_byte_count != current_read_size) return 1;
4038 				c->pp2_header_state = pp2_header_init;
4039 			}
4040 		}
4041 		if(c->pp2_header_state == pp2_header_init) {
4042 			int err;
4043 			err = pp2_read_header(
4044 				sldns_buffer_begin(c->buffer),
4045 				sldns_buffer_limit(c->buffer));
4046 			if(err) {
4047 				log_err("proxy_protocol: could not parse "
4048 					"PROXYv2 header (%s)",
4049 					pp_lookup_error(err));
4050 				return 0;
4051 			}
4052 			header = (struct pp2_header*)sldns_buffer_begin(c->buffer);
			/* header->len is the size of the variable part that
			 * follows the fixed PP2_HEADER_SIZE bytes */
4053 			want_read_size = ntohs(header->len);
4054 			if(sldns_buffer_limit(c->buffer) <
4055 				PP2_HEADER_SIZE + want_read_size) {
4056 				log_err_addr("proxy_protocol: not enough "
4057 					"buffer size to read PROXYv2 header", "",
4058 					&c->repinfo.remote_addr,
4059 					c->repinfo.remote_addrlen);
4060 				return 0;
4061 			}
4062 			verbose(VERB_ALGO, "proxy_protocol: reading variable "
4063 				"part of PROXYv2 header (len %lu)",
4064 				(unsigned long)want_read_size);
4065 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4066 			if(want_read_size == 0) {
4067 				/* nothing more to read; header is complete */
4068 				c->pp2_header_state = pp2_header_done;
4069 			} else if(c->tcp_byte_count < current_read_size) {
4070 				r = recv(fd, (void*)sldns_buffer_at(c->buffer,
4071 					c->tcp_byte_count),
4072 					current_read_size-c->tcp_byte_count, MSG_DONTWAIT);
4073 				if(r == 0) {
4074 					if(c->tcp_req_info)
4075 						return tcp_req_info_handle_read_close(c->tcp_req_info);
4076 					return 0;
4077 				} else if(r == -1) {
4078 					goto recv_error;
4079 				}
4080 				c->tcp_byte_count += r;
4081 				sldns_buffer_skip(c->buffer, r);
				/* variable part incomplete: wait for more */
4082 				if(c->tcp_byte_count != current_read_size) return 1;
4083 				c->pp2_header_state = pp2_header_done;
4084 			}
4085 		}
		/* header is only assigned in the pp2_header_init branch
		 * above, so both conditions guard against continuing without
		 * a completely read header */
4086 		if(c->pp2_header_state != pp2_header_done || !header) {
4087 			log_err_addr("proxy_protocol: wrong state for the "
4088 				"PROXYv2 header", "", &c->repinfo.remote_addr,
4089 				c->repinfo.remote_addrlen);
4090 			return 0;
4091 		}
4092 		sldns_buffer_flip(c->buffer);
4093 		if(!consume_pp2_header(c->buffer, &c->repinfo, 1)) {
4094 			log_err_addr("proxy_protocol: could not consume "
4095 				"PROXYv2 header", "", &c->repinfo.remote_addr,
4096 				c->repinfo.remote_addrlen);
4097 			return 0;
4098 		}
4099 		verbose(VERB_ALGO, "proxy_protocol: successful read of "
4100 			"PROXYv2 header");
4101 		/* Clear and reset the buffer to read the following
4102 		 * DNS packet(s). */
4104 		sldns_buffer_clear(c->buffer);
4105 		c->tcp_byte_count = 0;
		/* the DNS message itself is read on a later call */
4106 		return 1;
4107 	}
4108 
4109 	if(c->tcp_byte_count < sizeof(uint16_t)) {
4110 		/* read length bytes */
4111 		r = recv(fd,(void*)sldns_buffer_at(c->buffer,c->tcp_byte_count),
4112 			sizeof(uint16_t)-c->tcp_byte_count, MSG_DONTWAIT);
4113 		if(r == 0) {
4114 			if(c->tcp_req_info)
4115 				return tcp_req_info_handle_read_close(c->tcp_req_info);
4116 			return 0;
4117 		} else if(r == -1) {
			/* NOTE(review): presumably with PROXYv2 enabled the
			 * initial read was the header read above, so this
			 * one is not treated as initial - confirm */
4118 			if(c->pp2_enabled) goto recv_error;
4119 			goto recv_error_initial;
4120 		}
4121 		c->tcp_byte_count += r;
4122 		if(c->tcp_byte_count != sizeof(uint16_t))
4123 			return 1;
		/* the length prefix is complete: it becomes the buffer limit,
		 * after checking it against the buffer capacity */
4124 		if(sldns_buffer_read_u16_at(c->buffer, 0) >
4125 			sldns_buffer_capacity(c->buffer)) {
4126 			verbose(VERB_QUERY, "tcp: dropped larger than buffer");
4127 			return 0;
4128 		}
4129 		sldns_buffer_set_limit(c->buffer,
4130 			sldns_buffer_read_u16_at(c->buffer, 0));
4131 		if(!short_ok &&
4132 			sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
4133 			verbose(VERB_QUERY, "tcp: dropped bogus too short.");
4134 			return 0;
4135 		}
4136 		verbose(VERB_ALGO, "Reading tcp query of length %d",
4137 			(int)sldns_buffer_limit(c->buffer));
4138 	}
4139 
	/* read (more of) the message, up to the limit set from the prefix */
4140 	if(sldns_buffer_remaining(c->buffer) == 0)
4141 		log_err("in comm_point_tcp_handle_read buffer_remaining is "
4142 			"not > 0 as expected, continuing with (harmless) 0 "
4143 			"length recv");
4144 	r = recv(fd, (void*)sldns_buffer_current(c->buffer),
4145 		sldns_buffer_remaining(c->buffer), MSG_DONTWAIT);
4146 	if(r == 0) {
4147 		if(c->tcp_req_info)
4148 			return tcp_req_info_handle_read_close(c->tcp_req_info);
4149 		return 0;
4150 	} else if(r == -1) {
4151 		goto recv_error;
4152 	}
4153 	sldns_buffer_skip(c->buffer, r);
	/* message complete: pass it on */
4154 	if(sldns_buffer_remaining(c->buffer) <= 0) {
4155 		tcp_callback_reader(c);
4156 	}
4157 	return 1;
4158 
	/* Shared recv() failure handling. Entering at recv_error_initial
	 * marks the failure as one on the initial read, which silences more
	 * error codes below verbosity 2 and changes the logged text. */
4159 recv_error_initial:
4160 	recv_initial = 1;
4161 recv_error:
4162 #ifndef USE_WINSOCK
	/* interrupted or no data available yet: try again later */
4163 	if(errno == EINTR || errno == EAGAIN)
4164 		return 1;
4165 #ifdef ECONNRESET
4166 	if(errno == ECONNRESET && verbosity < 2)
4167 		return 0; /* silence reset by peer */
4168 #endif
4169 	if(recv_initial) {
4170 #ifdef ECONNREFUSED
4171 		if(errno == ECONNREFUSED && verbosity < 2)
4172 			return 0; /* silence connection refused */
4173 #endif
4174 #ifdef ENETUNREACH
4175 		if(errno == ENETUNREACH && verbosity < 2)
4176 			return 0; /* silence it */
4177 #endif
4178 #ifdef EHOSTDOWN
4179 		if(errno == EHOSTDOWN && verbosity < 2)
4180 			return 0; /* silence it */
4181 #endif
4182 #ifdef EHOSTUNREACH
4183 		if(errno == EHOSTUNREACH && verbosity < 2)
4184 			return 0; /* silence it */
4185 #endif
4186 #ifdef ENETDOWN
4187 		if(errno == ENETDOWN && verbosity < 2)
4188 			return 0; /* silence it */
4189 #endif
4190 #ifdef EACCES
4191 		if(errno == EACCES && verbosity < 2)
4192 			return 0; /* silence it */
4193 #endif
4194 #ifdef ENOTCONN
		/* ENOTCONN is always logged, with a hint about TFO */
4195 		if(errno == ENOTCONN) {
4196 			log_err_addr("read (in tcp initial) failed and this "
4197 				"could be because TCP Fast Open is "
4198 				"enabled [--disable-tfo-client "
4199 				"--disable-tfo-server] but does not "
4200 				"work", sock_strerror(errno),
4201 				&c->repinfo.remote_addr,
4202 				c->repinfo.remote_addrlen);
4203 			return 0;
4204 		}
4205 #endif
4206 	}
4207 #else /* USE_WINSOCK */
4208 	if(recv_initial) {
4209 		if(WSAGetLastError() == WSAECONNREFUSED && verbosity < 2)
4210 			return 0;
4211 		if(WSAGetLastError() == WSAEHOSTDOWN && verbosity < 2)
4212 			return 0;
4213 		if(WSAGetLastError() == WSAEHOSTUNREACH && verbosity < 2)
4214 			return 0;
4215 		if(WSAGetLastError() == WSAENETDOWN && verbosity < 2)
4216 			return 0;
4217 		if(WSAGetLastError() == WSAENETUNREACH && verbosity < 2)
4218 			return 0;
4219 	}
	/* a reset is a silent failure here, regardless of verbosity */
4220 	if(WSAGetLastError() == WSAECONNRESET)
4221 		return 0;
4222 	if(WSAGetLastError() == WSAEINPROGRESS)
4223 		return 1;
4224 	if(WSAGetLastError() == WSAEWOULDBLOCK) {
4225 		ub_winsock_tcp_wouldblock(c->ev->ev,
4226 			UB_EV_READ);
4227 		return 1;
4228 	}
4229 #endif
	/* any other error is logged with the remote address */
4230 	log_err_addr((recv_initial?"read (in tcp initial)":"read (in tcp)"),
4231 		sock_strerror(errno), &c->repinfo.remote_addr,
4232 		c->repinfo.remote_addrlen);
4233 	return 0;
4234 }
4235 
4236 /**
 * Handle tcp writing callback.
 * While nothing has been written yet and c->tcp_check_nb_connect is set,
 * the result of a nonblocking connect is checked first. Comm points with
 * SSL are then passed on to ssl_handle_it(). Otherwise the 2-byte length
 * prefix (network byte order) and the message are written to the socket,
 * optionally starting with a TCP Fast Open sendmsg().
 * The message comes from c->tcp_write_pkt when c->tcp_write_and_read is
 * set (progress in c->tcp_write_byte_count), else from the buffer
 * (progress on the length prefix in c->tcp_byte_count). Both counters
 * include the 2 bytes of the length prefix.
 * tcp_callback_writer() is called once the whole message has been written.
 * @param fd: file descriptor of socket.
 * @param c: comm point to write buffer out of.
 * @return: 0 on error, 1 if data was written or the write has to be
 *	retried later.
 */
4242 static int
4243 comm_point_tcp_handle_write(int fd, struct comm_point* c)
4244 {
4245 	ssize_t r;
4246 	struct sldns_buffer *buffer;
4247 	log_assert(c->type == comm_tcp);
	/* with dnscrypt compiled in, the dnscrypt buffer is written out */
4248 #ifdef USE_DNSCRYPT
4249 	buffer = c->dnscrypt_buffer;
4250 #else
4251 	buffer = c->buffer;
4252 #endif
	/* a write event on a plain connection that is only reading is
	 * reported as a failure */
4253 	if(c->tcp_is_reading && !c->ssl && !c->tcp_write_and_read)
4254 		return 0;
4255 	log_assert(fd != -1);
4256 	if(((!c->tcp_write_and_read && c->tcp_byte_count == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == 0)) && c->tcp_check_nb_connect) {
4257 		/* check for pending error from nonblocking connect */
4258 		/* from Stevens, unix network programming, vol1, 3rd ed, p450*/
4259 		int error = 0;
4260 		socklen_t len = (socklen_t)sizeof(error);
4261 		if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error,
4262 			&len) < 0){
4263 #ifndef USE_WINSOCK
4264 			error = errno; /* on solaris errno is error */
4265 #else /* USE_WINSOCK */
4266 			error = WSAGetLastError();
4267 #endif
4268 		}
		/* Both preprocessor branches below end inside an open
		 * "else if(error != 0) {" block; the shared "return 0; }"
		 * after the #endif closes it. */
4269 #ifndef USE_WINSOCK
4270 #if defined(EINPROGRESS) && defined(EWOULDBLOCK)
4271 		if(error == EINPROGRESS || error == EWOULDBLOCK)
4272 			return 1; /* try again later */
4273 		else
4274 #endif
4275 		if(error != 0 && verbosity < 2)
4276 			return 0; /* silence lots of chatter in the logs */
4277                 else if(error != 0) {
4278 			log_err_addr("tcp connect", strerror(error),
4279 				&c->repinfo.remote_addr,
4280 				c->repinfo.remote_addrlen);
4281 #else /* USE_WINSOCK */
4282 		/* examine error */
4283 		if(error == WSAEINPROGRESS)
4284 			return 1;
4285 		else if(error == WSAEWOULDBLOCK) {
4286 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
4287 			return 1;
4288 		} else if(error != 0 && verbosity < 2)
4289 			return 0;
4290 		else if(error != 0) {
4291 			log_err_addr("tcp connect", wsa_strerror(error),
4292 				&c->repinfo.remote_addr,
4293 				c->repinfo.remote_addrlen);
4294 #endif /* USE_WINSOCK */
4295 			return 0;
4296 		}
4297 	}
4298 	if(c->ssl)
4299 		return ssl_handle_it(c, 1);
4300 
4301 #ifdef USE_MSG_FASTOPEN
4302 	/* Only try this on first use of a connection that uses tfo,
4303 	   otherwise fall through to normal write */
4304 	/* Also, TFO support on WINDOWS not implemented at the moment */
4305 	if(c->tcp_do_fastopen == 1) {
4306 		/* this form of sendmsg() does both a connect() and send() so need to
4307 		   look for various flavours of error*/
4308 		uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer));
4309 		struct msghdr msg;
4310 		struct iovec iov[2];
		/* cleared before the attempt, so it is made only once */
4311 		c->tcp_do_fastopen = 0;
4312 		memset(&msg, 0, sizeof(msg));
		/* iov[0] holds the unwritten length bytes, iov[1] the
		 * message */
4313 		if(c->tcp_write_and_read) {
4314 			iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count;
4315 			iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count;
4316 			iov[1].iov_base = c->tcp_write_pkt;
4317 			iov[1].iov_len = c->tcp_write_pkt_len;
4318 		} else {
4319 			iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count;
4320 			iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count;
4321 			iov[1].iov_base = sldns_buffer_begin(buffer);
4322 			iov[1].iov_len = sldns_buffer_limit(buffer);
4323 		}
4324 		log_assert(iov[0].iov_len > 0);
4325 		msg.msg_name = &c->repinfo.remote_addr;
4326 		msg.msg_namelen = c->repinfo.remote_addrlen;
4327 		msg.msg_iov = iov;
4328 		msg.msg_iovlen = 2;
4329 		r = sendmsg(fd, &msg, MSG_FASTOPEN);
4330 		if (r == -1) {
4331 #if defined(EINPROGRESS) && defined(EWOULDBLOCK)
4332 			/* Handshake is underway, maybe because no TFO cookie available.
4333 			   Come back to write the message*/
4334 			if(errno == EINPROGRESS || errno == EWOULDBLOCK)
4335 				return 1;
4336 #endif
4337 			if(errno == EINTR || errno == EAGAIN)
4338 				return 1;
4339 			/* Not handling EISCONN here as shouldn't ever hit that case.*/
			/* EPIPE, EOPNOTSUPP (if defined) and errno 0 continue
			 * to the connect() below; other errors fail here */
4340 			if(errno != EPIPE
4341 #ifdef EOPNOTSUPP
4342 				/* if /proc/sys/net/ipv4/tcp_fastopen is
4343 				 * disabled on Linux, sendmsg may return
4344 				 * 'Operation not supported', if so
4345 				 * fallthrough to ordinary connect. */
4346 				&& errno != EOPNOTSUPP
4347 #endif
4348 				&& errno != 0) {
4349 				if(verbosity < 2)
4350 					return 0; /* silence lots of chatter in the logs */
4351 				log_err_addr("tcp sendmsg", strerror(errno),
4352 					&c->repinfo.remote_addr,
4353 					c->repinfo.remote_addrlen);
4354 				return 0;
4355 			}
4356 			verbose(VERB_ALGO, "tcp sendmsg for fastopen failed (with %s), try normal connect", strerror(errno));
4357 			/* fallthrough to nonFASTOPEN
4358 			 * (MSG_FASTOPEN on Linux 3 produces EPIPE)
4359 			 * we need to perform connect() */
4360 			if(connect(fd, (struct sockaddr *)&c->repinfo.remote_addr,
4361 				c->repinfo.remote_addrlen) == -1) {
4362 #ifdef EINPROGRESS
4363 				if(errno == EINPROGRESS)
4364 					return 1; /* wait until connect done*/
4365 #endif
4366 #ifdef USE_WINSOCK
4367 				if(WSAGetLastError() == WSAEINPROGRESS ||
4368 					WSAGetLastError() == WSAEWOULDBLOCK)
4369 					return 1; /* wait until connect done*/
4370 #endif
4371 				if(tcp_connect_errno_needs_log(
4372 					(struct sockaddr *)&c->repinfo.remote_addr,
4373 					c->repinfo.remote_addrlen)) {
4374 					log_err_addr("outgoing tcp: connect after EPIPE for fastopen",
4375 						strerror(errno),
4376 						&c->repinfo.remote_addr,
4377 						c->repinfo.remote_addrlen);
4378 				}
4379 				return 0;
4380 			}
4381 
4382 		} else {
			/* sendmsg wrote r bytes: account for them like the
			 * regular length prefix write below does */
4383 			if(c->tcp_write_and_read) {
4384 				c->tcp_write_byte_count += r;
4385 				if(c->tcp_write_byte_count < sizeof(uint16_t))
4386 					return 1;
4387 			} else {
4388 				c->tcp_byte_count += r;
4389 				if(c->tcp_byte_count < sizeof(uint16_t))
4390 					return 1;
4391 				sldns_buffer_set_position(buffer, c->tcp_byte_count -
4392 					sizeof(uint16_t));
4393 			}
4394 			if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
4395 				tcp_callback_writer(c);
4396 				return 1;
4397 			}
4398 		}
4399 	}
4400 #endif /* USE_MSG_FASTOPEN */
4401 
	/* Fewer than 2 bytes written so far: (part of) the length prefix is
	 * still pending. */
4402 	if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) {
4403 		uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer));
4404 #ifdef HAVE_WRITEV
		/* with writev the unwritten length bytes and the message are
		 * offered to the kernel in one call */
4405 		struct iovec iov[2];
4406 		if(c->tcp_write_and_read) {
4407 			iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count;
4408 			iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count;
4409 			iov[1].iov_base = c->tcp_write_pkt;
4410 			iov[1].iov_len = c->tcp_write_pkt_len;
4411 		} else {
4412 			iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count;
4413 			iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count;
4414 			iov[1].iov_base = sldns_buffer_begin(buffer);
4415 			iov[1].iov_len = sldns_buffer_limit(buffer);
4416 		}
4417 		log_assert(iov[0].iov_len > 0);
4418 		r = writev(fd, iov, 2);
4419 #else /* HAVE_WRITEV */
		/* without writev only the unwritten length bytes are sent
		 * here; the message follows in the send() further below */
4420 		if(c->tcp_write_and_read) {
4421 			r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_write_byte_count),
4422 				sizeof(uint16_t)-c->tcp_write_byte_count, 0);
4423 		} else {
4424 			r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_byte_count),
4425 				sizeof(uint16_t)-c->tcp_byte_count, 0);
4426 		}
4427 #endif /* HAVE_WRITEV */
4428 		if(r == -1) {
4429 #ifndef USE_WINSOCK
4430 #  ifdef EPIPE
4431                 	if(errno == EPIPE && verbosity < 2)
4432                         	return 0; /* silence 'broken pipe' */
4433   #endif
4434 			if(errno == EINTR || errno == EAGAIN)
4435 				return 1;
4436 #ifdef ECONNRESET
4437 			if(errno == ECONNRESET && verbosity < 2)
4438 				return 0; /* silence reset by peer */
4439 #endif
4440 #  ifdef HAVE_WRITEV
4441 			log_err_addr("tcp writev", strerror(errno),
4442 				&c->repinfo.remote_addr,
4443 				c->repinfo.remote_addrlen);
4444 #  else /* HAVE_WRITEV */
4445 			log_err_addr("tcp send s", strerror(errno),
4446 				&c->repinfo.remote_addr,
4447 				c->repinfo.remote_addrlen);
4448 #  endif /* HAVE_WRITEV */
4449 #else
4450 			if(WSAGetLastError() == WSAENOTCONN)
4451 				return 1;
4452 			if(WSAGetLastError() == WSAEINPROGRESS)
4453 				return 1;
4454 			if(WSAGetLastError() == WSAEWOULDBLOCK) {
4455 				ub_winsock_tcp_wouldblock(c->ev->ev,
4456 					UB_EV_WRITE);
4457 				return 1;
4458 			}
4459 			if(WSAGetLastError() == WSAECONNRESET && verbosity < 2)
4460 				return 0; /* silence reset by peer */
4461 			log_err_addr("tcp send s",
4462 				wsa_strerror(WSAGetLastError()),
4463 				&c->repinfo.remote_addr,
4464 				c->repinfo.remote_addrlen);
4465 #endif
4466 			return 0;
4467 		}
		/* account for the bytes written; stop here if the length
		 * prefix itself is still incomplete */
4468 		if(c->tcp_write_and_read) {
4469 			c->tcp_write_byte_count += r;
4470 			if(c->tcp_write_byte_count < sizeof(uint16_t))
4471 				return 1;
4472 		} else {
4473 			c->tcp_byte_count += r;
4474 			if(c->tcp_byte_count < sizeof(uint16_t))
4475 				return 1;
			/* the buffer position is the number of message bytes
			 * written, without the 2 length bytes */
4476 			sldns_buffer_set_position(buffer, c->tcp_byte_count -
4477 				sizeof(uint16_t));
4478 		}
		/* the write above may have sent everything */
4479 		if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
4480 			tcp_callback_writer(c);
4481 			return 1;
4482 		}
4483 	}
	/* The length prefix is out; message bytes remain to be written. */
4484 	log_assert(c->tcp_write_and_read || sldns_buffer_remaining(buffer) > 0);
4485 	log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2);
4486 	if(c->tcp_write_and_read) {
		/* tcp_write_byte_count includes the 2 length bytes, hence
		 * the -2 when indexing into the packet */
4487 		r = send(fd, (void*)(c->tcp_write_pkt + c->tcp_write_byte_count - 2),
4488 			c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count, 0);
4489 	} else {
4490 		r = send(fd, (void*)sldns_buffer_current(buffer),
4491 			sldns_buffer_remaining(buffer), 0);
4492 	}
4493 	if(r == -1) {
4494 #ifndef USE_WINSOCK
4495 		if(errno == EINTR || errno == EAGAIN)
4496 			return 1;
4497 #ifdef ECONNRESET
4498 		if(errno == ECONNRESET && verbosity < 2)
4499 			return 0; /* silence reset by peer */
4500 #endif
4501 #else
4502 		if(WSAGetLastError() == WSAEINPROGRESS)
4503 			return 1;
4504 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
4505 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
4506 			return 1;
4507 		}
4508 		if(WSAGetLastError() == WSAECONNRESET && verbosity < 2)
4509 			return 0; /* silence reset by peer */
4510 #endif
4511 		log_err_addr("tcp send r", sock_strerror(errno),
4512 			&c->repinfo.remote_addr,
4513 			c->repinfo.remote_addrlen);
4514 		return 0;
4515 	}
4516 	if(c->tcp_write_and_read) {
4517 		c->tcp_write_byte_count += r;
4518 	} else {
4519 		sldns_buffer_skip(buffer, r);
4520 	}
4521 
	/* everything written: hand over to the writer-done routine */
4522 	if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
4523 		tcp_callback_writer(c);
4524 	}
4525 
4526 	return 1;
4527 }
4528 
4529 /** read again to drain buffers when there could be more to read, returns 0
4530  * on failure which means the comm point is closed. */
4531 static int
4532 tcp_req_info_read_again(int fd, struct comm_point* c)
4533 {
4534 	while(c->tcp_req_info->read_again) {
4535 		int r;
4536 		c->tcp_req_info->read_again = 0;
4537 		if(c->tcp_is_reading)
4538 			r = comm_point_tcp_handle_read(fd, c, 0);
4539 		else 	r = comm_point_tcp_handle_write(fd, c);
4540 		if(!r) {
4541 			reclaim_tcp_handler(c);
4542 			if(!c->tcp_do_close) {
4543 				fptr_ok(fptr_whitelist_comm_point(
4544 					c->callback));
4545 				(void)(*c->callback)(c, c->cb_arg,
4546 					NETEVENT_CLOSED, NULL);
4547 			}
4548 			return 0;
4549 		}
4550 	}
4551 	return 1;
4552 }
4553 
4554 /** read again to drain buffers when there could be more to read */
4555 static void
4556 tcp_more_read_again(int fd, struct comm_point* c)
4557 {
4558 	/* if the packet is done, but another one could be waiting on
4559 	 * the connection, the callback signals this, and we try again */
4560 	/* this continues until the read routines get EAGAIN or so,
4561 	 * and thus does not call the callback, and the bool is 0 */
4562 	int* moreread = c->tcp_more_read_again;
4563 	while(moreread && *moreread) {
4564 		*moreread = 0;
4565 		if(!comm_point_tcp_handle_read(fd, c, 0)) {
4566 			reclaim_tcp_handler(c);
4567 			if(!c->tcp_do_close) {
4568 				fptr_ok(fptr_whitelist_comm_point(
4569 					c->callback));
4570 				(void)(*c->callback)(c, c->cb_arg,
4571 					NETEVENT_CLOSED, NULL);
4572 			}
4573 			return;
4574 		}
4575 	}
4576 }
4577 
4578 /** write again to fill up when there could be more to write */
4579 static void
4580 tcp_more_write_again(int fd, struct comm_point* c)
4581 {
4582 	/* if the packet is done, but another is waiting to be written,
4583 	 * the callback signals it and we try again. */
4584 	/* this continues until the write routines get EAGAIN or so,
4585 	 * and thus does not call the callback, and the bool is 0 */
4586 	int* morewrite = c->tcp_more_write_again;
4587 	while(morewrite && *morewrite) {
4588 		*morewrite = 0;
4589 		if(!comm_point_tcp_handle_write(fd, c)) {
4590 			reclaim_tcp_handler(c);
4591 			if(!c->tcp_do_close) {
4592 				fptr_ok(fptr_whitelist_comm_point(
4593 					c->callback));
4594 				(void)(*c->callback)(c, c->cb_arg,
4595 					NETEVENT_CLOSED, NULL);
4596 			}
4597 			return;
4598 		}
4599 	}
4600 }
4601 
4602 void
4603 comm_point_tcp_handle_callback(int fd, short event, void* arg)
4604 {
4605 	struct comm_point* c = (struct comm_point*)arg;
4606 	log_assert(c->type == comm_tcp);
4607 	ub_comm_base_now(c->ev->base);
4608 
4609 	if(c->fd == -1 || c->fd != fd)
4610 		return; /* duplicate event, but commpoint closed. */
4611 
4612 #ifdef USE_DNSCRYPT
4613 	/* Initialize if this is a dnscrypt socket */
4614 	if(c->tcp_parent) {
4615 		c->dnscrypt = c->tcp_parent->dnscrypt;
4616 	}
4617 	if(c->dnscrypt && c->dnscrypt_buffer == c->buffer) {
4618 		c->dnscrypt_buffer = sldns_buffer_new(sldns_buffer_capacity(c->buffer));
4619 		if(!c->dnscrypt_buffer) {
4620 			log_err("Could not allocate dnscrypt buffer");
4621 			reclaim_tcp_handler(c);
4622 			if(!c->tcp_do_close) {
4623 				fptr_ok(fptr_whitelist_comm_point(
4624 					c->callback));
4625 				(void)(*c->callback)(c, c->cb_arg,
4626 					NETEVENT_CLOSED, NULL);
4627 			}
4628 			return;
4629 		}
4630 	}
4631 #endif
4632 
4633 	if(event&UB_EV_TIMEOUT) {
4634 		verbose(VERB_QUERY, "tcp took too long, dropped");
4635 		reclaim_tcp_handler(c);
4636 		if(!c->tcp_do_close) {
4637 			fptr_ok(fptr_whitelist_comm_point(c->callback));
4638 			(void)(*c->callback)(c, c->cb_arg,
4639 				NETEVENT_TIMEOUT, NULL);
4640 		}
4641 		return;
4642 	}
4643 	if(event&UB_EV_READ
4644 #ifdef USE_MSG_FASTOPEN
4645 		&& !(c->tcp_do_fastopen && (event&UB_EV_WRITE))
4646 #endif
4647 		) {
4648 		int has_tcpq = (c->tcp_req_info != NULL);
4649 		int* moreread = c->tcp_more_read_again;
4650 		if(!comm_point_tcp_handle_read(fd, c, 0)) {
4651 			reclaim_tcp_handler(c);
4652 			if(!c->tcp_do_close) {
4653 				fptr_ok(fptr_whitelist_comm_point(
4654 					c->callback));
4655 				(void)(*c->callback)(c, c->cb_arg,
4656 					NETEVENT_CLOSED, NULL);
4657 			}
4658 			return;
4659 		}
4660 		if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) {
4661 			if(!tcp_req_info_read_again(fd, c))
4662 				return;
4663 		}
4664 		if(moreread && *moreread)
4665 			tcp_more_read_again(fd, c);
4666 		return;
4667 	}
4668 	if(event&UB_EV_WRITE) {
4669 		int has_tcpq = (c->tcp_req_info != NULL);
4670 		int* morewrite = c->tcp_more_write_again;
4671 		if(!comm_point_tcp_handle_write(fd, c)) {
4672 			reclaim_tcp_handler(c);
4673 			if(!c->tcp_do_close) {
4674 				fptr_ok(fptr_whitelist_comm_point(
4675 					c->callback));
4676 				(void)(*c->callback)(c, c->cb_arg,
4677 					NETEVENT_CLOSED, NULL);
4678 			}
4679 			return;
4680 		}
4681 		if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) {
4682 			if(!tcp_req_info_read_again(fd, c))
4683 				return;
4684 		}
4685 		if(morewrite && *morewrite)
4686 			tcp_more_write_again(fd, c);
4687 		return;
4688 	}
4689 	log_err("Ignored event %d for tcphdl.", event);
4690 }
4691 
4692 /** Make http handler free for next assignment */
4693 static void
4694 reclaim_http_handler(struct comm_point* c)
4695 {
4696 	log_assert(c->type == comm_http);
4697 	if(c->ssl) {
4698 #ifdef HAVE_SSL
4699 		SSL_shutdown(c->ssl);
4700 		SSL_free(c->ssl);
4701 		c->ssl = NULL;
4702 #endif
4703 	}
4704 	comm_point_close(c);
4705 	if(c->tcp_parent) {
4706 		if(c != c->tcp_parent->tcp_free) {
4707 			c->tcp_parent->cur_tcp_count--;
4708 			c->tcp_free = c->tcp_parent->tcp_free;
4709 			c->tcp_parent->tcp_free = c;
4710 		}
4711 		if(!c->tcp_free) {
4712 			/* re-enable listening on accept socket */
4713 			comm_point_start_listening(c->tcp_parent, -1, -1);
4714 		}
4715 	}
4716 }
4717 
/** read more http data over ssl into c->buffer.
 * returns 0 on close or error, 1 if data was read or the read has to be
 * retried later. Without HAVE_SSL it always returns 0. */
static int
ssl_http_read_more(struct comm_point* c)
{
#ifdef HAVE_SSL
	int nread;
	int sslerr;
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	nread = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(nread > 0) {
		/* got data, advance the buffer past it */
		verbose(VERB_ALGO, "ssl http read more skip to %d + %d",
			(int)sldns_buffer_position(c->buffer), (int)nread);
		sldns_buffer_skip(c->buffer, (ssize_t)nread);
		return 1;
	}

	sslerr = SSL_get_error(c->ssl, nread);
	switch(sslerr) {
	case SSL_ERROR_ZERO_RETURN:
		/* shutdown, closed */
		return 0;
	case SSL_ERROR_WANT_READ:
		/* read more later */
		return 1;
	case SSL_ERROR_WANT_WRITE:
		/* ssl wants to write first; wait for writability */
		c->ssl_shake_state = comm_ssl_shake_hs_write;
		comm_point_listen_for_rw(c, 0, 1);
		return 1;
	case SSL_ERROR_SYSCALL:
#ifdef ECONNRESET
		/* silence reset by peer */
		if(errno == ECONNRESET && verbosity < 2)
			return 0;
#endif
		if(errno != 0)
			log_err("SSL_read syscall: %s",
				strerror(errno));
		return 0;
	default:
		break;
	}
	log_crypto_err_io("could not SSL_read", sslerr);
	return 0;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
4760 
4761 /** read more data for http */
4762 static int
4763 http_read_more(int fd, struct comm_point* c)
4764 {
4765 	ssize_t r;
4766 	log_assert(sldns_buffer_remaining(c->buffer) > 0);
4767 	r = recv(fd, (void*)sldns_buffer_current(c->buffer),
4768 		sldns_buffer_remaining(c->buffer), MSG_DONTWAIT);
4769 	if(r == 0) {
4770 		return 0;
4771 	} else if(r == -1) {
4772 #ifndef USE_WINSOCK
4773 		if(errno == EINTR || errno == EAGAIN)
4774 			return 1;
4775 #else /* USE_WINSOCK */
4776 		if(WSAGetLastError() == WSAECONNRESET)
4777 			return 0;
4778 		if(WSAGetLastError() == WSAEINPROGRESS)
4779 			return 1;
4780 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
4781 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
4782 			return 1;
4783 		}
4784 #endif
4785 		log_err_addr("read (in http r)", sock_strerror(errno),
4786 			&c->repinfo.remote_addr, c->repinfo.remote_addrlen);
4787 		return 0;
4788 	}
4789 	verbose(VERB_ALGO, "http read more skip to %d + %d",
4790 		(int)sldns_buffer_position(c->buffer), (int)r);
4791 	sldns_buffer_skip(c->buffer, r);
4792 	return 1;
4793 }
4794 
4795 /** return true if http header has been read (one line complete) */
4796 static int
4797 http_header_done(sldns_buffer* buf)
4798 {
4799 	size_t i;
4800 	for(i=sldns_buffer_position(buf); i<sldns_buffer_limit(buf); i++) {
4801 		/* there was a \r before the \n, but we ignore that */
4802 		if((char)sldns_buffer_read_u8_at(buf, i) == '\n')
4803 			return 1;
4804 	}
4805 	return 0;
4806 }
4807 
4808 /** return character string into buffer for header line, moves buffer
4809  * past that line and puts zero terminator into linefeed-newline */
4810 static char*
4811 http_header_line(sldns_buffer* buf)
4812 {
4813 	char* result = (char*)sldns_buffer_current(buf);
4814 	size_t i;
4815 	for(i=sldns_buffer_position(buf); i<sldns_buffer_limit(buf); i++) {
4816 		/* terminate the string on the \r */
4817 		if((char)sldns_buffer_read_u8_at(buf, i) == '\r')
4818 			sldns_buffer_write_u8_at(buf, i, 0);
4819 		/* terminate on the \n and skip past the it and done */
4820 		if((char)sldns_buffer_read_u8_at(buf, i) == '\n') {
4821 			sldns_buffer_write_u8_at(buf, i, 0);
4822 			sldns_buffer_set_position(buf, i+1);
4823 			return result;
4824 		}
4825 	}
4826 	return NULL;
4827 }
4828 
4829 /** move unread buffer to start and clear rest for putting the rest into it */
4830 static void
4831 http_moveover_buffer(sldns_buffer* buf)
4832 {
4833 	size_t pos = sldns_buffer_position(buf);
4834 	size_t len = sldns_buffer_remaining(buf);
4835 	sldns_buffer_clear(buf);
4836 	memmove(sldns_buffer_begin(buf), sldns_buffer_at(buf, pos), len);
4837 	sldns_buffer_set_position(buf, len);
4838 }
4839 
4840 /** a http header is complete, process it */
4841 static int
4842 http_process_initial_header(struct comm_point* c)
4843 {
4844 	char* line = http_header_line(c->buffer);
4845 	if(!line) return 1;
4846 	verbose(VERB_ALGO, "http header: %s", line);
4847 	if(strncasecmp(line, "HTTP/1.1 ", 9) == 0) {
4848 		/* check returncode */
4849 		if(line[9] != '2') {
4850 			verbose(VERB_ALGO, "http bad status %s", line+9);
4851 			return 0;
4852 		}
4853 	} else if(strncasecmp(line, "Content-Length: ", 16) == 0) {
4854 		if(!c->http_is_chunked)
4855 			c->tcp_byte_count = (size_t)atoi(line+16);
4856 	} else if(strncasecmp(line, "Transfer-Encoding: chunked", 19+7) == 0) {
4857 		c->tcp_byte_count = 0;
4858 		c->http_is_chunked = 1;
4859 	} else if(line[0] == 0) {
4860 		/* end of initial headers */
4861 		c->http_in_headers = 0;
4862 		if(c->http_is_chunked)
4863 			c->http_in_chunk_headers = 1;
4864 		/* remove header text from front of buffer
4865 		 * the buffer is going to be used to return the data segment
4866 		 * itself and we don't want the header to get returned
4867 		 * prepended with it */
4868 		http_moveover_buffer(c->buffer);
4869 		sldns_buffer_flip(c->buffer);
4870 		return 1;
4871 	}
4872 	/* ignore other headers */
4873 	return 1;
4874 }
4875 
4876 /** a chunk header is complete, process it, return 0=fail, 1=continue next
4877  * header line, 2=done with chunked transfer*/
4878 static int
4879 http_process_chunk_header(struct comm_point* c)
4880 {
4881 	char* line = http_header_line(c->buffer);
4882 	if(!line) return 1;
4883 	if(c->http_in_chunk_headers == 3) {
4884 		verbose(VERB_ALGO, "http chunk trailer: %s", line);
4885 		/* are we done ? */
4886 		if(line[0] == 0 && c->tcp_byte_count == 0) {
4887 			/* callback of http reader when NETEVENT_DONE,
4888 			 * end of data, with no data in buffer */
4889 			sldns_buffer_set_position(c->buffer, 0);
4890 			sldns_buffer_set_limit(c->buffer, 0);
4891 			fptr_ok(fptr_whitelist_comm_point(c->callback));
4892 			(void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL);
4893 			/* return that we are done */
4894 			return 2;
4895 		}
4896 		if(line[0] == 0) {
4897 			/* continue with header of the next chunk */
4898 			c->http_in_chunk_headers = 1;
4899 			/* remove header text from front of buffer */
4900 			http_moveover_buffer(c->buffer);
4901 			sldns_buffer_flip(c->buffer);
4902 			return 1;
4903 		}
4904 		/* ignore further trail headers */
4905 		return 1;
4906 	}
4907 	verbose(VERB_ALGO, "http chunk header: %s", line);
4908 	if(c->http_in_chunk_headers == 1) {
4909 		/* read chunked start line */
4910 		char* end = NULL;
4911 		c->tcp_byte_count = (size_t)strtol(line, &end, 16);
4912 		if(end == line)
4913 			return 0;
4914 		c->http_in_chunk_headers = 0;
4915 		/* remove header text from front of buffer */
4916 		http_moveover_buffer(c->buffer);
4917 		sldns_buffer_flip(c->buffer);
4918 		if(c->tcp_byte_count == 0) {
4919 			/* done with chunks, process chunk_trailer lines */
4920 			c->http_in_chunk_headers = 3;
4921 		}
4922 		return 1;
4923 	}
4924 	/* ignore other headers */
4925 	return 1;
4926 }
4927 
4928 /** handle nonchunked data segment, 0=fail, 1=wait */
4929 static int
4930 http_nonchunk_segment(struct comm_point* c)
4931 {
4932 	/* c->buffer at position..limit has new data we read in.
4933 	 * the buffer itself is full of nonchunked data.
4934 	 * we are looking to read tcp_byte_count more data
4935 	 * and then the transfer is done. */
4936 	size_t remainbufferlen;
4937 	size_t got_now = sldns_buffer_limit(c->buffer);
4938 	if(c->tcp_byte_count <= got_now) {
4939 		/* done, this is the last data fragment */
4940 		c->http_stored = 0;
4941 		sldns_buffer_set_position(c->buffer, 0);
4942 		fptr_ok(fptr_whitelist_comm_point(c->callback));
4943 		(void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL);
4944 		return 1;
4945 	}
4946 	/* if we have the buffer space,
4947 	 * read more data collected into the buffer */
4948 	remainbufferlen = sldns_buffer_capacity(c->buffer) -
4949 		sldns_buffer_limit(c->buffer);
4950 	if(remainbufferlen+got_now >= c->tcp_byte_count ||
4951 		remainbufferlen >= (size_t)(c->ssl?16384:2048)) {
4952 		size_t total = sldns_buffer_limit(c->buffer);
4953 		sldns_buffer_clear(c->buffer);
4954 		sldns_buffer_set_position(c->buffer, total);
4955 		c->http_stored = total;
4956 		/* return and wait to read more */
4957 		return 1;
4958 	}
4959 	/* call callback with this data amount, then
4960 	 * wait for more */
4961 	c->tcp_byte_count -= got_now;
4962 	c->http_stored = 0;
4963 	sldns_buffer_set_position(c->buffer, 0);
4964 	fptr_ok(fptr_whitelist_comm_point(c->callback));
4965 	(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL);
4966 	/* c->callback has to buffer_clear(c->buffer). */
4967 	/* return and wait to read more */
4968 	return 1;
4969 }
4970 
4971 /** handle chunked data segment, return 0=fail, 1=wait, 2=process more */
4972 static int
4973 http_chunked_segment(struct comm_point* c)
4974 {
4975 	/* the c->buffer has from position..limit new data we read. */
4976 	/* the current chunk has length tcp_byte_count.
4977 	 * once we read that read more chunk headers.
4978 	 */
4979 	size_t remainbufferlen;
4980 	size_t got_now = sldns_buffer_limit(c->buffer) - c->http_stored;
4981 	verbose(VERB_ALGO, "http_chunked_segment: got now %d, tcpbytcount %d, http_stored %d, buffer pos %d, buffer limit %d", (int)got_now, (int)c->tcp_byte_count, (int)c->http_stored, (int)sldns_buffer_position(c->buffer), (int)sldns_buffer_limit(c->buffer));
4982 	if(c->tcp_byte_count <= got_now) {
4983 		/* the chunk has completed (with perhaps some extra data
4984 		 * from next chunk header and next chunk) */
4985 		/* save too much info into temp buffer */
4986 		size_t fraglen;
4987 		struct comm_reply repinfo;
4988 		c->http_stored = 0;
4989 		sldns_buffer_skip(c->buffer, (ssize_t)c->tcp_byte_count);
4990 		sldns_buffer_clear(c->http_temp);
4991 		sldns_buffer_write(c->http_temp,
4992 			sldns_buffer_current(c->buffer),
4993 			sldns_buffer_remaining(c->buffer));
4994 		sldns_buffer_flip(c->http_temp);
4995 
4996 		/* callback with this fragment */
4997 		fraglen = sldns_buffer_position(c->buffer);
4998 		sldns_buffer_set_position(c->buffer, 0);
4999 		sldns_buffer_set_limit(c->buffer, fraglen);
5000 		repinfo = c->repinfo;
5001 		fptr_ok(fptr_whitelist_comm_point(c->callback));
5002 		(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &repinfo);
5003 		/* c->callback has to buffer_clear(). */
5004 
5005 		/* is commpoint deleted? */
5006 		if(!repinfo.c) {
5007 			return 1;
5008 		}
5009 		/* copy waiting info */
5010 		sldns_buffer_clear(c->buffer);
5011 		sldns_buffer_write(c->buffer,
5012 			sldns_buffer_begin(c->http_temp),
5013 			sldns_buffer_remaining(c->http_temp));
5014 		sldns_buffer_flip(c->buffer);
5015 		/* process end of chunk trailer header lines, until
5016 		 * an empty line */
5017 		c->http_in_chunk_headers = 3;
5018 		/* process more data in buffer (if any) */
5019 		return 2;
5020 	}
5021 	c->tcp_byte_count -= got_now;
5022 
5023 	/* if we have the buffer space,
5024 	 * read more data collected into the buffer */
5025 	remainbufferlen = sldns_buffer_capacity(c->buffer) -
5026 		sldns_buffer_limit(c->buffer);
5027 	if(remainbufferlen >= c->tcp_byte_count ||
5028 		remainbufferlen >= 2048) {
5029 		size_t total = sldns_buffer_limit(c->buffer);
5030 		sldns_buffer_clear(c->buffer);
5031 		sldns_buffer_set_position(c->buffer, total);
5032 		c->http_stored = total;
5033 		/* return and wait to read more */
5034 		return 1;
5035 	}
5036 
5037 	/* callback of http reader for a new part of the data */
5038 	c->http_stored = 0;
5039 	sldns_buffer_set_position(c->buffer, 0);
5040 	fptr_ok(fptr_whitelist_comm_point(c->callback));
5041 	(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL);
5042 	/* c->callback has to buffer_clear(c->buffer). */
5043 	/* return and wait to read more */
5044 	return 1;
5045 }
5046 
5047 #ifdef HAVE_NGHTTP2
5048 /** Create new http2 session. Called when creating handling comm point. */
5049 static struct http2_session* http2_session_create(struct comm_point* c)
5050 {
5051 	struct http2_session* session = calloc(1, sizeof(*session));
5052 	if(!session) {
5053 		log_err("malloc failure while creating http2 session");
5054 		return NULL;
5055 	}
5056 	session->c = c;
5057 
5058 	return session;
5059 }
5060 #endif
5061 
/** Free an http2 session together with its nghttp2 callbacks object.
 * Used after closing the connection or on error. Without HAVE_NGHTTP2
 * this does nothing. */
static void http2_session_delete(struct http2_session* h2_session)
{
#ifdef HAVE_NGHTTP2
	if(h2_session->callbacks != NULL) {
		nghttp2_session_callbacks_del(h2_session->callbacks);
	}
	free(h2_session);
#else
	(void)h2_session;
#endif
}
5073 
5074 #ifdef HAVE_NGHTTP2
5075 struct http2_stream* http2_stream_create(int32_t stream_id)
5076 {
5077 	struct http2_stream* h2_stream = calloc(1, sizeof(*h2_stream));
5078 	if(!h2_stream) {
5079 		log_err("malloc failure while creating http2 stream");
5080 		return NULL;
5081 	}
5082 	h2_stream->stream_id = stream_id;
5083 	return h2_stream;
5084 }
5085 #endif
5086 
5087 void http2_stream_add_meshstate(struct http2_stream* h2_stream,
5088 	struct mesh_area* mesh, struct mesh_state* m)
5089 {
5090 	h2_stream->mesh = mesh;
5091 	h2_stream->mesh_state = m;
5092 }
5093 
5094 void http2_stream_remove_mesh_state(struct http2_stream* h2_stream)
5095 {
5096 	if(!h2_stream)
5097 		return;
5098 	h2_stream->mesh_state = NULL;
5099 }
5100 
5101 #ifdef HAVE_NGHTTP2
5102 void http2_session_add_stream(struct http2_session* h2_session,
5103 	struct http2_stream* h2_stream)
5104 {
5105 	if(h2_session->first_stream)
5106 		h2_session->first_stream->prev = h2_stream;
5107 	h2_stream->next = h2_session->first_stream;
5108 	h2_session->first_stream = h2_stream;
5109 }
5110 
5111 /** remove stream from session linked list. After stream close callback or
5112  * closing connection */
5113 static void http2_session_remove_stream(struct http2_session* h2_session,
5114 	struct http2_stream* h2_stream)
5115 {
5116 	if(h2_stream->prev)
5117 		h2_stream->prev->next = h2_stream->next;
5118 	else
5119 		h2_session->first_stream = h2_stream->next;
5120 	if(h2_stream->next)
5121 		h2_stream->next->prev = h2_stream->prev;
5122 
5123 }
5124 
5125 int http2_stream_close_cb(nghttp2_session* ATTR_UNUSED(session),
5126 	int32_t stream_id, uint32_t ATTR_UNUSED(error_code), void* cb_arg)
5127 {
5128 	struct http2_stream* h2_stream;
5129 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
5130 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
5131 		h2_session->session, stream_id))) {
5132 		return 0;
5133 	}
5134 	http2_session_remove_stream(h2_session, h2_stream);
5135 	http2_stream_delete(h2_session, h2_stream);
5136 	return 0;
5137 }
5138 
5139 ssize_t http2_recv_cb(nghttp2_session* ATTR_UNUSED(session), uint8_t* buf,
5140 	size_t len, int ATTR_UNUSED(flags), void* cb_arg)
5141 {
5142 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
5143 	ssize_t ret;
5144 
5145 	log_assert(h2_session->c->type == comm_http);
5146 	log_assert(h2_session->c->h2_session);
5147 
5148 #ifdef HAVE_SSL
5149 	if(h2_session->c->ssl) {
5150 		int r;
5151 		ERR_clear_error();
5152 		r = SSL_read(h2_session->c->ssl, buf, len);
5153 		if(r <= 0) {
5154 			int want = SSL_get_error(h2_session->c->ssl, r);
5155 			if(want == SSL_ERROR_ZERO_RETURN) {
5156 				return NGHTTP2_ERR_EOF;
5157 			} else if(want == SSL_ERROR_WANT_READ) {
5158 				return NGHTTP2_ERR_WOULDBLOCK;
5159 			} else if(want == SSL_ERROR_WANT_WRITE) {
5160 				h2_session->c->ssl_shake_state = comm_ssl_shake_hs_write;
5161 				comm_point_listen_for_rw(h2_session->c, 0, 1);
5162 				return NGHTTP2_ERR_WOULDBLOCK;
5163 			} else if(want == SSL_ERROR_SYSCALL) {
5164 #ifdef ECONNRESET
5165 				if(errno == ECONNRESET && verbosity < 2)
5166 					return NGHTTP2_ERR_CALLBACK_FAILURE;
5167 #endif
5168 				if(errno != 0)
5169 					log_err("SSL_read syscall: %s",
5170 						strerror(errno));
5171 				return NGHTTP2_ERR_CALLBACK_FAILURE;
5172 			}
5173 			log_crypto_err_io("could not SSL_read", want);
5174 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5175 		}
5176 		return r;
5177 	}
5178 #endif /* HAVE_SSL */
5179 
5180 	ret = recv(h2_session->c->fd, buf, len, MSG_DONTWAIT);
5181 	if(ret == 0) {
5182 		return NGHTTP2_ERR_EOF;
5183 	} else if(ret < 0) {
5184 #ifndef USE_WINSOCK
5185 		if(errno == EINTR || errno == EAGAIN)
5186 			return NGHTTP2_ERR_WOULDBLOCK;
5187 #ifdef ECONNRESET
5188 		if(errno == ECONNRESET && verbosity < 2)
5189 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5190 #endif
5191 		log_err_addr("could not http2 recv: %s", strerror(errno),
5192 			&h2_session->c->repinfo.remote_addr,
5193 			h2_session->c->repinfo.remote_addrlen);
5194 #else /* USE_WINSOCK */
5195 		if(WSAGetLastError() == WSAECONNRESET)
5196 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5197 		if(WSAGetLastError() == WSAEINPROGRESS)
5198 			return NGHTTP2_ERR_WOULDBLOCK;
5199 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
5200 			ub_winsock_tcp_wouldblock(h2_session->c->ev->ev,
5201 				UB_EV_READ);
5202 			return NGHTTP2_ERR_WOULDBLOCK;
5203 		}
5204 		log_err_addr("could not http2 recv: %s",
5205 			wsa_strerror(WSAGetLastError()),
5206 			&h2_session->c->repinfo.remote_addr,
5207 			h2_session->c->repinfo.remote_addrlen);
5208 #endif
5209 		return NGHTTP2_ERR_CALLBACK_FAILURE;
5210 	}
5211 	return ret;
5212 }
5213 #endif /* HAVE_NGHTTP2 */
5214 
/** Handle http2 read: let nghttp2 consume incoming data.
 * returns 0 if the connection has to be closed, 1 to keep it.
 * Without HAVE_NGHTTP2 it always returns 0. */
static int
comm_point_http2_handle_read(int ATTR_UNUSED(fd), struct comm_point* c)
{
#ifdef HAVE_NGHTTP2
	int rv;
	log_assert(c->h2_session);

	/* reading until recv cb returns NGHTTP2_ERR_WOULDBLOCK */
	rv = nghttp2_session_recv(c->h2_session->session);
	if(rv != 0) {
		/* EOF and callback failure are not logged here */
		if(!(rv == NGHTTP2_ERR_EOF ||
			rv == NGHTTP2_ERR_CALLBACK_FAILURE)) {
			char peer[256];
			addr_to_str(&c->repinfo.remote_addr,
				c->repinfo.remote_addrlen, peer, sizeof(peer));
			verbose(VERB_QUERY, "http2: session_recv from %s failed, "
				"error: %s", peer, nghttp2_strerror(rv));
		}
		return 0;
	}

	if(nghttp2_session_want_write(c->h2_session->session)) {
		/* nghttp2 has data to send: switch to writing */
		c->tcp_is_reading = 0;
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
		return 1;
	}
	if(!nghttp2_session_want_read(c->h2_session->session)) {
		/* connection can be closed */
		return 0;
	}
	return 1;
#else
	(void)c;
	return 0;
#endif
}
5248 
5249 /**
5250  * Handle http reading callback.
5251  * @param fd: file descriptor of socket.
5252  * @param c: comm point to read from into buffer.
5253  * @return: 0 on error
5254  */
5255 static int
5256 comm_point_http_handle_read(int fd, struct comm_point* c)
5257 {
5258 	log_assert(c->type == comm_http);
5259 	log_assert(fd != -1);
5260 
5261 	/* if we are in ssl handshake, handle SSL handshake */
5262 #ifdef HAVE_SSL
5263 	if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) {
5264 		if(!ssl_handshake(c))
5265 			return 0;
5266 		if(c->ssl_shake_state != comm_ssl_shake_none)
5267 			return 1;
5268 	}
5269 #endif /* HAVE_SSL */
5270 
5271 	if(!c->tcp_is_reading)
5272 		return 1;
5273 
5274 	if(c->use_h2) {
5275 		return comm_point_http2_handle_read(fd, c);
5276 	}
5277 
5278 	/* http version is <= http/1.1 */
5279 
5280 	if(c->http_min_version >= http_version_2) {
5281 		/* HTTP/2 failed, not allowed to use lower version. */
5282 		return 0;
5283 	}
5284 
5285 	/* read more data */
5286 	if(c->ssl) {
5287 		if(!ssl_http_read_more(c))
5288 			return 0;
5289 	} else {
5290 		if(!http_read_more(fd, c))
5291 			return 0;
5292 	}
5293 
5294 	if(c->http_stored >= sldns_buffer_position(c->buffer)) {
5295 		/* read did not work but we wanted more data, there is
5296 		 * no bytes to process now. */
5297 		return 1;
5298 	}
5299 	sldns_buffer_flip(c->buffer);
5300 	/* if we are partway in a segment of data, position us at the point
5301 	 * where we left off previously */
5302 	if(c->http_stored < sldns_buffer_limit(c->buffer))
5303 		sldns_buffer_set_position(c->buffer, c->http_stored);
5304 	else	sldns_buffer_set_position(c->buffer, sldns_buffer_limit(c->buffer));
5305 
5306 	while(sldns_buffer_remaining(c->buffer) > 0) {
5307 		/* Handle HTTP/1.x data */
5308 		/* if we are reading headers, read more headers */
5309 		if(c->http_in_headers || c->http_in_chunk_headers) {
5310 			/* if header is done, process the header */
5311 			if(!http_header_done(c->buffer)) {
5312 				/* copy remaining data to front of buffer
5313 				 * and set rest for writing into it */
5314 				http_moveover_buffer(c->buffer);
5315 				/* return and wait to read more */
5316 				return 1;
5317 			}
5318 			if(!c->http_in_chunk_headers) {
5319 				/* process initial headers */
5320 				if(!http_process_initial_header(c))
5321 					return 0;
5322 			} else {
5323 				/* process chunk headers */
5324 				int r = http_process_chunk_header(c);
5325 				if(r == 0) return 0;
5326 				if(r == 2) return 1; /* done */
5327 				/* r == 1, continue */
5328 			}
5329 			/* see if we have more to process */
5330 			continue;
5331 		}
5332 
5333 		if(!c->http_is_chunked) {
5334 			/* if we are reading nonchunks, process that*/
5335 			return http_nonchunk_segment(c);
5336 		} else {
5337 			/* if we are reading chunks, read the chunk */
5338 			int r = http_chunked_segment(c);
5339 			if(r == 0) return 0;
5340 			if(r == 1) return 1;
5341 			continue;
5342 		}
5343 	}
5344 	/* broke out of the loop; could not process header instead need
5345 	 * to read more */
5346 	/* moveover any remaining data and read more data */
5347 	http_moveover_buffer(c->buffer);
5348 	/* return and wait to read more */
5349 	return 1;
5350 }
5351 
5352 /** check pending connect for http */
5353 static int
5354 http_check_connect(int fd, struct comm_point* c)
5355 {
5356 	/* check for pending error from nonblocking connect */
5357 	/* from Stevens, unix network programming, vol1, 3rd ed, p450*/
5358 	int error = 0;
5359 	socklen_t len = (socklen_t)sizeof(error);
5360 	if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error,
5361 		&len) < 0){
5362 #ifndef USE_WINSOCK
5363 		error = errno; /* on solaris errno is error */
5364 #else /* USE_WINSOCK */
5365 		error = WSAGetLastError();
5366 #endif
5367 	}
5368 #ifndef USE_WINSOCK
5369 #if defined(EINPROGRESS) && defined(EWOULDBLOCK)
5370 	if(error == EINPROGRESS || error == EWOULDBLOCK)
5371 		return 1; /* try again later */
5372 	else
5373 #endif
5374 	if(error != 0 && verbosity < 2)
5375 		return 0; /* silence lots of chatter in the logs */
5376 	else if(error != 0) {
5377 		log_err_addr("http connect", strerror(error),
5378 			&c->repinfo.remote_addr, c->repinfo.remote_addrlen);
5379 #else /* USE_WINSOCK */
5380 	/* examine error */
5381 	if(error == WSAEINPROGRESS)
5382 		return 1;
5383 	else if(error == WSAEWOULDBLOCK) {
5384 		ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
5385 		return 1;
5386 	} else if(error != 0 && verbosity < 2)
5387 		return 0;
5388 	else if(error != 0) {
5389 		log_err_addr("http connect", wsa_strerror(error),
5390 			&c->repinfo.remote_addr, c->repinfo.remote_addrlen);
5391 #endif /* USE_WINSOCK */
5392 		return 0;
5393 	}
5394 	/* keep on processing this socket */
5395 	return 2;
5396 }
5397 
/** write more http data from c->buffer over ssl.
 * returns 0 on close or error, 1 if data was written or the write has
 * to be retried later. Without HAVE_SSL it always returns 0. */
static int
ssl_http_write_more(struct comm_point* c)
{
#ifdef HAVE_SSL
	int nwritten;
	int sslerr;
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	nwritten = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(nwritten > 0) {
		/* advance past the part that was written */
		sldns_buffer_skip(c->buffer, (ssize_t)nwritten);
		return 1;
	}

	sslerr = SSL_get_error(c->ssl, nwritten);
	switch(sslerr) {
	case SSL_ERROR_ZERO_RETURN:
		/* closed */
		return 0;
	case SSL_ERROR_WANT_READ:
		/* ssl wants to read first; wait for read condition */
		c->ssl_shake_state = comm_ssl_shake_hs_read;
		comm_point_listen_for_rw(c, 1, 0);
		return 1;
	case SSL_ERROR_WANT_WRITE:
		/* write more later */
		return 1;
	case SSL_ERROR_SYSCALL:
#ifdef EPIPE
		/* silence 'broken pipe' */
		if(errno == EPIPE && verbosity < 2)
			return 0;
#endif
		if(errno != 0)
			log_err("SSL_write syscall: %s",
				strerror(errno));
		return 0;
	default:
		break;
	}
	log_crypto_err_io("could not SSL_write", sslerr);
	return 0;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
5438 
5439 /** write more data for http */
5440 static int
5441 http_write_more(int fd, struct comm_point* c)
5442 {
5443 	ssize_t r;
5444 	log_assert(sldns_buffer_remaining(c->buffer) > 0);
5445 	r = send(fd, (void*)sldns_buffer_current(c->buffer),
5446 		sldns_buffer_remaining(c->buffer), 0);
5447 	if(r == -1) {
5448 #ifndef USE_WINSOCK
5449 		if(errno == EINTR || errno == EAGAIN)
5450 			return 1;
5451 #else
5452 		if(WSAGetLastError() == WSAEINPROGRESS)
5453 			return 1;
5454 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
5455 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
5456 			return 1;
5457 		}
5458 #endif
5459 		log_err_addr("http send r", sock_strerror(errno),
5460 			&c->repinfo.remote_addr, c->repinfo.remote_addrlen);
5461 		return 0;
5462 	}
5463 	sldns_buffer_skip(c->buffer, r);
5464 	return 1;
5465 }
5466 
5467 #ifdef HAVE_NGHTTP2
5468 ssize_t http2_send_cb(nghttp2_session* ATTR_UNUSED(session), const uint8_t* buf,
5469 	size_t len, int ATTR_UNUSED(flags), void* cb_arg)
5470 {
5471 	ssize_t ret;
5472 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
5473 	log_assert(h2_session->c->type == comm_http);
5474 	log_assert(h2_session->c->h2_session);
5475 
5476 #ifdef HAVE_SSL
5477 	if(h2_session->c->ssl) {
5478 		int r;
5479 		ERR_clear_error();
5480 		r = SSL_write(h2_session->c->ssl, buf, len);
5481 		if(r <= 0) {
5482 			int want = SSL_get_error(h2_session->c->ssl, r);
5483 			if(want == SSL_ERROR_ZERO_RETURN) {
5484 				return NGHTTP2_ERR_CALLBACK_FAILURE;
5485 			} else if(want == SSL_ERROR_WANT_READ) {
5486 				h2_session->c->ssl_shake_state = comm_ssl_shake_hs_read;
5487 				comm_point_listen_for_rw(h2_session->c, 1, 0);
5488 				return NGHTTP2_ERR_WOULDBLOCK;
5489 			} else if(want == SSL_ERROR_WANT_WRITE) {
5490 				return NGHTTP2_ERR_WOULDBLOCK;
5491 			} else if(want == SSL_ERROR_SYSCALL) {
5492 #ifdef EPIPE
5493 				if(errno == EPIPE && verbosity < 2)
5494 					return NGHTTP2_ERR_CALLBACK_FAILURE;
5495 #endif
5496 				if(errno != 0)
5497 					log_err("SSL_write syscall: %s",
5498 						strerror(errno));
5499 				return NGHTTP2_ERR_CALLBACK_FAILURE;
5500 			}
5501 			log_crypto_err_io("could not SSL_write", want);
5502 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5503 		}
5504 		return r;
5505 	}
5506 #endif /* HAVE_SSL */
5507 
5508 	ret = send(h2_session->c->fd, buf, len, 0);
5509 	if(ret == 0) {
5510 		return NGHTTP2_ERR_CALLBACK_FAILURE;
5511 	} else if(ret < 0) {
5512 #ifndef USE_WINSOCK
5513 		if(errno == EINTR || errno == EAGAIN)
5514 			return NGHTTP2_ERR_WOULDBLOCK;
5515 #ifdef EPIPE
5516 		if(errno == EPIPE && verbosity < 2)
5517 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5518 #endif
5519 #ifdef ECONNRESET
5520 		if(errno == ECONNRESET && verbosity < 2)
5521 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5522 #endif
5523 		log_err_addr("could not http2 write: %s", strerror(errno),
5524 			&h2_session->c->repinfo.remote_addr,
5525 			h2_session->c->repinfo.remote_addrlen);
5526 #else /* USE_WINSOCK */
5527 		if(WSAGetLastError() == WSAENOTCONN)
5528 			return NGHTTP2_ERR_WOULDBLOCK;
5529 		if(WSAGetLastError() == WSAEINPROGRESS)
5530 			return NGHTTP2_ERR_WOULDBLOCK;
5531 		if(WSAGetLastError() == WSAEWOULDBLOCK) {
5532 			ub_winsock_tcp_wouldblock(h2_session->c->ev->ev,
5533 				UB_EV_WRITE);
5534 			return NGHTTP2_ERR_WOULDBLOCK;
5535 		}
5536 		if(WSAGetLastError() == WSAECONNRESET && verbosity < 2)
5537 			return NGHTTP2_ERR_CALLBACK_FAILURE;
5538 		log_err_addr("could not http2 write: %s",
5539 			wsa_strerror(WSAGetLastError()),
5540 			&h2_session->c->repinfo.remote_addr,
5541 			h2_session->c->repinfo.remote_addrlen);
5542 #endif
5543 		return NGHTTP2_ERR_CALLBACK_FAILURE;
5544 	}
5545 	return ret;
5546 }
5547 #endif /* HAVE_NGHTTP2 */
5548 
/** Handle http2 writing: let nghttp2 send its pending data.
 * returns 0 if the connection has to be closed, 1 to keep it.
 * Without HAVE_NGHTTP2 it always returns 0. */
static int
comm_point_http2_handle_write(int ATTR_UNUSED(fd), struct comm_point* c)
{
#ifdef HAVE_NGHTTP2
	int rv;
	log_assert(c->h2_session);

	rv = nghttp2_session_send(c->h2_session->session);
	if(rv != 0) {
		verbose(VERB_QUERY, "http2: session_send failed, "
			"error: %s", nghttp2_strerror(rv));
		return 0;
	}

	if(nghttp2_session_want_read(c->h2_session->session)) {
		/* nghttp2 expects input: switch to reading */
		c->tcp_is_reading = 1;
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
		return 1;
	}
	if(!nghttp2_session_want_write(c->h2_session->session)) {
		/* connection can be closed */
		return 0;
	}
	return 1;
#else
	(void)c;
	return 0;
#endif
}
5576 
5577 /**
5578  * Handle http writing callback.
5579  * @param fd: file descriptor of socket.
5580  * @param c: comm point to write buffer out of.
5581  * @return: 0 on error
5582  */
5583 static int
5584 comm_point_http_handle_write(int fd, struct comm_point* c)
5585 {
5586 	log_assert(c->type == comm_http);
5587 	log_assert(fd != -1);
5588 
5589 	/* check pending connect errors, if that fails, we wait for more,
5590 	 * or we can continue to write contents */
5591 	if(c->tcp_check_nb_connect) {
5592 		int r = http_check_connect(fd, c);
5593 		if(r == 0) return 0;
5594 		if(r == 1) return 1;
5595 		c->tcp_check_nb_connect = 0;
5596 	}
5597 	/* if we are in ssl handshake, handle SSL handshake */
5598 #ifdef HAVE_SSL
5599 	if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) {
5600 		if(!ssl_handshake(c))
5601 			return 0;
5602 		if(c->ssl_shake_state != comm_ssl_shake_none)
5603 			return 1;
5604 	}
5605 #endif /* HAVE_SSL */
5606 	if(c->tcp_is_reading)
5607 		return 1;
5608 
5609 	if(c->use_h2) {
5610 		return comm_point_http2_handle_write(fd, c);
5611 	}
5612 
5613 	/* http version is <= http/1.1 */
5614 
5615 	if(c->http_min_version >= http_version_2) {
5616 		/* HTTP/2 failed, not allowed to use lower version. */
5617 		return 0;
5618 	}
5619 
5620 	/* if we are writing, write more */
5621 	if(c->ssl) {
5622 		if(!ssl_http_write_more(c))
5623 			return 0;
5624 	} else {
5625 		if(!http_write_more(fd, c))
5626 			return 0;
5627 	}
5628 
5629 	/* we write a single buffer contents, that can contain
5630 	 * the http request, and then flip to read the results */
5631 	/* see if write is done */
5632 	if(sldns_buffer_remaining(c->buffer) == 0) {
5633 		sldns_buffer_clear(c->buffer);
5634 		if(c->tcp_do_toggle_rw)
5635 			c->tcp_is_reading = 1;
5636 		c->tcp_byte_count = 0;
5637 		/* switch from listening(write) to listening(read) */
5638 		comm_point_stop_listening(c);
5639 		comm_point_start_listening(c, -1, -1);
5640 	}
5641 	return 1;
5642 }
5643 
5644 void
5645 comm_point_http_handle_callback(int fd, short event, void* arg)
5646 {
5647 	struct comm_point* c = (struct comm_point*)arg;
5648 	log_assert(c->type == comm_http);
5649 	ub_comm_base_now(c->ev->base);
5650 
5651 	if(event&UB_EV_TIMEOUT) {
5652 		verbose(VERB_QUERY, "http took too long, dropped");
5653 		reclaim_http_handler(c);
5654 		if(!c->tcp_do_close) {
5655 			fptr_ok(fptr_whitelist_comm_point(c->callback));
5656 			(void)(*c->callback)(c, c->cb_arg,
5657 				NETEVENT_TIMEOUT, NULL);
5658 		}
5659 		return;
5660 	}
5661 	if(event&UB_EV_READ) {
5662 		if(!comm_point_http_handle_read(fd, c)) {
5663 			reclaim_http_handler(c);
5664 			if(!c->tcp_do_close) {
5665 				fptr_ok(fptr_whitelist_comm_point(
5666 					c->callback));
5667 				(void)(*c->callback)(c, c->cb_arg,
5668 					NETEVENT_CLOSED, NULL);
5669 			}
5670 		}
5671 		return;
5672 	}
5673 	if(event&UB_EV_WRITE) {
5674 		if(!comm_point_http_handle_write(fd, c)) {
5675 			reclaim_http_handler(c);
5676 			if(!c->tcp_do_close) {
5677 				fptr_ok(fptr_whitelist_comm_point(
5678 					c->callback));
5679 				(void)(*c->callback)(c, c->cb_arg,
5680 					NETEVENT_CLOSED, NULL);
5681 			}
5682 		}
5683 		return;
5684 	}
5685 	log_err("Ignored event %d for httphdl.", event);
5686 }
5687 
5688 void comm_point_local_handle_callback(int fd, short event, void* arg)
5689 {
5690 	struct comm_point* c = (struct comm_point*)arg;
5691 	log_assert(c->type == comm_local);
5692 	ub_comm_base_now(c->ev->base);
5693 
5694 	if(event&UB_EV_READ) {
5695 		if(!comm_point_tcp_handle_read(fd, c, 1)) {
5696 			fptr_ok(fptr_whitelist_comm_point(c->callback));
5697 			(void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED,
5698 				NULL);
5699 		}
5700 		return;
5701 	}
5702 	log_err("Ignored event %d for localhdl.", event);
5703 }
5704 
5705 void comm_point_raw_handle_callback(int ATTR_UNUSED(fd),
5706 	short event, void* arg)
5707 {
5708 	struct comm_point* c = (struct comm_point*)arg;
5709 	int err = NETEVENT_NOERROR;
5710 	log_assert(c->type == comm_raw);
5711 	ub_comm_base_now(c->ev->base);
5712 
5713 	if(event&UB_EV_TIMEOUT)
5714 		err = NETEVENT_TIMEOUT;
5715 	fptr_ok(fptr_whitelist_comm_point_raw(c->callback));
5716 	(void)(*c->callback)(c, c->cb_arg, err, NULL);
5717 }
5718 
5719 struct comm_point*
5720 comm_point_create_udp(struct comm_base *base, int fd, sldns_buffer* buffer,
5721 	int pp2_enabled, comm_point_callback_type* callback,
5722 	void* callback_arg, struct unbound_socket* socket)
5723 {
5724 	struct comm_point* c = (struct comm_point*)calloc(1,
5725 		sizeof(struct comm_point));
5726 	short evbits;
5727 	if(!c)
5728 		return NULL;
5729 	c->ev = (struct internal_event*)calloc(1,
5730 		sizeof(struct internal_event));
5731 	if(!c->ev) {
5732 		free(c);
5733 		return NULL;
5734 	}
5735 	c->ev->base = base;
5736 	c->fd = fd;
5737 	c->buffer = buffer;
5738 	c->timeout = NULL;
5739 	c->tcp_is_reading = 0;
5740 	c->tcp_byte_count = 0;
5741 	c->tcp_parent = NULL;
5742 	c->max_tcp_count = 0;
5743 	c->cur_tcp_count = 0;
5744 	c->tcp_handlers = NULL;
5745 	c->tcp_free = NULL;
5746 	c->type = comm_udp;
5747 	c->tcp_do_close = 0;
5748 	c->do_not_close = 0;
5749 	c->tcp_do_toggle_rw = 0;
5750 	c->tcp_check_nb_connect = 0;
5751 #ifdef USE_MSG_FASTOPEN
5752 	c->tcp_do_fastopen = 0;
5753 #endif
5754 #ifdef USE_DNSCRYPT
5755 	c->dnscrypt = 0;
5756 	c->dnscrypt_buffer = buffer;
5757 #endif
5758 	c->inuse = 0;
5759 	c->callback = callback;
5760 	c->cb_arg = callback_arg;
5761 	c->socket = socket;
5762 	c->pp2_enabled = pp2_enabled;
5763 	c->pp2_header_state = pp2_header_none;
5764 	evbits = UB_EV_READ | UB_EV_PERSIST;
5765 	/* ub_event stuff */
5766 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
5767 		comm_point_udp_callback, c);
5768 	if(c->ev->ev == NULL) {
5769 		log_err("could not baseset udp event");
5770 		comm_point_delete(c);
5771 		return NULL;
5772 	}
5773 	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
5774 		log_err("could not add udp event");
5775 		comm_point_delete(c);
5776 		return NULL;
5777 	}
5778 	c->event_added = 1;
5779 	return c;
5780 }
5781 
#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
/**
 * Create a comm point of type comm_udp that uses the
 * comm_point_udp_ancil_callback as its event callback.
 * The buffer pointer is stored in the comm point, it is not copied. A
 * persistent read event is created for the fd; it is added to the event
 * base only if fd is not -1.
 * @param base: comm base, stored in the comm point, its event base is
 *	used for the event.
 * @param fd: the socket, or -1.
 * @param buffer: buffer stored as the comm point buffer.
 * @param pp2_enabled: stored as the pp2_enabled value of the comm point.
 * @param callback: user callback.
 * @param callback_arg: user argument for the callback.
 * @param socket: stored as the socket of the comm point.
 * @return the comm point, or NULL on failure. If creating or adding the
 *	event fails the comm point is removed with comm_point_delete.
 */
struct comm_point*
comm_point_create_udp_ancil(struct comm_base *base, int fd,
	sldns_buffer* buffer, int pp2_enabled,
	comm_point_callback_type* callback, void* callback_arg, struct unbound_socket* socket)
{
	struct comm_point* c = (struct comm_point*)calloc(1, sizeof(*c));
	short evbits = UB_EV_READ | UB_EV_PERSIST;
	if(c == NULL)
		return NULL;
	c->ev = (struct internal_event*)calloc(1, sizeof(*c->ev));
	if(c->ev == NULL) {
		free(c);
		return NULL;
	}

	/* what the comm point is and what it works on */
	c->type = comm_udp;
	c->ev->base = base;
	c->fd = fd;
	c->socket = socket;
	c->buffer = buffer;
	c->timeout = NULL;
	c->inuse = 0;

	/* the user callback */
	c->callback = callback;
	c->cb_arg = callback_arg;

	/* the tcp members are not used for udp */
	c->tcp_parent = NULL;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	c->tcp_do_close = 0;
	c->do_not_close = 0;
	c->tcp_do_toggle_rw = 0;
	c->tcp_check_nb_connect = 0;
#ifdef USE_MSG_FASTOPEN
	c->tcp_do_fastopen = 0;
#endif
#ifdef USE_DNSCRYPT
	c->dnscrypt = 0;
	c->dnscrypt_buffer = buffer;
#endif
	c->pp2_enabled = pp2_enabled;
	c->pp2_header_state = pp2_header_none;

	/* ub_event stuff */
	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
		comm_point_udp_ancil_callback, c);
	if(!c->ev->ev) {
		log_err("could not baseset udp event");
		comm_point_delete(c);
		return NULL;
	}
	if(fd != -1 && ub_event_add(c->ev->ev, c->timeout) != 0) {
		log_err("could not add udp event");
		comm_point_delete(c);
		return NULL;
	}
	/* note that this is also set if fd is -1 and nothing was added */
	c->event_added = 1;
	return c;
}
#endif
5846 
5847 struct comm_point*
5848 comm_point_create_doq(struct comm_base *base, int fd, sldns_buffer* buffer,
5849 	comm_point_callback_type* callback, void* callback_arg,
5850 	struct unbound_socket* socket, struct doq_table* table,
5851 	struct ub_randstate* rnd, const void* quic_sslctx,
5852 	struct config_file* cfg)
5853 {
5854 #ifdef HAVE_NGTCP2
5855 	struct comm_point* c = (struct comm_point*)calloc(1,
5856 		sizeof(struct comm_point));
5857 	short evbits;
5858 	if(!c)
5859 		return NULL;
5860 	c->ev = (struct internal_event*)calloc(1,
5861 		sizeof(struct internal_event));
5862 	if(!c->ev) {
5863 		free(c);
5864 		return NULL;
5865 	}
5866 	c->ev->base = base;
5867 	c->fd = fd;
5868 	c->buffer = buffer;
5869 	c->timeout = NULL;
5870 	c->tcp_is_reading = 0;
5871 	c->tcp_byte_count = 0;
5872 	c->tcp_parent = NULL;
5873 	c->max_tcp_count = 0;
5874 	c->cur_tcp_count = 0;
5875 	c->tcp_handlers = NULL;
5876 	c->tcp_free = NULL;
5877 	c->type = comm_doq;
5878 	c->tcp_do_close = 0;
5879 	c->do_not_close = 0;
5880 	c->tcp_do_toggle_rw = 0;
5881 	c->tcp_check_nb_connect = 0;
5882 #ifdef USE_MSG_FASTOPEN
5883 	c->tcp_do_fastopen = 0;
5884 #endif
5885 #ifdef USE_DNSCRYPT
5886 	c->dnscrypt = 0;
5887 	c->dnscrypt_buffer = NULL;
5888 #endif
5889 	c->doq_socket = doq_server_socket_create(table, rnd, quic_sslctx, c,
5890 		base, cfg);
5891 	if(!c->doq_socket) {
5892 		log_err("could not create doq comm_point");
5893 		comm_point_delete(c);
5894 		return NULL;
5895 	}
5896 	c->inuse = 0;
5897 	c->callback = callback;
5898 	c->cb_arg = callback_arg;
5899 	c->socket = socket;
5900 	c->pp2_enabled = 0;
5901 	c->pp2_header_state = pp2_header_none;
5902 	evbits = UB_EV_READ | UB_EV_PERSIST;
5903 	/* ub_event stuff */
5904 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
5905 		comm_point_doq_callback, c);
5906 	if(c->ev->ev == NULL) {
5907 		log_err("could not baseset udp event");
5908 		comm_point_delete(c);
5909 		return NULL;
5910 	}
5911 	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
5912 		log_err("could not add udp event");
5913 		comm_point_delete(c);
5914 		return NULL;
5915 	}
5916 	c->event_added = 1;
5917 	return c;
5918 #else
5919 	/* no libngtcp2, so no QUIC support */
5920 	(void)base;
5921 	(void)buffer;
5922 	(void)callback;
5923 	(void)callback_arg;
5924 	(void)socket;
5925 	(void)rnd;
5926 	(void)table;
5927 	(void)quic_sslctx;
5928 	(void)cfg;
5929 	sock_close(fd);
5930 	return NULL;
5931 #endif /* HAVE_NGTCP2 */
5932 }
5933 
5934 static struct comm_point*
5935 comm_point_create_tcp_handler(struct comm_base *base,
5936 	struct comm_point* parent, size_t bufsize,
5937 	struct sldns_buffer* spoolbuf, comm_point_callback_type* callback,
5938 	void* callback_arg, struct unbound_socket* socket)
5939 {
5940 	struct comm_point* c = (struct comm_point*)calloc(1,
5941 		sizeof(struct comm_point));
5942 	short evbits;
5943 	if(!c)
5944 		return NULL;
5945 	c->ev = (struct internal_event*)calloc(1,
5946 		sizeof(struct internal_event));
5947 	if(!c->ev) {
5948 		free(c);
5949 		return NULL;
5950 	}
5951 	c->ev->base = base;
5952 	c->fd = -1;
5953 	c->buffer = sldns_buffer_new(bufsize);
5954 	if(!c->buffer) {
5955 		free(c->ev);
5956 		free(c);
5957 		return NULL;
5958 	}
5959 	c->timeout = (struct timeval*)malloc(sizeof(struct timeval));
5960 	if(!c->timeout) {
5961 		sldns_buffer_free(c->buffer);
5962 		free(c->ev);
5963 		free(c);
5964 		return NULL;
5965 	}
5966 	c->tcp_is_reading = 0;
5967 	c->tcp_byte_count = 0;
5968 	c->tcp_parent = parent;
5969 	c->tcp_timeout_msec = parent->tcp_timeout_msec;
5970 	c->tcp_conn_limit = parent->tcp_conn_limit;
5971 	c->tcl_addr = NULL;
5972 	c->tcp_keepalive = 0;
5973 	c->max_tcp_count = 0;
5974 	c->cur_tcp_count = 0;
5975 	c->tcp_handlers = NULL;
5976 	c->tcp_free = NULL;
5977 	c->type = comm_tcp;
5978 	c->tcp_do_close = 0;
5979 	c->do_not_close = 0;
5980 	c->tcp_do_toggle_rw = 1;
5981 	c->tcp_check_nb_connect = 0;
5982 #ifdef USE_MSG_FASTOPEN
5983 	c->tcp_do_fastopen = 0;
5984 #endif
5985 #ifdef USE_DNSCRYPT
5986 	c->dnscrypt = 0;
5987 	/* We don't know just yet if this is a dnscrypt channel. Allocation
5988 	 * will be done when handling the callback. */
5989 	c->dnscrypt_buffer = c->buffer;
5990 #endif
5991 	c->repinfo.c = c;
5992 	c->callback = callback;
5993 	c->cb_arg = callback_arg;
5994 	c->socket = socket;
5995 	c->pp2_enabled = parent->pp2_enabled;
5996 	c->pp2_header_state = pp2_header_none;
5997 	if(spoolbuf) {
5998 		c->tcp_req_info = tcp_req_info_create(spoolbuf);
5999 		if(!c->tcp_req_info) {
6000 			log_err("could not create tcp commpoint");
6001 			sldns_buffer_free(c->buffer);
6002 			free(c->timeout);
6003 			free(c->ev);
6004 			free(c);
6005 			return NULL;
6006 		}
6007 		c->tcp_req_info->cp = c;
6008 		c->tcp_do_close = 1;
6009 		c->tcp_do_toggle_rw = 0;
6010 	}
6011 	/* add to parent free list */
6012 	c->tcp_free = parent->tcp_free;
6013 	parent->tcp_free = c;
6014 	/* ub_event stuff */
6015 	evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT;
6016 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6017 		comm_point_tcp_handle_callback, c);
6018 	if(c->ev->ev == NULL)
6019 	{
6020 		log_err("could not basetset tcphdl event");
6021 		parent->tcp_free = c->tcp_free;
6022 		tcp_req_info_delete(c->tcp_req_info);
6023 		sldns_buffer_free(c->buffer);
6024 		free(c->timeout);
6025 		free(c->ev);
6026 		free(c);
6027 		return NULL;
6028 	}
6029 	return c;
6030 }
6031 
6032 static struct comm_point*
6033 comm_point_create_http_handler(struct comm_base *base,
6034 	struct comm_point* parent, size_t bufsize, int harden_large_queries,
6035 	uint32_t http_max_streams, char* http_endpoint,
6036 	comm_point_callback_type* callback, void* callback_arg,
6037 	struct unbound_socket* socket)
6038 {
6039 	struct comm_point* c = (struct comm_point*)calloc(1,
6040 		sizeof(struct comm_point));
6041 	short evbits;
6042 	if(!c)
6043 		return NULL;
6044 	c->ev = (struct internal_event*)calloc(1,
6045 		sizeof(struct internal_event));
6046 	if(!c->ev) {
6047 		free(c);
6048 		return NULL;
6049 	}
6050 	c->ev->base = base;
6051 	c->fd = -1;
6052 	c->buffer = sldns_buffer_new(bufsize);
6053 	if(!c->buffer) {
6054 		free(c->ev);
6055 		free(c);
6056 		return NULL;
6057 	}
6058 	c->timeout = (struct timeval*)malloc(sizeof(struct timeval));
6059 	if(!c->timeout) {
6060 		sldns_buffer_free(c->buffer);
6061 		free(c->ev);
6062 		free(c);
6063 		return NULL;
6064 	}
6065 	c->tcp_is_reading = 0;
6066 	c->tcp_byte_count = 0;
6067 	c->tcp_parent = parent;
6068 	c->tcp_timeout_msec = parent->tcp_timeout_msec;
6069 	c->tcp_conn_limit = parent->tcp_conn_limit;
6070 	c->tcl_addr = NULL;
6071 	c->tcp_keepalive = 0;
6072 	c->max_tcp_count = 0;
6073 	c->cur_tcp_count = 0;
6074 	c->tcp_handlers = NULL;
6075 	c->tcp_free = NULL;
6076 	c->type = comm_http;
6077 	c->tcp_do_close = 1;
6078 	c->do_not_close = 0;
6079 	c->tcp_do_toggle_rw = 1; /* will be set to 0 after http2 upgrade */
6080 	c->tcp_check_nb_connect = 0;
6081 #ifdef USE_MSG_FASTOPEN
6082 	c->tcp_do_fastopen = 0;
6083 #endif
6084 #ifdef USE_DNSCRYPT
6085 	c->dnscrypt = 0;
6086 	c->dnscrypt_buffer = NULL;
6087 #endif
6088 	c->repinfo.c = c;
6089 	c->callback = callback;
6090 	c->cb_arg = callback_arg;
6091 	c->socket = socket;
6092 	c->pp2_enabled = 0;
6093 	c->pp2_header_state = pp2_header_none;
6094 
6095 	c->http_min_version = http_version_2;
6096 	c->http2_stream_max_qbuffer_size = bufsize;
6097 	if(harden_large_queries && bufsize > 512)
6098 		c->http2_stream_max_qbuffer_size = 512;
6099 	c->http2_max_streams = http_max_streams;
6100 	if(!(c->http_endpoint = strdup(http_endpoint))) {
6101 		log_err("could not strdup http_endpoint");
6102 		sldns_buffer_free(c->buffer);
6103 		free(c->timeout);
6104 		free(c->ev);
6105 		free(c);
6106 		return NULL;
6107 	}
6108 	c->use_h2 = 0;
6109 #ifdef HAVE_NGHTTP2
6110 	if(!(c->h2_session = http2_session_create(c))) {
6111 		log_err("could not create http2 session");
6112 		free(c->http_endpoint);
6113 		sldns_buffer_free(c->buffer);
6114 		free(c->timeout);
6115 		free(c->ev);
6116 		free(c);
6117 		return NULL;
6118 	}
6119 	if(!(c->h2_session->callbacks = http2_req_callbacks_create())) {
6120 		log_err("could not create http2 callbacks");
6121 		http2_session_delete(c->h2_session);
6122 		free(c->http_endpoint);
6123 		sldns_buffer_free(c->buffer);
6124 		free(c->timeout);
6125 		free(c->ev);
6126 		free(c);
6127 		return NULL;
6128 	}
6129 #endif
6130 
6131 	/* add to parent free list */
6132 	c->tcp_free = parent->tcp_free;
6133 	parent->tcp_free = c;
6134 	/* ub_event stuff */
6135 	evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT;
6136 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6137 		comm_point_http_handle_callback, c);
6138 	if(c->ev->ev == NULL)
6139 	{
6140 		log_err("could not set http handler event");
6141 		parent->tcp_free = c->tcp_free;
6142 		http2_session_delete(c->h2_session);
6143 		sldns_buffer_free(c->buffer);
6144 		free(c->timeout);
6145 		free(c->ev);
6146 		free(c);
6147 		return NULL;
6148 	}
6149 	return c;
6150 }
6151 
6152 struct comm_point*
6153 comm_point_create_tcp(struct comm_base *base, int fd, int num,
6154 	int idle_timeout, int harden_large_queries,
6155 	uint32_t http_max_streams, char* http_endpoint,
6156 	struct tcl_list* tcp_conn_limit, size_t bufsize,
6157 	struct sldns_buffer* spoolbuf, enum listen_type port_type,
6158 	int pp2_enabled, comm_point_callback_type* callback,
6159 	void* callback_arg, struct unbound_socket* socket)
6160 {
6161 	struct comm_point* c = (struct comm_point*)calloc(1,
6162 		sizeof(struct comm_point));
6163 	short evbits;
6164 	int i;
6165 	/* first allocate the TCP accept listener */
6166 	if(!c)
6167 		return NULL;
6168 	c->ev = (struct internal_event*)calloc(1,
6169 		sizeof(struct internal_event));
6170 	if(!c->ev) {
6171 		free(c);
6172 		return NULL;
6173 	}
6174 	c->ev->base = base;
6175 	c->fd = fd;
6176 	c->buffer = NULL;
6177 	c->timeout = NULL;
6178 	c->tcp_is_reading = 0;
6179 	c->tcp_byte_count = 0;
6180 	c->tcp_timeout_msec = idle_timeout;
6181 	c->tcp_conn_limit = tcp_conn_limit;
6182 	c->tcl_addr = NULL;
6183 	c->tcp_keepalive = 0;
6184 	c->tcp_parent = NULL;
6185 	c->max_tcp_count = num;
6186 	c->cur_tcp_count = 0;
6187 	c->tcp_handlers = (struct comm_point**)calloc((size_t)num,
6188 		sizeof(struct comm_point*));
6189 	if(!c->tcp_handlers) {
6190 		free(c->ev);
6191 		free(c);
6192 		return NULL;
6193 	}
6194 	c->tcp_free = NULL;
6195 	c->type = comm_tcp_accept;
6196 	c->tcp_do_close = 0;
6197 	c->do_not_close = 0;
6198 	c->tcp_do_toggle_rw = 0;
6199 	c->tcp_check_nb_connect = 0;
6200 #ifdef USE_MSG_FASTOPEN
6201 	c->tcp_do_fastopen = 0;
6202 #endif
6203 #ifdef USE_DNSCRYPT
6204 	c->dnscrypt = 0;
6205 	c->dnscrypt_buffer = NULL;
6206 #endif
6207 	c->callback = NULL;
6208 	c->cb_arg = NULL;
6209 	c->socket = socket;
6210 	c->pp2_enabled = (port_type==listen_type_http?0:pp2_enabled);
6211 	c->pp2_header_state = pp2_header_none;
6212 	evbits = UB_EV_READ | UB_EV_PERSIST;
6213 	/* ub_event stuff */
6214 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6215 		comm_point_tcp_accept_callback, c);
6216 	if(c->ev->ev == NULL) {
6217 		log_err("could not baseset tcpacc event");
6218 		comm_point_delete(c);
6219 		return NULL;
6220 	}
6221 	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
6222 		log_err("could not add tcpacc event");
6223 		comm_point_delete(c);
6224 		return NULL;
6225 	}
6226 	c->event_added = 1;
6227 	/* now prealloc the handlers */
6228 	for(i=0; i<num; i++) {
6229 		if(port_type == listen_type_tcp ||
6230 			port_type == listen_type_ssl ||
6231 			port_type == listen_type_tcp_dnscrypt) {
6232 			c->tcp_handlers[i] = comm_point_create_tcp_handler(base,
6233 				c, bufsize, spoolbuf, callback, callback_arg, socket);
6234 		} else if(port_type == listen_type_http) {
6235 			c->tcp_handlers[i] = comm_point_create_http_handler(
6236 				base, c, bufsize, harden_large_queries,
6237 				http_max_streams, http_endpoint,
6238 				callback, callback_arg, socket);
6239 		}
6240 		else {
6241 			log_err("could not create tcp handler, unknown listen "
6242 				"type");
6243 			return NULL;
6244 		}
6245 		if(!c->tcp_handlers[i]) {
6246 			comm_point_delete(c);
6247 			return NULL;
6248 		}
6249 	}
6250 
6251 	return c;
6252 }
6253 
6254 struct comm_point*
6255 comm_point_create_tcp_out(struct comm_base *base, size_t bufsize,
6256         comm_point_callback_type* callback, void* callback_arg)
6257 {
6258 	struct comm_point* c = (struct comm_point*)calloc(1,
6259 		sizeof(struct comm_point));
6260 	short evbits;
6261 	if(!c)
6262 		return NULL;
6263 	c->ev = (struct internal_event*)calloc(1,
6264 		sizeof(struct internal_event));
6265 	if(!c->ev) {
6266 		free(c);
6267 		return NULL;
6268 	}
6269 	c->ev->base = base;
6270 	c->fd = -1;
6271 	c->buffer = sldns_buffer_new(bufsize);
6272 	if(!c->buffer) {
6273 		free(c->ev);
6274 		free(c);
6275 		return NULL;
6276 	}
6277 	c->timeout = NULL;
6278 	c->tcp_is_reading = 0;
6279 	c->tcp_byte_count = 0;
6280 	c->tcp_timeout_msec = TCP_QUERY_TIMEOUT;
6281 	c->tcp_conn_limit = NULL;
6282 	c->tcl_addr = NULL;
6283 	c->tcp_keepalive = 0;
6284 	c->tcp_parent = NULL;
6285 	c->max_tcp_count = 0;
6286 	c->cur_tcp_count = 0;
6287 	c->tcp_handlers = NULL;
6288 	c->tcp_free = NULL;
6289 	c->type = comm_tcp;
6290 	c->tcp_do_close = 0;
6291 	c->do_not_close = 0;
6292 	c->tcp_do_toggle_rw = 1;
6293 	c->tcp_check_nb_connect = 1;
6294 #ifdef USE_MSG_FASTOPEN
6295 	c->tcp_do_fastopen = 1;
6296 #endif
6297 #ifdef USE_DNSCRYPT
6298 	c->dnscrypt = 0;
6299 	c->dnscrypt_buffer = c->buffer;
6300 #endif
6301 	c->repinfo.c = c;
6302 	c->callback = callback;
6303 	c->cb_arg = callback_arg;
6304 	c->pp2_enabled = 0;
6305 	c->pp2_header_state = pp2_header_none;
6306 	evbits = UB_EV_PERSIST | UB_EV_WRITE;
6307 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6308 		comm_point_tcp_handle_callback, c);
6309 	if(c->ev->ev == NULL)
6310 	{
6311 		log_err("could not baseset tcpout event");
6312 		sldns_buffer_free(c->buffer);
6313 		free(c->ev);
6314 		free(c);
6315 		return NULL;
6316 	}
6317 
6318 	return c;
6319 }
6320 
6321 struct comm_point*
6322 comm_point_create_http_out(struct comm_base *base, size_t bufsize,
6323         comm_point_callback_type* callback, void* callback_arg,
6324 	sldns_buffer* temp)
6325 {
6326 	struct comm_point* c = (struct comm_point*)calloc(1,
6327 		sizeof(struct comm_point));
6328 	short evbits;
6329 	if(!c)
6330 		return NULL;
6331 	c->ev = (struct internal_event*)calloc(1,
6332 		sizeof(struct internal_event));
6333 	if(!c->ev) {
6334 		free(c);
6335 		return NULL;
6336 	}
6337 	c->ev->base = base;
6338 	c->fd = -1;
6339 	c->buffer = sldns_buffer_new(bufsize);
6340 	if(!c->buffer) {
6341 		free(c->ev);
6342 		free(c);
6343 		return NULL;
6344 	}
6345 	c->timeout = NULL;
6346 	c->tcp_is_reading = 0;
6347 	c->tcp_byte_count = 0;
6348 	c->tcp_parent = NULL;
6349 	c->max_tcp_count = 0;
6350 	c->cur_tcp_count = 0;
6351 	c->tcp_handlers = NULL;
6352 	c->tcp_free = NULL;
6353 	c->type = comm_http;
6354 	c->tcp_do_close = 0;
6355 	c->do_not_close = 0;
6356 	c->tcp_do_toggle_rw = 1;
6357 	c->tcp_check_nb_connect = 1;
6358 	c->http_in_headers = 1;
6359 	c->http_in_chunk_headers = 0;
6360 	c->http_is_chunked = 0;
6361 	c->http_temp = temp;
6362 #ifdef USE_MSG_FASTOPEN
6363 	c->tcp_do_fastopen = 1;
6364 #endif
6365 #ifdef USE_DNSCRYPT
6366 	c->dnscrypt = 0;
6367 	c->dnscrypt_buffer = c->buffer;
6368 #endif
6369 	c->repinfo.c = c;
6370 	c->callback = callback;
6371 	c->cb_arg = callback_arg;
6372 	c->pp2_enabled = 0;
6373 	c->pp2_header_state = pp2_header_none;
6374 	evbits = UB_EV_PERSIST | UB_EV_WRITE;
6375 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6376 		comm_point_http_handle_callback, c);
6377 	if(c->ev->ev == NULL)
6378 	{
6379 		log_err("could not baseset tcpout event");
6380 #ifdef HAVE_SSL
6381 		SSL_free(c->ssl);
6382 #endif
6383 		sldns_buffer_free(c->buffer);
6384 		free(c->ev);
6385 		free(c);
6386 		return NULL;
6387 	}
6388 
6389 	return c;
6390 }
6391 
6392 struct comm_point*
6393 comm_point_create_local(struct comm_base *base, int fd, size_t bufsize,
6394         comm_point_callback_type* callback, void* callback_arg)
6395 {
6396 	struct comm_point* c = (struct comm_point*)calloc(1,
6397 		sizeof(struct comm_point));
6398 	short evbits;
6399 	if(!c)
6400 		return NULL;
6401 	c->ev = (struct internal_event*)calloc(1,
6402 		sizeof(struct internal_event));
6403 	if(!c->ev) {
6404 		free(c);
6405 		return NULL;
6406 	}
6407 	c->ev->base = base;
6408 	c->fd = fd;
6409 	c->buffer = sldns_buffer_new(bufsize);
6410 	if(!c->buffer) {
6411 		free(c->ev);
6412 		free(c);
6413 		return NULL;
6414 	}
6415 	c->timeout = NULL;
6416 	c->tcp_is_reading = 1;
6417 	c->tcp_byte_count = 0;
6418 	c->tcp_parent = NULL;
6419 	c->max_tcp_count = 0;
6420 	c->cur_tcp_count = 0;
6421 	c->tcp_handlers = NULL;
6422 	c->tcp_free = NULL;
6423 	c->type = comm_local;
6424 	c->tcp_do_close = 0;
6425 	c->do_not_close = 1;
6426 	c->tcp_do_toggle_rw = 0;
6427 	c->tcp_check_nb_connect = 0;
6428 #ifdef USE_MSG_FASTOPEN
6429 	c->tcp_do_fastopen = 0;
6430 #endif
6431 #ifdef USE_DNSCRYPT
6432 	c->dnscrypt = 0;
6433 	c->dnscrypt_buffer = c->buffer;
6434 #endif
6435 	c->callback = callback;
6436 	c->cb_arg = callback_arg;
6437 	c->pp2_enabled = 0;
6438 	c->pp2_header_state = pp2_header_none;
6439 	/* ub_event stuff */
6440 	evbits = UB_EV_PERSIST | UB_EV_READ;
6441 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6442 		comm_point_local_handle_callback, c);
6443 	if(c->ev->ev == NULL) {
6444 		log_err("could not baseset localhdl event");
6445 		free(c->ev);
6446 		free(c);
6447 		return NULL;
6448 	}
6449 	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
6450 		log_err("could not add localhdl event");
6451 		ub_event_free(c->ev->ev);
6452 		free(c->ev);
6453 		free(c);
6454 		return NULL;
6455 	}
6456 	c->event_added = 1;
6457 	return c;
6458 }
6459 
6460 struct comm_point*
6461 comm_point_create_raw(struct comm_base* base, int fd, int writing,
6462 	comm_point_callback_type* callback, void* callback_arg)
6463 {
6464 	struct comm_point* c = (struct comm_point*)calloc(1,
6465 		sizeof(struct comm_point));
6466 	short evbits;
6467 	if(!c)
6468 		return NULL;
6469 	c->ev = (struct internal_event*)calloc(1,
6470 		sizeof(struct internal_event));
6471 	if(!c->ev) {
6472 		free(c);
6473 		return NULL;
6474 	}
6475 	c->ev->base = base;
6476 	c->fd = fd;
6477 	c->buffer = NULL;
6478 	c->timeout = NULL;
6479 	c->tcp_is_reading = 0;
6480 	c->tcp_byte_count = 0;
6481 	c->tcp_parent = NULL;
6482 	c->max_tcp_count = 0;
6483 	c->cur_tcp_count = 0;
6484 	c->tcp_handlers = NULL;
6485 	c->tcp_free = NULL;
6486 	c->type = comm_raw;
6487 	c->tcp_do_close = 0;
6488 	c->do_not_close = 1;
6489 	c->tcp_do_toggle_rw = 0;
6490 	c->tcp_check_nb_connect = 0;
6491 #ifdef USE_MSG_FASTOPEN
6492 	c->tcp_do_fastopen = 0;
6493 #endif
6494 #ifdef USE_DNSCRYPT
6495 	c->dnscrypt = 0;
6496 	c->dnscrypt_buffer = c->buffer;
6497 #endif
6498 	c->callback = callback;
6499 	c->cb_arg = callback_arg;
6500 	c->pp2_enabled = 0;
6501 	c->pp2_header_state = pp2_header_none;
6502 	/* ub_event stuff */
6503 	if(writing)
6504 		evbits = UB_EV_PERSIST | UB_EV_WRITE;
6505 	else 	evbits = UB_EV_PERSIST | UB_EV_READ;
6506 	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
6507 		comm_point_raw_handle_callback, c);
6508 	if(c->ev->ev == NULL) {
6509 		log_err("could not baseset rawhdl event");
6510 		free(c->ev);
6511 		free(c);
6512 		return NULL;
6513 	}
6514 	if (ub_event_add(c->ev->ev, c->timeout) != 0) {
6515 		log_err("could not add rawhdl event");
6516 		ub_event_free(c->ev->ev);
6517 		free(c->ev);
6518 		free(c);
6519 		return NULL;
6520 	}
6521 	c->event_added = 1;
6522 	return c;
6523 }
6524 
6525 void
6526 comm_point_close(struct comm_point* c)
6527 {
6528 	if(!c)
6529 		return;
6530 	if(c->fd != -1) {
6531 		verbose(5, "comm_point_close of %d: event_del", c->fd);
6532 		if(c->event_added) {
6533 			if(ub_event_del(c->ev->ev) != 0) {
6534 				log_err("could not event_del on close");
6535 			}
6536 			c->event_added = 0;
6537 		}
6538 	}
6539 	tcl_close_connection(c->tcl_addr);
6540 	if(c->tcp_req_info)
6541 		tcp_req_info_clear(c->tcp_req_info);
6542 	if(c->h2_session)
6543 		http2_session_server_delete(c->h2_session);
6544 	/* stop the comm point from reading or writing after it is closed. */
6545 	if(c->tcp_more_read_again && *c->tcp_more_read_again)
6546 		*c->tcp_more_read_again = 0;
6547 	if(c->tcp_more_write_again && *c->tcp_more_write_again)
6548 		*c->tcp_more_write_again = 0;
6549 
6550 	/* close fd after removing from event lists, or epoll.. is messed up */
6551 	if(c->fd != -1 && !c->do_not_close) {
6552 #ifdef USE_WINSOCK
6553 		if(c->type == comm_tcp || c->type == comm_http) {
6554 			/* delete sticky events for the fd, it gets closed */
6555 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
6556 			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
6557 		}
6558 #endif
6559 		verbose(VERB_ALGO, "close fd %d", c->fd);
6560 		sock_close(c->fd);
6561 	}
6562 	c->fd = -1;
6563 }
6564 
6565 void
6566 comm_point_delete(struct comm_point* c)
6567 {
6568 	if(!c)
6569 		return;
6570 	if((c->type == comm_tcp || c->type == comm_http) && c->ssl) {
6571 #ifdef HAVE_SSL
6572 		SSL_shutdown(c->ssl);
6573 		SSL_free(c->ssl);
6574 #endif
6575 	}
6576 	if(c->type == comm_http && c->http_endpoint) {
6577 		free(c->http_endpoint);
6578 		c->http_endpoint = NULL;
6579 	}
6580 	comm_point_close(c);
6581 	if(c->tcp_handlers) {
6582 		int i;
6583 		for(i=0; i<c->max_tcp_count; i++)
6584 			comm_point_delete(c->tcp_handlers[i]);
6585 		free(c->tcp_handlers);
6586 	}
6587 	free(c->timeout);
6588 	if(c->type == comm_tcp || c->type == comm_local || c->type == comm_http) {
6589 		sldns_buffer_free(c->buffer);
6590 #ifdef USE_DNSCRYPT
6591 		if(c->dnscrypt && c->dnscrypt_buffer != c->buffer) {
6592 			sldns_buffer_free(c->dnscrypt_buffer);
6593 		}
6594 #endif
6595 		if(c->tcp_req_info) {
6596 			tcp_req_info_delete(c->tcp_req_info);
6597 		}
6598 		if(c->h2_session) {
6599 			http2_session_delete(c->h2_session);
6600 		}
6601 	}
6602 #ifdef HAVE_NGTCP2
6603 	if(c->doq_socket)
6604 		doq_server_socket_delete(c->doq_socket);
6605 #endif
6606 	ub_event_free(c->ev->ev);
6607 	free(c->ev);
6608 	free(c);
6609 }
6610 
#ifdef USE_DNSTAP
/**
 * Log the local and the client address at verbosity VERB_ALGO and send
 * the client response message to dnstap.
 * @param dtenv: dnstap environment.
 * @param addr: local address.
 * @param addrlen: length of addr.
 * @param client_addr: address of the client.
 * @param client_addrlen: length of client_addr.
 * @param type: type of the comm point.
 * @param ssl: the ssl member of the comm point.
 * @param buffer: buffer that is passed to dnstap as the message.
 */
static void
send_reply_dnstap(struct dt_env* dtenv,
	struct sockaddr* addr, socklen_t addrlen,
	struct sockaddr_storage* client_addr, socklen_t client_addrlen,
	enum comm_point_type type, void* ssl, sldns_buffer* buffer)
{
	struct sockaddr_storage* local_addr = (struct sockaddr_storage*)addr;

	log_addr(VERB_ALGO, "from local addr", (void*)addr, addrlen);
	log_addr(VERB_ALGO, "response to client", client_addr, client_addrlen);
	dt_msg_send_client_response(dtenv, client_addr, local_addr, type,
		ssl, buffer);
}
#endif
6624 
6625 void
6626 comm_point_send_reply(struct comm_reply *repinfo)
6627 {
6628 	struct sldns_buffer* buffer;
6629 	log_assert(repinfo && repinfo->c);
6630 #ifdef USE_DNSCRYPT
6631 	buffer = repinfo->c->dnscrypt_buffer;
6632 	if(!dnsc_handle_uncurved_request(repinfo)) {
6633 		return;
6634 	}
6635 #else
6636 	buffer = repinfo->c->buffer;
6637 #endif
6638 	if(repinfo->c->type == comm_udp) {
6639 		if(repinfo->srctype)
6640 			comm_point_send_udp_msg_if(repinfo->c, buffer,
6641 				(struct sockaddr*)&repinfo->remote_addr,
6642 				repinfo->remote_addrlen, repinfo);
6643 		else
6644 			comm_point_send_udp_msg(repinfo->c, buffer,
6645 				(struct sockaddr*)&repinfo->remote_addr,
6646 				repinfo->remote_addrlen, 0);
6647 #ifdef USE_DNSTAP
6648 		/*
6649 		 * sending src (client)/dst (local service) addresses over
6650 		 * DNSTAP from udp callback
6651 		 */
6652 		if(repinfo->c->dtenv != NULL && repinfo->c->dtenv->log_client_response_messages) {
6653 			send_reply_dnstap(repinfo->c->dtenv,
6654 				repinfo->c->socket->addr,
6655 				repinfo->c->socket->addrlen,
6656 				&repinfo->client_addr, repinfo->client_addrlen,
6657 				repinfo->c->type, repinfo->c->ssl,
6658 				repinfo->c->buffer);
6659 		}
6660 #endif
6661 	} else {
6662 #ifdef USE_DNSTAP
6663 		struct dt_env* dtenv =
6664 #ifdef HAVE_NGTCP2
6665 			repinfo->c->doq_socket
6666 			?repinfo->c->dtenv:
6667 #endif
6668 			repinfo->c->tcp_parent->dtenv;
6669 		struct sldns_buffer* dtbuffer = repinfo->c->tcp_req_info
6670 			?repinfo->c->tcp_req_info->spool_buffer
6671 			:repinfo->c->buffer;
6672 #ifdef USE_DNSCRYPT
6673 		if(repinfo->c->dnscrypt && repinfo->is_dnscrypted)
6674 			dtbuffer = repinfo->c->buffer;
6675 #endif
6676 		/*
6677 		 * sending src (client)/dst (local service) addresses over
6678 		 * DNSTAP from other callbacks
6679 		 */
6680 		if(dtenv != NULL && dtenv->log_client_response_messages) {
6681 			send_reply_dnstap(dtenv,
6682 				repinfo->c->socket->addr,
6683 				repinfo->c->socket->addrlen,
6684 				&repinfo->client_addr, repinfo->client_addrlen,
6685 				repinfo->c->type, repinfo->c->ssl,
6686 				dtbuffer);
6687 		}
6688 #endif
6689 		if(repinfo->c->tcp_req_info) {
6690 			tcp_req_info_send_reply(repinfo->c->tcp_req_info);
6691 		} else if(repinfo->c->use_h2) {
6692 			if(!http2_submit_dns_response(repinfo->c->h2_session)) {
6693 				comm_point_drop_reply(repinfo);
6694 				return;
6695 			}
6696 			repinfo->c->h2_stream = NULL;
6697 			repinfo->c->tcp_is_reading = 0;
6698 			comm_point_stop_listening(repinfo->c);
6699 			comm_point_start_listening(repinfo->c, -1,
6700 				adjusted_tcp_timeout(repinfo->c));
6701 			return;
6702 #ifdef HAVE_NGTCP2
6703 		} else if(repinfo->c->doq_socket) {
6704 			doq_socket_send_reply(repinfo);
6705 #endif
6706 		} else {
6707 			comm_point_start_listening(repinfo->c, -1,
6708 				adjusted_tcp_timeout(repinfo->c));
6709 		}
6710 	}
6711 }
6712 
6713 void
6714 comm_point_drop_reply(struct comm_reply* repinfo)
6715 {
6716 	if(!repinfo)
6717 		return;
6718 	log_assert(repinfo->c);
6719 	log_assert(repinfo->c->type != comm_tcp_accept);
6720 	if(repinfo->c->type == comm_udp)
6721 		return;
6722 	if(repinfo->c->tcp_req_info)
6723 		repinfo->c->tcp_req_info->is_drop = 1;
6724 	if(repinfo->c->type == comm_http) {
6725 		if(repinfo->c->h2_session) {
6726 			repinfo->c->h2_session->is_drop = 1;
6727 			if(!repinfo->c->h2_session->postpone_drop)
6728 				reclaim_http_handler(repinfo->c);
6729 			return;
6730 		}
6731 		reclaim_http_handler(repinfo->c);
6732 		return;
6733 #ifdef HAVE_NGTCP2
6734 	} else if(repinfo->c->doq_socket) {
6735 		doq_socket_drop_reply(repinfo);
6736 		return;
6737 #endif
6738 	}
6739 	reclaim_tcp_handler(repinfo->c);
6740 }
6741 
6742 void
6743 comm_point_stop_listening(struct comm_point* c)
6744 {
6745 	verbose(VERB_ALGO, "comm point stop listening %d", c->fd);
6746 	if(c->event_added) {
6747 		if(ub_event_del(c->ev->ev) != 0) {
6748 			log_err("event_del error to stoplisten");
6749 		}
6750 		c->event_added = 0;
6751 	}
6752 }
6753 
6754 void
6755 comm_point_start_listening(struct comm_point* c, int newfd, int msec)
6756 {
6757 	verbose(VERB_ALGO, "comm point start listening %d (%d msec)",
6758 		c->fd==-1?newfd:c->fd, msec);
6759 	if(c->type == comm_tcp_accept && !c->tcp_free) {
6760 		/* no use to start listening no free slots. */
6761 		return;
6762 	}
6763 	if(c->event_added) {
6764 		if(ub_event_del(c->ev->ev) != 0) {
6765 			log_err("event_del error to startlisten");
6766 		}
6767 		c->event_added = 0;
6768 	}
6769 	if(msec != -1 && msec != 0) {
6770 		if(!c->timeout) {
6771 			c->timeout = (struct timeval*)malloc(sizeof(
6772 				struct timeval));
6773 			if(!c->timeout) {
6774 				log_err("cpsl: malloc failed. No net read.");
6775 				return;
6776 			}
6777 		}
6778 		ub_event_add_bits(c->ev->ev, UB_EV_TIMEOUT);
6779 #ifndef S_SPLINT_S /* splint fails on struct timeval. */
6780 		c->timeout->tv_sec = msec/1000;
6781 		c->timeout->tv_usec = (msec%1000)*1000;
6782 #endif /* S_SPLINT_S */
6783 	} else {
6784 		if(msec == 0 || !c->timeout) {
6785 			ub_event_del_bits(c->ev->ev, UB_EV_TIMEOUT);
6786 		}
6787 	}
6788 	if(c->type == comm_tcp || c->type == comm_http) {
6789 		ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE);
6790 		if(c->tcp_write_and_read) {
6791 			verbose(5, "startlistening %d mode rw", (newfd==-1?c->fd:newfd));
6792 			ub_event_add_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE);
6793 		} else if(c->tcp_is_reading) {
6794 			verbose(5, "startlistening %d mode r", (newfd==-1?c->fd:newfd));
6795 			ub_event_add_bits(c->ev->ev, UB_EV_READ);
6796 		} else	{
6797 			verbose(5, "startlistening %d mode w", (newfd==-1?c->fd:newfd));
6798 			ub_event_add_bits(c->ev->ev, UB_EV_WRITE);
6799 		}
6800 	}
6801 	if(newfd != -1) {
6802 		if(c->fd != -1 && c->fd != newfd) {
6803 			verbose(5, "cpsl close of fd %d for %d", c->fd, newfd);
6804 			sock_close(c->fd);
6805 		}
6806 		c->fd = newfd;
6807 		ub_event_set_fd(c->ev->ev, c->fd);
6808 	}
6809 	if(ub_event_add(c->ev->ev, msec==0?NULL:c->timeout) != 0) {
6810 		log_err("event_add failed. in cpsl.");
6811 		return;
6812 	}
6813 	c->event_added = 1;
6814 }
6815 
6816 void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr)
6817 {
6818 	verbose(VERB_ALGO, "comm point listen_for_rw %d %d", c->fd, wr);
6819 	if(c->event_added) {
6820 		if(ub_event_del(c->ev->ev) != 0) {
6821 			log_err("event_del error to cplf");
6822 		}
6823 		c->event_added = 0;
6824 	}
6825 	if(!c->timeout) {
6826 		ub_event_del_bits(c->ev->ev, UB_EV_TIMEOUT);
6827 	}
6828 	ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE);
6829 	if(rd) ub_event_add_bits(c->ev->ev, UB_EV_READ);
6830 	if(wr) ub_event_add_bits(c->ev->ev, UB_EV_WRITE);
6831 	if(ub_event_add(c->ev->ev, c->timeout) != 0) {
6832 		log_err("event_add failed. in cplf.");
6833 		return;
6834 	}
6835 	c->event_added = 1;
6836 }
6837 
6838 size_t comm_point_get_mem(struct comm_point* c)
6839 {
6840 	size_t s;
6841 	if(!c)
6842 		return 0;
6843 	s = sizeof(*c) + sizeof(*c->ev);
6844 	if(c->timeout)
6845 		s += sizeof(*c->timeout);
6846 	if(c->type == comm_tcp || c->type == comm_local) {
6847 		s += sizeof(*c->buffer) + sldns_buffer_capacity(c->buffer);
6848 #ifdef USE_DNSCRYPT
6849 		s += sizeof(*c->dnscrypt_buffer);
6850 		if(c->buffer != c->dnscrypt_buffer) {
6851 			s += sldns_buffer_capacity(c->dnscrypt_buffer);
6852 		}
6853 #endif
6854 	}
6855 	if(c->type == comm_tcp_accept) {
6856 		int i;
6857 		for(i=0; i<c->max_tcp_count; i++)
6858 			s += comm_point_get_mem(c->tcp_handlers[i]);
6859 	}
6860 	return s;
6861 }
6862 
6863 struct comm_timer*
6864 comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg)
6865 {
6866 	struct internal_timer *tm = (struct internal_timer*)calloc(1,
6867 		sizeof(struct internal_timer));
6868 	if(!tm) {
6869 		log_err("malloc failed");
6870 		return NULL;
6871 	}
6872 	tm->super.ev_timer = tm;
6873 	tm->base = base;
6874 	tm->super.callback = cb;
6875 	tm->super.cb_arg = cb_arg;
6876 	tm->ev = ub_event_new(base->eb->base, -1, UB_EV_TIMEOUT,
6877 		comm_timer_callback, &tm->super);
6878 	if(tm->ev == NULL) {
6879 		log_err("timer_create: event_base_set failed.");
6880 		free(tm);
6881 		return NULL;
6882 	}
6883 	return &tm->super;
6884 }
6885 
6886 void
6887 comm_timer_disable(struct comm_timer* timer)
6888 {
6889 	if(!timer)
6890 		return;
6891 	ub_timer_del(timer->ev_timer->ev);
6892 	timer->ev_timer->enabled = 0;
6893 }
6894 
6895 void
6896 comm_timer_set(struct comm_timer* timer, struct timeval* tv)
6897 {
6898 	log_assert(tv);
6899 	if(timer->ev_timer->enabled)
6900 		comm_timer_disable(timer);
6901 	if(ub_timer_add(timer->ev_timer->ev, timer->ev_timer->base->eb->base,
6902 		comm_timer_callback, timer, tv) != 0)
6903 		log_err("comm_timer_set: evtimer_add failed.");
6904 	timer->ev_timer->enabled = 1;
6905 }
6906 
6907 void
6908 comm_timer_delete(struct comm_timer* timer)
6909 {
6910 	if(!timer)
6911 		return;
6912 	comm_timer_disable(timer);
6913 	/* Free the sub struct timer->ev_timer derived from the super struct timer.
6914 	 * i.e. assert(timer == timer->ev_timer)
6915 	 */
6916 	ub_event_free(timer->ev_timer->ev);
6917 	free(timer->ev_timer);
6918 }
6919 
6920 void
6921 comm_timer_callback(int ATTR_UNUSED(fd), short event, void* arg)
6922 {
6923 	struct comm_timer* tm = (struct comm_timer*)arg;
6924 	if(!(event&UB_EV_TIMEOUT))
6925 		return;
6926 	ub_comm_base_now(tm->ev_timer->base);
6927 	tm->ev_timer->enabled = 0;
6928 	fptr_ok(fptr_whitelist_comm_timer(tm->callback));
6929 	(*tm->callback)(tm->cb_arg);
6930 }
6931 
6932 int
6933 comm_timer_is_set(struct comm_timer* timer)
6934 {
6935 	return (int)timer->ev_timer->enabled;
6936 }
6937 
6938 size_t
6939 comm_timer_get_mem(struct comm_timer* timer)
6940 {
6941 	if(!timer) return 0;
6942 	return sizeof(struct internal_timer);
6943 }
6944 
6945 struct comm_signal*
6946 comm_signal_create(struct comm_base* base,
6947         void (*callback)(int, void*), void* cb_arg)
6948 {
6949 	struct comm_signal* com = (struct comm_signal*)malloc(
6950 		sizeof(struct comm_signal));
6951 	if(!com) {
6952 		log_err("malloc failed");
6953 		return NULL;
6954 	}
6955 	com->base = base;
6956 	com->callback = callback;
6957 	com->cb_arg = cb_arg;
6958 	com->ev_signal = NULL;
6959 	return com;
6960 }
6961 
6962 void
6963 comm_signal_callback(int sig, short event, void* arg)
6964 {
6965 	struct comm_signal* comsig = (struct comm_signal*)arg;
6966 	if(!(event & UB_EV_SIGNAL))
6967 		return;
6968 	ub_comm_base_now(comsig->base);
6969 	fptr_ok(fptr_whitelist_comm_signal(comsig->callback));
6970 	(*comsig->callback)(sig, comsig->cb_arg);
6971 }
6972 
6973 int
6974 comm_signal_bind(struct comm_signal* comsig, int sig)
6975 {
6976 	struct internal_signal* entry = (struct internal_signal*)calloc(1,
6977 		sizeof(struct internal_signal));
6978 	if(!entry) {
6979 		log_err("malloc failed");
6980 		return 0;
6981 	}
6982 	log_assert(comsig);
6983 	/* add signal event */
6984 	entry->ev = ub_signal_new(comsig->base->eb->base, sig,
6985 		comm_signal_callback, comsig);
6986 	if(entry->ev == NULL) {
6987 		log_err("Could not create signal event");
6988 		free(entry);
6989 		return 0;
6990 	}
6991 	if(ub_signal_add(entry->ev, NULL) != 0) {
6992 		log_err("Could not add signal handler");
6993 		ub_event_free(entry->ev);
6994 		free(entry);
6995 		return 0;
6996 	}
6997 	/* link into list */
6998 	entry->next = comsig->ev_signal;
6999 	comsig->ev_signal = entry;
7000 	return 1;
7001 }
7002 
7003 void
7004 comm_signal_delete(struct comm_signal* comsig)
7005 {
7006 	struct internal_signal* p, *np;
7007 	if(!comsig)
7008 		return;
7009 	p=comsig->ev_signal;
7010 	while(p) {
7011 		np = p->next;
7012 		ub_signal_del(p->ev);
7013 		ub_event_free(p->ev);
7014 		free(p);
7015 		p = np;
7016 	}
7017 	free(comsig);
7018 }
7019