xref: /freebsd/sys/net/debugnet.c (revision e37bb444aa945ed0725766e986698a09bd61b1b2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/endian.h>
40 #include <sys/errno.h>
41 #include <sys/eventhandler.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 
48 #ifdef DDB
49 #include <ddb/ddb.h>
50 #include <ddb/db_lex.h>
51 #endif
52 
53 #include <net/ethernet.h>
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/if_dl.h>
57 #include <net/if_types.h>
58 #include <net/if_var.h>
59 #include <net/if_private.h>
60 #include <net/vnet.h>
61 #include <net/route.h>
62 #include <net/route/nhop.h>
63 
64 #include <netinet/in.h>
65 #include <netinet/in_fib.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/in_var.h>
68 #include <netinet/ip.h>
69 #include <netinet/ip_var.h>
70 #include <netinet/ip_options.h>
71 #include <netinet/udp.h>
72 #include <netinet/udp_var.h>
73 
74 #include <machine/in_cksum.h>
75 #include <machine/pcb.h>
76 
77 #include <net/debugnet.h>
78 #define	DEBUGNET_INTERNAL
79 #include <net/debugnet_int.h>
80 
81 FEATURE(debugnet, "Debugnet support");
82 
83 SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
84     "debugnet parameters");
85 
86 unsigned debugnet_debug;
87 SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
88     &debugnet_debug, 0,
89     "Debug message verbosity (0: off; 1: on; 2: verbose)");
90 
91 int debugnet_npolls = 2000;
92 SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
93     &debugnet_npolls, 0,
94     "Number of times to poll before assuming packet loss (0.5ms per poll)");
95 int debugnet_nretries = 10;
96 SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
97     &debugnet_nretries, 0,
98     "Number of retransmit attempts before giving up");
99 int debugnet_fib = RT_DEFAULT_FIB;
100 SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
101     &debugnet_fib, 0,
102     "Fib to use when sending dump");
103 
104 static bool g_debugnet_pcb_inuse;
105 static struct debugnet_pcb g_dnet_pcb;
106 
107 /*
108  * Simple accessors for opaque PCB.
109  */
110 const unsigned char *
111 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
112 {
113 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
114 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
115 	return (pcb->dp_gw_mac.octet);
116 }
117 
118 /*
119  * Start of network primitives, beginning with output primitives.
120  */
121 
122 /*
123  * Handles creation of the ethernet header, then places outgoing packets into
124  * the tx buffer for the NIC
125  *
126  * Parameters:
127  *	m	The mbuf containing the packet to be sent (will be freed by
128  *		this function or the NIC driver)
129  *	ifp	The interface to send on
130  *	dst	The destination ethernet address (source address will be looked
131  *		up using ifp)
132  *	etype	The ETHERTYPE_* value for the protocol that is being sent
133  *
134  * Returns:
135  *	int	see errno.h, 0 for success
136  */
137 int
138 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
139     u_short etype)
140 {
141 	struct ether_header *eh;
142 
143 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
144 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
145 		if_printf(ifp, "%s: interface isn't up\n", __func__);
146 		m_freem(m);
147 		return (ENETDOWN);
148 	}
149 
150 	/* Fill in the ethernet header. */
151 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
152 	if (m == NULL) {
153 		printf("%s: out of mbufs\n", __func__);
154 		return (ENOBUFS);
155 	}
156 	eh = mtod(m, struct ether_header *);
157 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
158 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
159 	eh->ether_type = htons(etype);
160 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
161 }
162 
163 /*
164  * Unreliable transmission of an mbuf chain to the debugnet server
165  * Note: can't handle fragmentation; fails if the packet is larger than
166  *	 ifp->if_mtu after adding the UDP/IP headers
167  *
168  * Parameters:
169  *	pcb	The debugnet context block
170  *	m	mbuf chain
171  *
172  * Returns:
173  *	int	see errno.h, 0 for success
174  */
175 static int
176 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
177 {
178 	struct udphdr *udp;
179 
180 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
181 
182 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
183 	if (m == NULL) {
184 		printf("%s: out of mbufs\n", __func__);
185 		return (ENOBUFS);
186 	}
187 
188 	udp = mtod(m, void *);
189 	udp->uh_ulen = htons(m->m_pkthdr.len);
190 	/* Use this src port so that the server can connect() the socket */
191 	udp->uh_sport = htons(pcb->dp_client_port);
192 	udp->uh_dport = htons(pcb->dp_server_port);
193 	/* Computed later (protocol-dependent). */
194 	udp->uh_sum = 0;
195 
196 	return (debugnet_ip_output(pcb, m));
197 }
198 
199 int
200 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
201 {
202 	struct debugnet_ack *dn_ack;
203 	struct mbuf *m;
204 
205 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
206 
207 	m = m_gethdr(M_NOWAIT, MT_DATA);
208 	if (m == NULL) {
209 		printf("%s: Out of mbufs\n", __func__);
210 		return (ENOBUFS);
211 	}
212 	m->m_len = sizeof(*dn_ack);
213 	m->m_pkthdr.len = sizeof(*dn_ack);
214 	MH_ALIGN(m, sizeof(*dn_ack));
215 	dn_ack = mtod(m, void *);
216 	dn_ack->da_seqno = seqno;
217 
218 	return (debugnet_udp_output(pcb, m));
219 }
220 
221 /*
222  * Dummy free function for debugnet clusters.
223  */
224 static void
225 debugnet_mbuf_free(struct mbuf *m __unused)
226 {
227 }
228 
229 /*
230  * Construct and reliably send a debugnet packet.  May fail from a resource
231  * shortage or extreme number of unacknowledged retransmissions.  Wait for
232  * an acknowledgement before returning.  Splits packets into chunks small
233  * enough to be sent without fragmentation (looks up the interface MTU)
234  *
235  * Parameters:
236  *	type	debugnet packet type (HERALD, FINISHED, ...)
237  *	data	data
238  *	datalen	data size (bytes)
239  *	auxdata	optional auxiliary information
240  *
241  * Returns:
242  *	int see errno.h, 0 for success
243  */
244 int
245 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
246     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
247 {
248 	struct debugnet_msg_hdr *dn_msg_hdr;
249 	struct mbuf *m, *m2;
250 	uint64_t want_acks;
251 	uint32_t i, pktlen, sent_so_far;
252 	int retries, polls, error;
253 
254 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
255 		return (ECONNRESET);
256 
257 	want_acks = 0;
258 	pcb->dp_rcvd_acks = 0;
259 	retries = 0;
260 
261 retransmit:
262 	/* Chunks can be too big to fit in packets. */
263 	for (i = sent_so_far = 0; sent_so_far < datalen ||
264 	    (i == 0 && datalen == 0); i++) {
265 		pktlen = datalen - sent_so_far;
266 
267 		/* Bound: the interface MTU (assume no IP options). */
268 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
269 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
270 
271 		/*
272 		 * Check if it is retransmitting and this has been ACKed
273 		 * already.
274 		 */
275 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
276 			sent_so_far += pktlen;
277 			continue;
278 		}
279 
280 		/*
281 		 * Get and fill a header mbuf, then chain data as an extended
282 		 * mbuf.
283 		 */
284 		m = m_gethdr(M_NOWAIT, MT_DATA);
285 		if (m == NULL) {
286 			printf("%s: Out of mbufs\n", __func__);
287 			return (ENOBUFS);
288 		}
289 		m->m_len = sizeof(struct debugnet_msg_hdr);
290 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
291 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
292 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
293 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
294 		dn_msg_hdr->mh_type = htonl(type);
295 		dn_msg_hdr->mh_len = htonl(pktlen);
296 
297 		if (auxdata != NULL) {
298 			dn_msg_hdr->mh_offset =
299 			    htobe64(auxdata->dp_offset_start + sent_so_far);
300 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
301 		} else {
302 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
303 			dn_msg_hdr->mh_aux2 = 0;
304 		}
305 
306 		if (pktlen != 0) {
307 			m2 = m_get(M_NOWAIT, MT_DATA);
308 			if (m2 == NULL) {
309 				m_freem(m);
310 				printf("%s: Out of mbufs\n", __func__);
311 				return (ENOBUFS);
312 			}
313 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
314 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
315 			    EXT_DISPOSABLE);
316 			m2->m_len = pktlen;
317 
318 			m_cat(m, m2);
319 			m->m_pkthdr.len += pktlen;
320 		}
321 		error = debugnet_udp_output(pcb, m);
322 		if (error != 0)
323 			return (error);
324 
325 		/* Note that we're waiting for this packet in the bitfield. */
326 		want_acks |= (1 << i);
327 		sent_so_far += pktlen;
328 	}
329 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
330 		printf("Warning: Sent more than %d packets (%d). "
331 		    "Acknowledgements will fail unless the size of "
332 		    "rcvd_acks/want_acks is increased.\n",
333 		    DEBUGNET_MAX_IN_FLIGHT, i);
334 
335 	/*
336 	 * Wait for acks.  A *real* window would speed things up considerably.
337 	 */
338 	polls = 0;
339 	while (pcb->dp_rcvd_acks != want_acks) {
340 		if (polls++ > debugnet_npolls) {
341 			if (retries++ > debugnet_nretries)
342 				return (ETIMEDOUT);
343 			printf(". ");
344 			goto retransmit;
345 		}
346 		debugnet_network_poll(pcb);
347 		DELAY(500);
348 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
349 			return (ECONNRESET);
350 	}
351 	pcb->dp_seqno += i;
352 	return (0);
353 }
354 
355 /*
356  * Network input primitives.
357  */
358 
359 /*
360  * Just introspect the header enough to fire off a seqno ack and validate
361  * length fits.
362  */
363 static void
364 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
365 {
366 	const struct debugnet_msg_hdr *dnh;
367 	struct mbuf *m;
368 	int error;
369 
370 	m = *mb;
371 
372 	if (m->m_pkthdr.len < sizeof(*dnh)) {
373 		DNETDEBUG("ignoring small debugnet_msg packet\n");
374 		return;
375 	}
376 
377 	/* Get ND header. */
378 	if (m->m_len < sizeof(*dnh)) {
379 		m = m_pullup(m, sizeof(*dnh));
380 		*mb = m;
381 		if (m == NULL) {
382 			DNETDEBUG("m_pullup failed\n");
383 			return;
384 		}
385 	}
386 	dnh = mtod(m, const void *);
387 
388 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
389 		DNETDEBUG("Dropping short packet.\n");
390 		return;
391 	}
392 
393 	/*
394 	 * If the issue is transient (ENOBUFS), sender should resend.  If
395 	 * non-transient (like driver objecting to rx -> tx from the same
396 	 * thread), not much else we can do.
397 	 */
398 	error = debugnet_ack_output(pcb, dnh->mh_seqno);
399 	if (error != 0)
400 		return;
401 
402 	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
403 		printf("Remote shut down the connection on us!\n");
404 		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
405 
406 		/*
407 		 * Continue through to the user handler so they are signalled
408 		 * not to wait for further rx.
409 		 */
410 	}
411 
412 	pcb->dp_rx_handler(pcb, mb);
413 }
414 
415 static void
416 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
417 {
418 	const struct debugnet_ack *dn_ack;
419 	struct mbuf *m;
420 	uint32_t rcv_ackno;
421 
422 	m = *mb;
423 
424 	/* Get Ack. */
425 	if (m->m_len < sizeof(*dn_ack)) {
426 		m = m_pullup(m, sizeof(*dn_ack));
427 		*mb = m;
428 		if (m == NULL) {
429 			DNETDEBUG("m_pullup failed\n");
430 			return;
431 		}
432 	}
433 	dn_ack = mtod(m, const void *);
434 
435 	/* Debugnet processing. */
436 	/*
437 	 * Packet is meant for us.  Extract the ack sequence number and the
438 	 * port number if necessary.
439 	 */
440 	rcv_ackno = ntohl(dn_ack->da_seqno);
441 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
442 		pcb->dp_server_port = sport;
443 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
444 	}
445 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
446 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
447 	else if (rcv_ackno >= pcb->dp_seqno) {
448 		/* We're interested in this ack. Record it. */
449 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
450 	}
451 }
452 
453 void
454 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
455 {
456 	const struct udphdr *udp;
457 	struct mbuf *m;
458 	uint16_t sport, ulen;
459 
460 	/* UDP processing. */
461 
462 	m = *mb;
463 	if (m->m_pkthdr.len < sizeof(*udp)) {
464 		DNETDEBUG("ignoring small UDP packet\n");
465 		return;
466 	}
467 
468 	/* Get UDP headers. */
469 	if (m->m_len < sizeof(*udp)) {
470 		m = m_pullup(m, sizeof(*udp));
471 		*mb = m;
472 		if (m == NULL) {
473 			DNETDEBUG("m_pullup failed\n");
474 			return;
475 		}
476 	}
477 	udp = mtod(m, const void *);
478 
479 	/* We expect to receive UDP packets on the configured client port. */
480 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
481 		DNETDEBUG("not on the expected port.\n");
482 		return;
483 	}
484 
485 	/* Check that ulen does not exceed actual size of data. */
486 	ulen = ntohs(udp->uh_ulen);
487 	if (m->m_pkthdr.len < ulen) {
488 		DNETDEBUG("ignoring runt UDP packet\n");
489 		return;
490 	}
491 
492 	sport = ntohs(udp->uh_sport);
493 
494 	m_adj(m, sizeof(*udp));
495 	ulen -= sizeof(*udp);
496 
497 	if (ulen == sizeof(struct debugnet_ack)) {
498 		debugnet_handle_ack(pcb, mb, sport);
499 		return;
500 	}
501 
502 	if (pcb->dp_rx_handler == NULL) {
503 		if (ulen < sizeof(struct debugnet_ack))
504 			DNETDEBUG("ignoring small ACK packet\n");
505 		else
506 			DNETDEBUG("ignoring unexpected non-ACK packet on "
507 			    "half-duplex connection.\n");
508 		return;
509 	}
510 
511 	debugnet_handle_rx_msg(pcb, mb);
512 }
513 
514 /*
515  * Handler for incoming packets directly from the network adapter
516  * Identifies the packet type (IP or ARP) and passes it along to one of the
517  * helper functions debugnet_handle_ip or debugnet_handle_arp.
518  *
519  * It needs to partially replicate the behaviour of ether_input() and
520  * ether_demux().
521  *
522  * Parameters:
523  *	ifp	the interface the packet came from
524  *	m	an mbuf containing the packet received
525  */
526 static void
527 debugnet_input_one(struct ifnet *ifp, struct mbuf *m)
528 {
529 	struct ifreq ifr;
530 	struct ether_header *eh;
531 	u_short etype;
532 
533 	/* Ethernet processing. */
534 	if ((m->m_flags & M_PKTHDR) == 0) {
535 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
536 		goto done;
537 	}
538 	if (m->m_len < ETHER_HDR_LEN) {
539 		DNETDEBUG_IF(ifp,
540 	    "discard frame without leading eth header (len %u pktlen %u)\n",
541 		    m->m_len, m->m_pkthdr.len);
542 		goto done;
543 	}
544 	if ((m->m_flags & M_HASFCS) != 0) {
545 		m_adj(m, -ETHER_CRC_LEN);
546 		m->m_flags &= ~M_HASFCS;
547 	}
548 	eh = mtod(m, struct ether_header *);
549 	etype = ntohs(eh->ether_type);
550 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
551 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
552 		goto done;
553 	}
554 	if (if_gethwaddr(ifp, &ifr) != 0) {
555 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
556 		goto done;
557 	}
558 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
559 	    ETHER_ADDR_LEN) != 0 &&
560 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
561 		DNETDEBUG_IF(ifp,
562 		    "discard frame with incorrect destination addr\n");
563 		goto done;
564 	}
565 
566 	MPASS(g_debugnet_pcb_inuse);
567 
568 	/* Done ethernet processing. Strip off the ethernet header. */
569 	m_adj(m, ETHER_HDR_LEN);
570 	switch (etype) {
571 	case ETHERTYPE_ARP:
572 		debugnet_handle_arp(&g_dnet_pcb, &m);
573 		break;
574 	case ETHERTYPE_IP:
575 		debugnet_handle_ip(&g_dnet_pcb, &m);
576 		break;
577 	default:
578 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
579 		break;
580 	}
581 done:
582 	if (m != NULL)
583 		m_freem(m);
584 }
585 
586 static void
587 debugnet_input(struct ifnet *ifp, struct mbuf *m)
588 {
589 	struct mbuf *n;
590 
591 	do {
592 		n = m->m_nextpkt;
593 		m->m_nextpkt = NULL;
594 		debugnet_input_one(ifp, m);
595 		m = n;
596 	} while (m != NULL);
597 }
598 
599 /*
600  * Network polling primitive.
601  *
602  * Instead of assuming that most of the network stack is sane, we just poll the
603  * driver directly for packets.
604  */
605 void
606 debugnet_network_poll(struct debugnet_pcb *pcb)
607 {
608 	struct ifnet *ifp;
609 
610 	ifp = pcb->dp_ifp;
611 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
612 }
613 
614 /*
615  * Start of consumer API surface.
616  */
617 void
618 debugnet_free(struct debugnet_pcb *pcb)
619 {
620 	struct ifnet *ifp;
621 
622 	MPASS(pcb == &g_dnet_pcb);
623 	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);
624 
625 	ifp = pcb->dp_ifp;
626 	if (ifp != NULL) {
627 		if (pcb->dp_drv_input != NULL)
628 			ifp->if_input = pcb->dp_drv_input;
629 		if (pcb->dp_event_started)
630 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
631 	}
632 	debugnet_mbuf_finish();
633 
634 	g_debugnet_pcb_inuse = false;
635 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
636 }
637 
638 int
639 debugnet_connect(const struct debugnet_conn_params *dcp,
640     struct debugnet_pcb **pcb_out)
641 {
642 	struct debugnet_proto_aux herald_auxdata;
643 	struct debugnet_pcb *pcb;
644 	struct ifnet *ifp;
645 	int error;
646 
647 	if (g_debugnet_pcb_inuse) {
648 		printf("%s: Only one connection at a time.\n", __func__);
649 		return (EBUSY);
650 	}
651 
652 	pcb = &g_dnet_pcb;
653 	*pcb = (struct debugnet_pcb) {
654 		.dp_state = DN_STATE_INIT,
655 		.dp_client = dcp->dc_client,
656 		.dp_server = dcp->dc_server,
657 		.dp_gateway = dcp->dc_gateway,
658 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
659 		.dp_client_port = dcp->dc_client_port,
660 		.dp_seqno = 1,
661 		.dp_ifp = dcp->dc_ifp,
662 		.dp_rx_handler = dcp->dc_rx_handler,
663 		.dp_drv_input = NULL,
664 	};
665 
666 	/* Switch to the debugnet mbuf zones. */
667 	debugnet_mbuf_start();
668 
669 	/* At least one needed parameter is missing; infer it. */
670 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
671 	    pcb->dp_ifp == NULL) {
672 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
673 		struct ifnet *rt_ifp;
674 		struct nhop_object *nh;
675 
676 		memset(&dest_sin, 0, sizeof(dest_sin));
677 		dest_sin = (struct sockaddr_in) {
678 			.sin_len = sizeof(dest_sin),
679 			.sin_family = AF_INET,
680 			.sin_addr.s_addr = pcb->dp_server,
681 		};
682 
683 		CURVNET_SET(vnet0);
684 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
685 		    NHR_NONE);
686 		CURVNET_RESTORE();
687 
688 		if (nh == NULL) {
689 			printf("%s: Could not get route for that server.\n",
690 			    __func__);
691 			error = ENOENT;
692 			goto cleanup;
693 		}
694 
695 		/* TODO support AF_INET6 */
696 		if (nh->gw_sa.sa_family == AF_INET)
697 			gw_sin = &nh->gw4_sa;
698 		else {
699 			if (nh->gw_sa.sa_family == AF_LINK)
700 				DNETDEBUG("Destination address is on link.\n");
701 			gw_sin = NULL;
702 		}
703 
704 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
705 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
706 
707 		rt_ifp = nh->nh_ifp;
708 
709 		if (pcb->dp_client == INADDR_ANY)
710 			pcb->dp_client = local_sin->sin_addr.s_addr;
711 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
712 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
713 		if (pcb->dp_ifp == NULL)
714 			pcb->dp_ifp = rt_ifp;
715 	}
716 
717 	ifp = pcb->dp_ifp;
718 
719 	if (debugnet_debug > 0) {
720 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
721 		    gwbuf[INET_ADDRSTRLEN];
722 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
723 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
724 		if (pcb->dp_gateway != INADDR_ANY)
725 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
726 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
727 		    serbuf, pcb->dp_server_port,
728 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
729 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
730 		    clibuf, pcb->dp_client_port, if_name(ifp));
731 	}
732 
733 	/* Validate iface is online and supported. */
734 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
735 		printf("%s: interface '%s' does not support debugnet\n",
736 		    __func__, if_name(ifp));
737 		error = ENODEV;
738 		goto cleanup;
739 	}
740 	if ((if_getflags(ifp) & IFF_UP) == 0) {
741 		printf("%s: interface '%s' link is down\n", __func__,
742 		    if_name(ifp));
743 		error = ENXIO;
744 		goto cleanup;
745 	}
746 
747 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
748 	pcb->dp_event_started = true;
749 
750 	/*
751 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
752 	 * while the debugnet ifp's if_input is overridden with
753 	 * debugnet_input().
754 	 */
755 	g_debugnet_pcb_inuse = true;
756 
757 	/* Make the card use *our* receive callback. */
758 	pcb->dp_drv_input = ifp->if_input;
759 	ifp->if_input = debugnet_input;
760 
761 	printf("%s: searching for %s MAC...\n", __func__,
762 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
763 
764 	error = debugnet_arp_gw(pcb);
765 	if (error != 0) {
766 		printf("%s: failed to locate MAC address\n", __func__);
767 		goto cleanup;
768 	}
769 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
770 
771 	herald_auxdata = (struct debugnet_proto_aux) {
772 		.dp_offset_start = dcp->dc_herald_offset,
773 		.dp_aux2 = dcp->dc_herald_aux2,
774 	};
775 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
776 	    dcp->dc_herald_datalen, &herald_auxdata);
777 	if (error != 0) {
778 		printf("%s: failed to herald debugnet server\n", __func__);
779 		goto cleanup;
780 	}
781 
782 	*pcb_out = pcb;
783 	return (0);
784 
785 cleanup:
786 	debugnet_free(pcb);
787 	return (error);
788 }
789 
790 /*
791  * Pre-allocated dump-time mbuf tracking.
792  *
793  * We just track the high water mark we've ever seen and allocate appropriately
794  * for that iface/mtu combo.
795  */
796 static struct {
797 	int nmbuf;
798 	int ncl;
799 	int clsize;
800 } dn_hwm;
801 static struct mtx dn_hwm_lk;
802 MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
803 
804 static void
805 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
806 {
807 	bool any;
808 
809 	any = false;
810 	mtx_lock(&dn_hwm_lk);
811 
812 	if (nmbuf > dn_hwm.nmbuf) {
813 		any = true;
814 		dn_hwm.nmbuf = nmbuf;
815 	} else
816 		nmbuf = dn_hwm.nmbuf;
817 
818 	if (ncl > dn_hwm.ncl) {
819 		any = true;
820 		dn_hwm.ncl = ncl;
821 	} else
822 		ncl = dn_hwm.ncl;
823 
824 	if (clsize > dn_hwm.clsize) {
825 		any = true;
826 		dn_hwm.clsize = clsize;
827 	} else
828 		clsize = dn_hwm.clsize;
829 
830 	mtx_unlock(&dn_hwm_lk);
831 
832 	if (any)
833 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
834 }
835 
836 void
837 debugnet_any_ifnet_update(struct ifnet *ifp)
838 {
839 	int clsize, nmbuf, ncl, nrxr;
840 
841 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
842 		return;
843 
844 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
845 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
846 
847 	/*
848 	 * We need two headers per message on the transmit side. Multiply by
849 	 * four to give us some breathing room.
850 	 */
851 	nmbuf = ncl * (4 + nrxr);
852 	ncl *= nrxr;
853 
854 	/*
855 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
856 	 * dn_init method is available.
857 	 */
858 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
859 #ifndef INVARIANTS
860 		if (bootverbose)
861 #endif
862 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
863 		    __func__, if_name(ifp), ifp);
864 		return;
865 	}
866 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
867 }
868 
869 /*
870  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
871  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
872  *
873  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
874  * because the driver is still in attach.  Since we cannot use down interfaces,
875  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
876  * with vtnet and dhcpclient that event just never occurs.
877  *
878  * So that's how I've landed on the lower level ifnet_link_event.
879  */
880 
881 static void
882 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
883 {
884 	if (link_state == LINK_STATE_UP)
885 		debugnet_any_ifnet_update(ifp);
886 }
887 
888 static eventhandler_tag dn_attach_cookie;
889 static void
890 dn_evh_init(void *ctx __unused)
891 {
892 	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
893 	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
894 }
895 SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
896 
897 /*
898  * DDB parsing helpers for debugnet(4) consumers.
899  */
900 #ifdef DDB
901 struct my_inet_opt {
902 	bool has_opt;
903 	const char *printname;
904 	in_addr_t *result;
905 };
906 
907 static int
908 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
909 {
910 	in_addr_t tmp;
911 	unsigned octet;
912 	int t;
913 
914 	tmp = 0;
915 	for (octet = 0; octet < 4; octet++) {
916 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
917 		if (t != tNUMBER) {
918 			db_printf("%s:%s: octet %u expected number; found %d\n",
919 			    __func__, opt->printname, octet, t);
920 			return (EINVAL);
921 		}
922 		/*
923 		 * db_lex lexes '-' distinctly from the number itself, but
924 		 * let's document that invariant.
925 		 */
926 		MPASS(db_tok_number >= 0);
927 
928 		if (db_tok_number > UINT8_MAX) {
929 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
930 			    opt->printname, octet, (intmax_t)db_tok_number);
931 			return (EDOM);
932 		}
933 
934 		/* Constructed host-endian and converted to network later. */
935 		tmp = (tmp << 8) | db_tok_number;
936 
937 		if (octet < 3) {
938 			t = db_read_token_flags(DRT_WSPACE);
939 			if (t != tDOT) {
940 				db_printf("%s:%s: octet %u expected '.'; found"
941 				    " %d\n", __func__, opt->printname, octet,
942 				    t);
943 				return (EINVAL);
944 			}
945 		}
946 	}
947 
948 	*opt->result = htonl(tmp);
949 	opt->has_opt = true;
950 	return (0);
951 }
952 
953 int
954 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
955 {
956 	struct ifnet *ifp;
957 	int t, error;
958 	bool want_ifp;
959 	char ch;
960 
961 	struct my_inet_opt opt_client = {
962 		.printname = "client",
963 		.result = &result->dd_client,
964 	},
965 	opt_server = {
966 		.printname = "server",
967 		.result = &result->dd_server,
968 	},
969 	opt_gateway = {
970 		.printname = "gateway",
971 		.result = &result->dd_gateway,
972 	},
973 	*cur_inet_opt;
974 
975 	ifp = NULL;
976 	memset(result, 0, sizeof(*result));
977 
978 	/*
979 	 * command [space] [-] [opt] [[space] [optarg]] ...
980 	 *
981 	 * db_command has already lexed 'command' for us.
982 	 */
983 	t = db_read_token_flags(DRT_WSPACE);
984 	if (t == tWSPACE)
985 		t = db_read_token_flags(DRT_WSPACE);
986 
987 	while (t != tEOL) {
988 		if (t != tMINUS) {
989 			db_printf("%s: Bad syntax; expected '-', got %d\n",
990 			    cmd, t);
991 			goto usage;
992 		}
993 
994 		t = db_read_token_flags(DRT_WSPACE);
995 		if (t != tIDENT) {
996 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
997 			    cmd, t);
998 			goto usage;
999 		}
1000 
1001 		if (strlen(db_tok_string) > 1) {
1002 			db_printf("%s: Bad syntax; expected single option "
1003 			    "flag, got '%s'\n", cmd, db_tok_string);
1004 			goto usage;
1005 		}
1006 
1007 		want_ifp = false;
1008 		cur_inet_opt = NULL;
1009 		switch ((ch = db_tok_string[0])) {
1010 		default:
1011 			DNETDEBUG("Unexpected: '%c'\n", ch);
1012 			/* FALLTHROUGH */
1013 		case 'h':
1014 			goto usage;
1015 		case 'c':
1016 			cur_inet_opt = &opt_client;
1017 			break;
1018 		case 'g':
1019 			cur_inet_opt = &opt_gateway;
1020 			break;
1021 		case 's':
1022 			cur_inet_opt = &opt_server;
1023 			break;
1024 		case 'i':
1025 			want_ifp = true;
1026 			break;
1027 		}
1028 
1029 		t = db_read_token_flags(DRT_WSPACE);
1030 		if (t != tWSPACE) {
1031 			db_printf("%s: Bad syntax; expected space after "
1032 			    "flag %c, got %d\n", cmd, ch, t);
1033 			goto usage;
1034 		}
1035 
1036 		if (want_ifp) {
1037 			t = db_read_token_flags(DRT_WSPACE);
1038 			if (t != tIDENT) {
1039 				db_printf("%s: Expected interface but got %d\n",
1040 				    cmd, t);
1041 				goto usage;
1042 			}
1043 
1044 			CURVNET_SET(vnet0);
1045 			/*
1046 			 * We *don't* take a ref here because the only current
1047 			 * consumer, db_netdump_cmd, does not need it.  It
1048 			 * (somewhat redundantly) extracts the if_name(),
1049 			 * re-lookups the ifp, and takes its own reference.
1050 			 */
1051 			ifp = ifunit(db_tok_string);
1052 			CURVNET_RESTORE();
1053 			if (ifp == NULL) {
1054 				db_printf("Could not locate interface %s\n",
1055 				    db_tok_string);
1056 				error = ENOENT;
1057 				goto cleanup;
1058 			}
1059 		} else {
1060 			MPASS(cur_inet_opt != NULL);
1061 			/* Assume IPv4 for now. */
1062 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1063 			if (error != 0)
1064 				goto cleanup;
1065 		}
1066 
1067 		/* Skip (mandatory) whitespace after option, if not EOL. */
1068 		t = db_read_token_flags(DRT_WSPACE);
1069 		if (t == tEOL)
1070 			break;
1071 		if (t != tWSPACE) {
1072 			db_printf("%s: Bad syntax; expected space after "
1073 			    "flag %c option; got %d\n", cmd, ch, t);
1074 			goto usage;
1075 		}
1076 		t = db_read_token_flags(DRT_WSPACE);
1077 	}
1078 
1079 	if (!opt_server.has_opt) {
1080 		db_printf("%s: need a destination server address\n", cmd);
1081 		goto usage;
1082 	}
1083 
1084 	result->dd_has_client = opt_client.has_opt;
1085 	result->dd_has_gateway = opt_gateway.has_opt;
1086 	result->dd_ifp = ifp;
1087 
1088 	/* We parsed the full line to tEOL already, or bailed with an error. */
1089 	return (0);
1090 
1091 usage:
1092 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1093 	    "-i <interface>]\n", cmd);
1094 	error = EINVAL;
1095 	/* FALLTHROUGH */
1096 cleanup:
1097 	db_skip_to_eol();
1098 	return (error);
1099 }
1100 #endif /* DDB */
1101