xref: /freebsd/sys/net/debugnet.c (revision 52c81be11a107cdedb865a274b5567b0c95c0308)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/endian.h>
40 #include <sys/errno.h>
41 #include <sys/eventhandler.h>
42 #include <sys/socket.h>
43 #include <sys/sysctl.h>
44 
45 #ifdef DDB
46 #include <ddb/ddb.h>
47 #include <ddb/db_lex.h>
48 #endif
49 
50 #include <net/ethernet.h>
51 #include <net/if.h>
52 #include <net/if_arp.h>
53 #include <net/if_dl.h>
54 #include <net/if_types.h>
55 #include <net/if_var.h>
56 #include <net/route.h>
57 #include <net/route/nhop.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_fib.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in_var.h>
63 #include <netinet/ip.h>
64 #include <netinet/ip_var.h>
65 #include <netinet/ip_options.h>
66 #include <netinet/udp.h>
67 #include <netinet/udp_var.h>
68 
69 #include <machine/in_cksum.h>
70 #include <machine/pcb.h>
71 
72 #include <net/debugnet.h>
73 #define	DEBUGNET_INTERNAL
74 #include <net/debugnet_int.h>
75 
/* Register the "debugnet" feature string. */
FEATURE(debugnet, "Debugnet support");

/* Parent node for the net.debugnet.* knobs declared below. */
SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "debugnet parameters");

/* Verbosity level, exported as net.debugnet.debug. */
unsigned debugnet_debug;
SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
    &debugnet_debug, 0,
    "Debug message verbosity (0: off; 1: on; 2: verbose)");

/*
 * Retransmission tuning consumed by debugnet_send(): the number of polls to
 * wait for outstanding ACKs before retransmitting, and the number of
 * retransmissions attempted before it gives up with ETIMEDOUT.
 */
int debugnet_npolls = 2000;
SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
    &debugnet_npolls, 0,
    "Number of times to poll before assuming packet loss (0.5ms per poll)");
int debugnet_nretries = 10;
SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
    &debugnet_nretries, 0,
    "Number of retransmit attempts before giving up");

/*
 * There is exactly one, statically allocated PCB.  g_debugnet_pcb_inuse is
 * set by debugnet_connect() and cleared by debugnet_free(), which is what
 * limits debugnet to a single connection at a time.
 */
static bool g_debugnet_pcb_inuse;
static struct debugnet_pcb g_dnet_pcb;
97 
98 /*
99  * Simple accessors for opaque PCB.
100  */
101 const unsigned char *
102 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
103 {
104 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
105 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
106 	return (pcb->dp_gw_mac.octet);
107 }
108 
109 /*
110  * Start of network primitives, beginning with output primitives.
111  */
112 
113 /*
114  * Handles creation of the ethernet header, then places outgoing packets into
115  * the tx buffer for the NIC
116  *
117  * Parameters:
118  *	m	The mbuf containing the packet to be sent (will be freed by
119  *		this function or the NIC driver)
120  *	ifp	The interface to send on
121  *	dst	The destination ethernet address (source address will be looked
122  *		up using ifp)
123  *	etype	The ETHERTYPE_* value for the protocol that is being sent
124  *
125  * Returns:
126  *	int	see errno.h, 0 for success
127  */
128 int
129 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
130     u_short etype)
131 {
132 	struct ether_header *eh;
133 
134 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
135 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
136 		if_printf(ifp, "%s: interface isn't up\n", __func__);
137 		m_freem(m);
138 		return (ENETDOWN);
139 	}
140 
141 	/* Fill in the ethernet header. */
142 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
143 	if (m == NULL) {
144 		printf("%s: out of mbufs\n", __func__);
145 		return (ENOBUFS);
146 	}
147 	eh = mtod(m, struct ether_header *);
148 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
149 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
150 	eh->ether_type = htons(etype);
151 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
152 }
153 
154 /*
155  * Unreliable transmission of an mbuf chain to the debugnet server
156  * Note: can't handle fragmentation; fails if the packet is larger than
157  *	 ifp->if_mtu after adding the UDP/IP headers
158  *
159  * Parameters:
160  *	pcb	The debugnet context block
161  *	m	mbuf chain
162  *
163  * Returns:
164  *	int	see errno.h, 0 for success
165  */
166 static int
167 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
168 {
169 	struct udphdr *udp;
170 
171 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
172 
173 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
174 	if (m == NULL) {
175 		printf("%s: out of mbufs\n", __func__);
176 		return (ENOBUFS);
177 	}
178 
179 	udp = mtod(m, void *);
180 	udp->uh_ulen = htons(m->m_pkthdr.len);
181 	/* Use this src port so that the server can connect() the socket */
182 	udp->uh_sport = htons(pcb->dp_client_port);
183 	udp->uh_dport = htons(pcb->dp_server_port);
184 	/* Computed later (protocol-dependent). */
185 	udp->uh_sum = 0;
186 
187 	return (debugnet_ip_output(pcb, m));
188 }
189 
190 int
191 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
192 {
193 	struct debugnet_ack *dn_ack;
194 	struct mbuf *m;
195 
196 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
197 
198 	m = m_gethdr(M_NOWAIT, MT_DATA);
199 	if (m == NULL) {
200 		printf("%s: Out of mbufs\n", __func__);
201 		return (ENOBUFS);
202 	}
203 	m->m_len = sizeof(*dn_ack);
204 	m->m_pkthdr.len = sizeof(*dn_ack);
205 	MH_ALIGN(m, sizeof(*dn_ack));
206 	dn_ack = mtod(m, void *);
207 	dn_ack->da_seqno = seqno;
208 
209 	return (debugnet_udp_output(pcb, m));
210 }
211 
/*
 * Dummy free function for debugnet clusters.
 *
 * debugnet_send() attaches the caller's data buffer to an mbuf with MEXTADD
 * and registers this routine as the free callback.  It intentionally does
 * nothing.
 */
static void
debugnet_mbuf_free(struct mbuf *m __unused)
{
}
219 
220 /*
221  * Construct and reliably send a debugnet packet.  May fail from a resource
222  * shortage or extreme number of unacknowledged retransmissions.  Wait for
223  * an acknowledgement before returning.  Splits packets into chunks small
224  * enough to be sent without fragmentation (looks up the interface MTU)
225  *
226  * Parameters:
227  *	type	debugnet packet type (HERALD, FINISHED, ...)
228  *	data	data
229  *	datalen	data size (bytes)
230  *	auxdata	optional auxiliary information
231  *
232  * Returns:
233  *	int see errno.h, 0 for success
234  */
235 int
236 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
237     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
238 {
239 	struct debugnet_msg_hdr *dn_msg_hdr;
240 	struct mbuf *m, *m2;
241 	uint64_t want_acks;
242 	uint32_t i, pktlen, sent_so_far;
243 	int retries, polls, error;
244 
245 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
246 		return (ECONNRESET);
247 
248 	want_acks = 0;
249 	pcb->dp_rcvd_acks = 0;
250 	retries = 0;
251 
252 retransmit:
253 	/* Chunks can be too big to fit in packets. */
254 	for (i = sent_so_far = 0; sent_so_far < datalen ||
255 	    (i == 0 && datalen == 0); i++) {
256 		pktlen = datalen - sent_so_far;
257 
258 		/* Bound: the interface MTU (assume no IP options). */
259 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
260 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
261 
262 		/*
263 		 * Check if it is retransmitting and this has been ACKed
264 		 * already.
265 		 */
266 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
267 			sent_so_far += pktlen;
268 			continue;
269 		}
270 
271 		/*
272 		 * Get and fill a header mbuf, then chain data as an extended
273 		 * mbuf.
274 		 */
275 		m = m_gethdr(M_NOWAIT, MT_DATA);
276 		if (m == NULL) {
277 			printf("%s: Out of mbufs\n", __func__);
278 			return (ENOBUFS);
279 		}
280 		m->m_len = sizeof(struct debugnet_msg_hdr);
281 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
282 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
283 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
284 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
285 		dn_msg_hdr->mh_type = htonl(type);
286 		dn_msg_hdr->mh_len = htonl(pktlen);
287 
288 		if (auxdata != NULL) {
289 			dn_msg_hdr->mh_offset =
290 			    htobe64(auxdata->dp_offset_start + sent_so_far);
291 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
292 		} else {
293 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
294 			dn_msg_hdr->mh_aux2 = 0;
295 		}
296 
297 		if (pktlen != 0) {
298 			m2 = m_get(M_NOWAIT, MT_DATA);
299 			if (m2 == NULL) {
300 				m_freem(m);
301 				printf("%s: Out of mbufs\n", __func__);
302 				return (ENOBUFS);
303 			}
304 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
305 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
306 			    EXT_DISPOSABLE);
307 			m2->m_len = pktlen;
308 
309 			m_cat(m, m2);
310 			m->m_pkthdr.len += pktlen;
311 		}
312 		error = debugnet_udp_output(pcb, m);
313 		if (error != 0)
314 			return (error);
315 
316 		/* Note that we're waiting for this packet in the bitfield. */
317 		want_acks |= (1 << i);
318 		sent_so_far += pktlen;
319 	}
320 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
321 		printf("Warning: Sent more than %d packets (%d). "
322 		    "Acknowledgements will fail unless the size of "
323 		    "rcvd_acks/want_acks is increased.\n",
324 		    DEBUGNET_MAX_IN_FLIGHT, i);
325 
326 	/*
327 	 * Wait for acks.  A *real* window would speed things up considerably.
328 	 */
329 	polls = 0;
330 	while (pcb->dp_rcvd_acks != want_acks) {
331 		if (polls++ > debugnet_npolls) {
332 			if (retries++ > debugnet_nretries)
333 				return (ETIMEDOUT);
334 			printf(". ");
335 			goto retransmit;
336 		}
337 		debugnet_network_poll(pcb);
338 		DELAY(500);
339 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
340 			return (ECONNRESET);
341 	}
342 	pcb->dp_seqno += i;
343 	return (0);
344 }
345 
346 /*
347  * Network input primitives.
348  */
349 
350 /*
351  * Just introspect the header enough to fire off a seqno ack and validate
352  * length fits.
353  */
354 static void
355 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
356 {
357 	const struct debugnet_msg_hdr *dnh;
358 	struct mbuf *m;
359 	int error;
360 
361 	m = *mb;
362 
363 	if (m->m_pkthdr.len < sizeof(*dnh)) {
364 		DNETDEBUG("ignoring small debugnet_msg packet\n");
365 		return;
366 	}
367 
368 	/* Get ND header. */
369 	if (m->m_len < sizeof(*dnh)) {
370 		m = m_pullup(m, sizeof(*dnh));
371 		*mb = m;
372 		if (m == NULL) {
373 			DNETDEBUG("m_pullup failed\n");
374 			return;
375 		}
376 	}
377 	dnh = mtod(m, const void *);
378 
379 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
380 		DNETDEBUG("Dropping short packet.\n");
381 		return;
382 	}
383 
384 	/*
385 	 * If the issue is transient (ENOBUFS), sender should resend.  If
386 	 * non-transient (like driver objecting to rx -> tx from the same
387 	 * thread), not much else we can do.
388 	 */
389 	error = debugnet_ack_output(pcb, dnh->mh_seqno);
390 	if (error != 0)
391 		return;
392 
393 	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
394 		printf("Remote shut down the connection on us!\n");
395 		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
396 
397 		/*
398 		 * Continue through to the user handler so they are signalled
399 		 * not to wait for further rx.
400 		 */
401 	}
402 
403 	pcb->dp_rx_handler(pcb, mb);
404 }
405 
406 static void
407 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
408 {
409 	const struct debugnet_ack *dn_ack;
410 	struct mbuf *m;
411 	uint32_t rcv_ackno;
412 
413 	m = *mb;
414 
415 	/* Get Ack. */
416 	if (m->m_len < sizeof(*dn_ack)) {
417 		m = m_pullup(m, sizeof(*dn_ack));
418 		*mb = m;
419 		if (m == NULL) {
420 			DNETDEBUG("m_pullup failed\n");
421 			return;
422 		}
423 	}
424 	dn_ack = mtod(m, const void *);
425 
426 	/* Debugnet processing. */
427 	/*
428 	 * Packet is meant for us.  Extract the ack sequence number and the
429 	 * port number if necessary.
430 	 */
431 	rcv_ackno = ntohl(dn_ack->da_seqno);
432 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
433 		pcb->dp_server_port = sport;
434 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
435 	}
436 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
437 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
438 	else if (rcv_ackno >= pcb->dp_seqno) {
439 		/* We're interested in this ack. Record it. */
440 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
441 	}
442 }
443 
444 void
445 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
446 {
447 	const struct udphdr *udp;
448 	struct mbuf *m;
449 	uint16_t sport, ulen;
450 
451 	/* UDP processing. */
452 
453 	m = *mb;
454 	if (m->m_pkthdr.len < sizeof(*udp)) {
455 		DNETDEBUG("ignoring small UDP packet\n");
456 		return;
457 	}
458 
459 	/* Get UDP headers. */
460 	if (m->m_len < sizeof(*udp)) {
461 		m = m_pullup(m, sizeof(*udp));
462 		*mb = m;
463 		if (m == NULL) {
464 			DNETDEBUG("m_pullup failed\n");
465 			return;
466 		}
467 	}
468 	udp = mtod(m, const void *);
469 
470 	/* We expect to receive UDP packets on the configured client port. */
471 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
472 		DNETDEBUG("not on the expected port.\n");
473 		return;
474 	}
475 
476 	/* Check that ulen does not exceed actual size of data. */
477 	ulen = ntohs(udp->uh_ulen);
478 	if (m->m_pkthdr.len < ulen) {
479 		DNETDEBUG("ignoring runt UDP packet\n");
480 		return;
481 	}
482 
483 	sport = ntohs(udp->uh_sport);
484 
485 	m_adj(m, sizeof(*udp));
486 	ulen -= sizeof(*udp);
487 
488 	if (ulen == sizeof(struct debugnet_ack)) {
489 		debugnet_handle_ack(pcb, mb, sport);
490 		return;
491 	}
492 
493 	if (pcb->dp_rx_handler == NULL) {
494 		if (ulen < sizeof(struct debugnet_ack))
495 			DNETDEBUG("ignoring small ACK packet\n");
496 		else
497 			DNETDEBUG("ignoring unexpected non-ACK packet on "
498 			    "half-duplex connection.\n");
499 		return;
500 	}
501 
502 	debugnet_handle_rx_msg(pcb, mb);
503 }
504 
505 /*
506  * Handler for incoming packets directly from the network adapter
507  * Identifies the packet type (IP or ARP) and passes it along to one of the
508  * helper functions debugnet_handle_ip or debugnet_handle_arp.
509  *
510  * It needs to partially replicate the behaviour of ether_input() and
511  * ether_demux().
512  *
513  * Parameters:
514  *	ifp	the interface the packet came from
515  *	m	an mbuf containing the packet received
516  */
517 static void
518 debugnet_pkt_in(struct ifnet *ifp, struct mbuf *m)
519 {
520 	struct ifreq ifr;
521 	struct ether_header *eh;
522 	u_short etype;
523 
524 	/* Ethernet processing. */
525 	if ((m->m_flags & M_PKTHDR) == 0) {
526 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
527 		goto done;
528 	}
529 	if (m->m_len < ETHER_HDR_LEN) {
530 		DNETDEBUG_IF(ifp,
531 	    "discard frame without leading eth header (len %u pktlen %u)\n",
532 		    m->m_len, m->m_pkthdr.len);
533 		goto done;
534 	}
535 	if ((m->m_flags & M_HASFCS) != 0) {
536 		m_adj(m, -ETHER_CRC_LEN);
537 		m->m_flags &= ~M_HASFCS;
538 	}
539 	eh = mtod(m, struct ether_header *);
540 	etype = ntohs(eh->ether_type);
541 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
542 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
543 		goto done;
544 	}
545 	if (if_gethwaddr(ifp, &ifr) != 0) {
546 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
547 		goto done;
548 	}
549 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
550 	    ETHER_ADDR_LEN) != 0 &&
551 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
552 		DNETDEBUG_IF(ifp,
553 		    "discard frame with incorrect destination addr\n");
554 		goto done;
555 	}
556 
557 	MPASS(g_debugnet_pcb_inuse);
558 
559 	/* Done ethernet processing. Strip off the ethernet header. */
560 	m_adj(m, ETHER_HDR_LEN);
561 	switch (etype) {
562 	case ETHERTYPE_ARP:
563 		debugnet_handle_arp(&g_dnet_pcb, &m);
564 		break;
565 	case ETHERTYPE_IP:
566 		debugnet_handle_ip(&g_dnet_pcb, &m);
567 		break;
568 	default:
569 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
570 		break;
571 	}
572 done:
573 	if (m != NULL)
574 		m_freem(m);
575 }
576 
577 /*
578  * Network polling primitive.
579  *
580  * Instead of assuming that most of the network stack is sane, we just poll the
581  * driver directly for packets.
582  */
583 void
584 debugnet_network_poll(struct debugnet_pcb *pcb)
585 {
586 	struct ifnet *ifp;
587 
588 	ifp = pcb->dp_ifp;
589 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
590 }
591 
592 /*
593  * Start of consumer API surface.
594  */
595 void
596 debugnet_free(struct debugnet_pcb *pcb)
597 {
598 	struct ifnet *ifp;
599 
600 	MPASS(g_debugnet_pcb_inuse);
601 	MPASS(pcb == &g_dnet_pcb);
602 
603 	ifp = pcb->dp_ifp;
604 	if (ifp != NULL) {
605 		if (pcb->dp_drv_input != NULL)
606 			ifp->if_input = pcb->dp_drv_input;
607 		if (pcb->dp_event_started)
608 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
609 	}
610 	debugnet_mbuf_finish();
611 
612 	g_debugnet_pcb_inuse = false;
613 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
614 }
615 
616 int
617 debugnet_connect(const struct debugnet_conn_params *dcp,
618     struct debugnet_pcb **pcb_out)
619 {
620 	struct debugnet_proto_aux herald_auxdata;
621 	struct debugnet_pcb *pcb;
622 	struct ifnet *ifp;
623 	int error;
624 
625 	if (g_debugnet_pcb_inuse) {
626 		printf("%s: Only one connection at a time.\n", __func__);
627 		return (EBUSY);
628 	}
629 
630 	pcb = &g_dnet_pcb;
631 	*pcb = (struct debugnet_pcb) {
632 		.dp_state = DN_STATE_INIT,
633 		.dp_client = dcp->dc_client,
634 		.dp_server = dcp->dc_server,
635 		.dp_gateway = dcp->dc_gateway,
636 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
637 		.dp_client_port = dcp->dc_client_port,
638 		.dp_seqno = 1,
639 		.dp_ifp = dcp->dc_ifp,
640 		.dp_rx_handler = dcp->dc_rx_handler,
641 	};
642 
643 	/* Switch to the debugnet mbuf zones. */
644 	debugnet_mbuf_start();
645 
646 	/* At least one needed parameter is missing; infer it. */
647 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
648 	    pcb->dp_ifp == NULL) {
649 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
650 		struct ifnet *rt_ifp;
651 		struct nhop_object *nh;
652 
653 		memset(&dest_sin, 0, sizeof(dest_sin));
654 		dest_sin = (struct sockaddr_in) {
655 			.sin_len = sizeof(dest_sin),
656 			.sin_family = AF_INET,
657 			.sin_addr.s_addr = pcb->dp_server,
658 		};
659 
660 		CURVNET_SET(vnet0);
661 		nh = fib4_lookup_debugnet(RT_DEFAULT_FIB, dest_sin.sin_addr, 0,
662 		    NHR_NONE);
663 		CURVNET_RESTORE();
664 
665 		if (nh == NULL) {
666 			printf("%s: Could not get route for that server.\n",
667 			    __func__);
668 			error = ENOENT;
669 			goto cleanup;
670 		}
671 
672 		if (nh->gw_sa.sa_family == AF_INET)
673 			gw_sin = &nh->gw4_sa;
674 		else {
675 			if (nh->gw_sa.sa_family == AF_LINK)
676 				DNETDEBUG("Destination address is on link.\n");
677 			gw_sin = NULL;
678 		}
679 
680 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
681 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
682 
683 		rt_ifp = nh->nh_ifp;
684 
685 		if (pcb->dp_client == INADDR_ANY)
686 			pcb->dp_client = local_sin->sin_addr.s_addr;
687 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
688 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
689 		if (pcb->dp_ifp == NULL)
690 			pcb->dp_ifp = rt_ifp;
691 	}
692 
693 	ifp = pcb->dp_ifp;
694 
695 	if (debugnet_debug > 0) {
696 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
697 		    gwbuf[INET_ADDRSTRLEN];
698 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
699 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
700 		if (pcb->dp_gateway != INADDR_ANY)
701 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
702 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
703 		    serbuf, pcb->dp_server_port,
704 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
705 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
706 		    clibuf, pcb->dp_client_port, if_name(ifp));
707 	}
708 
709 	/* Validate iface is online and supported. */
710 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
711 		printf("%s: interface '%s' does not support debugnet\n",
712 		    __func__, if_name(ifp));
713 		error = ENODEV;
714 		goto cleanup;
715 	}
716 	if ((if_getflags(ifp) & IFF_UP) == 0) {
717 		printf("%s: interface '%s' link is down\n", __func__,
718 		    if_name(ifp));
719 		error = ENXIO;
720 		goto cleanup;
721 	}
722 
723 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
724 	pcb->dp_event_started = true;
725 
726 	/*
727 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
728 	 * while the debugnet ifp's if_input is overridden with
729 	 * debugnet_pkt_in.
730 	 */
731 	g_debugnet_pcb_inuse = true;
732 
733 	/* Make the card use *our* receive callback. */
734 	pcb->dp_drv_input = ifp->if_input;
735 	ifp->if_input = debugnet_pkt_in;
736 
737 	printf("%s: searching for %s MAC...\n", __func__,
738 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
739 
740 	error = debugnet_arp_gw(pcb);
741 	if (error != 0) {
742 		printf("%s: failed to locate MAC address\n", __func__);
743 		goto cleanup;
744 	}
745 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
746 
747 	herald_auxdata = (struct debugnet_proto_aux) {
748 		.dp_offset_start = dcp->dc_herald_offset,
749 		.dp_aux2 = dcp->dc_herald_aux2,
750 	};
751 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
752 	    dcp->dc_herald_datalen, &herald_auxdata);
753 	if (error != 0) {
754 		printf("%s: failed to herald debugnet server\n", __func__);
755 		goto cleanup;
756 	}
757 
758 	*pcb_out = pcb;
759 	return (0);
760 
761 cleanup:
762 	debugnet_free(pcb);
763 	return (error);
764 }
765 
766 /*
767  * Pre-allocated dump-time mbuf tracking.
768  *
769  * We just track the high water mark we've ever seen and allocate appropriately
770  * for that iface/mtu combo.
771  */
/*
 * High water marks of the mbuf requirements seen so far.  Read and updated
 * under dn_hwm_lk by dn_maybe_reinit_mbufs().
 */
static struct {
	int nmbuf;	/* largest mbuf count requested */
	int ncl;	/* largest cluster count requested */
	int clsize;	/* largest cluster size requested */
} dn_hwm;
static struct mtx dn_hwm_lk;
MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
779 
780 static void
781 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
782 {
783 	bool any;
784 
785 	any = false;
786 	mtx_lock(&dn_hwm_lk);
787 
788 	if (nmbuf > dn_hwm.nmbuf) {
789 		any = true;
790 		dn_hwm.nmbuf = nmbuf;
791 	} else
792 		nmbuf = dn_hwm.nmbuf;
793 
794 	if (ncl > dn_hwm.ncl) {
795 		any = true;
796 		dn_hwm.ncl = ncl;
797 	} else
798 		ncl = dn_hwm.ncl;
799 
800 	if (clsize > dn_hwm.clsize) {
801 		any = true;
802 		dn_hwm.clsize = clsize;
803 	} else
804 		clsize = dn_hwm.clsize;
805 
806 	mtx_unlock(&dn_hwm_lk);
807 
808 	if (any)
809 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
810 }
811 
812 void
813 debugnet_any_ifnet_update(struct ifnet *ifp)
814 {
815 	int clsize, nmbuf, ncl, nrxr;
816 
817 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
818 		return;
819 
820 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
821 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
822 
823 	/*
824 	 * We need two headers per message on the transmit side. Multiply by
825 	 * four to give us some breathing room.
826 	 */
827 	nmbuf = ncl * (4 + nrxr);
828 	ncl *= nrxr;
829 
830 	/*
831 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
832 	 * dn_init method is available.
833 	 */
834 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
835 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
836 		    __func__, if_name(ifp), ifp);
837 		return;
838 	}
839 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
840 }
841 
842 /*
843  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
844  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
845  *
846  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
847  * because the driver is still in attach.  Since we cannot use down interfaces,
848  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
849  * with vtnet and dhcpclient that event just never occurs.
850  *
851  * So that's how I've landed on the lower level ifnet_link_event.
852  */
853 
854 static void
855 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
856 {
857 	if (link_state == LINK_STATE_UP)
858 		debugnet_any_ifnet_update(ifp);
859 }
860 
/* Tag returned by EVENTHANDLER_REGISTER for the link event hook. */
static eventhandler_tag dn_attach_cookie;
/*
 * Register dn_ifnet_event() as an ifnet_link_event handler.  Invoked via the
 * SYSINIT below.
 */
static void
dn_evh_init(void *ctx __unused)
{
	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
869 
870 /*
871  * DDB parsing helpers for debugnet(4) consumers.
872  */
873 #ifdef DDB
/* Parse state for one IPv4-valued DDB option (client, server or gateway). */
struct my_inet_opt {
	bool has_opt;		/* set once the address parsed successfully */
	const char *printname;	/* option name used in diagnostics */
	in_addr_t *result;	/* receives the address, network byte order */
};
879 
880 static int
881 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
882 {
883 	in_addr_t tmp;
884 	unsigned octet;
885 	int t;
886 
887 	tmp = 0;
888 	for (octet = 0; octet < 4; octet++) {
889 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
890 		if (t != tNUMBER) {
891 			db_printf("%s:%s: octet %u expected number; found %d\n",
892 			    __func__, opt->printname, octet, t);
893 			return (EINVAL);
894 		}
895 		/*
896 		 * db_lex lexes '-' distinctly from the number itself, but
897 		 * let's document that invariant.
898 		 */
899 		MPASS(db_tok_number >= 0);
900 
901 		if (db_tok_number > UINT8_MAX) {
902 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
903 			    opt->printname, octet, (intmax_t)db_tok_number);
904 			return (EDOM);
905 		}
906 
907 		/* Constructed host-endian and converted to network later. */
908 		tmp = (tmp << 8) | db_tok_number;
909 
910 		if (octet < 3) {
911 			t = db_read_token_flags(DRT_WSPACE);
912 			if (t != tDOT) {
913 				db_printf("%s:%s: octet %u expected '.'; found"
914 				    " %d\n", __func__, opt->printname, octet,
915 				    t);
916 				return (EINVAL);
917 			}
918 		}
919 	}
920 
921 	*opt->result = htonl(tmp);
922 	opt->has_opt = true;
923 	return (0);
924 }
925 
926 int
927 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
928 {
929 	struct ifnet *ifp;
930 	int t, error;
931 	bool want_ifp;
932 	char ch;
933 
934 	struct my_inet_opt opt_client = {
935 		.printname = "client",
936 		.result = &result->dd_client,
937 	},
938 	opt_server = {
939 		.printname = "server",
940 		.result = &result->dd_server,
941 	},
942 	opt_gateway = {
943 		.printname = "gateway",
944 		.result = &result->dd_gateway,
945 	},
946 	*cur_inet_opt;
947 
948 	ifp = NULL;
949 	memset(result, 0, sizeof(*result));
950 
951 	/*
952 	 * command [space] [-] [opt] [[space] [optarg]] ...
953 	 *
954 	 * db_command has already lexed 'command' for us.
955 	 */
956 	t = db_read_token_flags(DRT_WSPACE);
957 	if (t == tWSPACE)
958 		t = db_read_token_flags(DRT_WSPACE);
959 
960 	while (t != tEOL) {
961 		if (t != tMINUS) {
962 			db_printf("%s: Bad syntax; expected '-', got %d\n",
963 			    cmd, t);
964 			goto usage;
965 		}
966 
967 		t = db_read_token_flags(DRT_WSPACE);
968 		if (t != tIDENT) {
969 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
970 			    cmd, t);
971 			goto usage;
972 		}
973 
974 		if (strlen(db_tok_string) > 1) {
975 			db_printf("%s: Bad syntax; expected single option "
976 			    "flag, got '%s'\n", cmd, db_tok_string);
977 			goto usage;
978 		}
979 
980 		want_ifp = false;
981 		cur_inet_opt = NULL;
982 		switch ((ch = db_tok_string[0])) {
983 		default:
984 			DNETDEBUG("Unexpected: '%c'\n", ch);
985 			/* FALLTHROUGH */
986 		case 'h':
987 			goto usage;
988 		case 'c':
989 			cur_inet_opt = &opt_client;
990 			break;
991 		case 'g':
992 			cur_inet_opt = &opt_gateway;
993 			break;
994 		case 's':
995 			cur_inet_opt = &opt_server;
996 			break;
997 		case 'i':
998 			want_ifp = true;
999 			break;
1000 		}
1001 
1002 		t = db_read_token_flags(DRT_WSPACE);
1003 		if (t != tWSPACE) {
1004 			db_printf("%s: Bad syntax; expected space after "
1005 			    "flag %c, got %d\n", cmd, ch, t);
1006 			goto usage;
1007 		}
1008 
1009 		if (want_ifp) {
1010 			t = db_read_token_flags(DRT_WSPACE);
1011 			if (t != tIDENT) {
1012 				db_printf("%s: Expected interface but got %d\n",
1013 				    cmd, t);
1014 				goto usage;
1015 			}
1016 
1017 			CURVNET_SET(vnet0);
1018 			/*
1019 			 * We *don't* take a ref here because the only current
1020 			 * consumer, db_netdump_cmd, does not need it.  It
1021 			 * (somewhat redundantly) extracts the if_name(),
1022 			 * re-lookups the ifp, and takes its own reference.
1023 			 */
1024 			ifp = ifunit(db_tok_string);
1025 			CURVNET_RESTORE();
1026 			if (ifp == NULL) {
1027 				db_printf("Could not locate interface %s\n",
1028 				    db_tok_string);
1029 				goto cleanup;
1030 			}
1031 		} else {
1032 			MPASS(cur_inet_opt != NULL);
1033 			/* Assume IPv4 for now. */
1034 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1035 			if (error != 0)
1036 				goto cleanup;
1037 		}
1038 
1039 		/* Skip (mandatory) whitespace after option, if not EOL. */
1040 		t = db_read_token_flags(DRT_WSPACE);
1041 		if (t == tEOL)
1042 			break;
1043 		if (t != tWSPACE) {
1044 			db_printf("%s: Bad syntax; expected space after "
1045 			    "flag %c option; got %d\n", cmd, ch, t);
1046 			goto usage;
1047 		}
1048 		t = db_read_token_flags(DRT_WSPACE);
1049 	}
1050 
1051 	if (!opt_server.has_opt) {
1052 		db_printf("%s: need a destination server address\n", cmd);
1053 		goto usage;
1054 	}
1055 
1056 	result->dd_has_client = opt_client.has_opt;
1057 	result->dd_has_gateway = opt_gateway.has_opt;
1058 	result->dd_ifp = ifp;
1059 
1060 	/* We parsed the full line to tEOL already, or bailed with an error. */
1061 	return (0);
1062 
1063 usage:
1064 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1065 	    "-i <interface>]\n", cmd);
1066 	error = EINVAL;
1067 	/* FALLTHROUGH */
1068 cleanup:
1069 	db_skip_to_eol();
1070 	return (error);
1071 }
1072 #endif /* DDB */
1073