xref: /freebsd/sys/net/debugnet.c (revision 87b759f0fa1f7554d50ce640c40138512bbded44)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/endian.h>
38 #include <sys/errno.h>
39 #include <sys/eventhandler.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 
46 #ifdef DDB
47 #include <ddb/ddb.h>
48 #include <ddb/db_lex.h>
49 #endif
50 
51 #include <net/ethernet.h>
52 #include <net/if.h>
53 #include <net/if_arp.h>
54 #include <net/if_dl.h>
55 #include <net/if_types.h>
56 #include <net/if_var.h>
57 #include <net/if_private.h>
58 #include <net/vnet.h>
59 #include <net/route.h>
60 #include <net/route/nhop.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/in_fib.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_options.h>
69 #include <netinet/udp.h>
70 #include <netinet/udp_var.h>
71 
72 #include <machine/in_cksum.h>
73 #include <machine/pcb.h>
74 
75 #include <net/debugnet.h>
76 #define	DEBUGNET_INTERNAL
77 #include <net/debugnet_int.h>
78 
/* Register the "debugnet" feature string. */
FEATURE(debugnet, "Debugnet support");

/* Parent node for the net.debugnet.* sysctls declared below. */
SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "debugnet parameters");

/* Debug verbosity; debugnet_connect() prints extra detail when it is > 0. */
unsigned debugnet_debug;
SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
    &debugnet_debug, 0,
    "Debug message verbosity (0: off; 1: on; 2: verbose)");

/*
 * Bound on the number of poll iterations debugnet_send() spends waiting for
 * ACKs before it retransmits the outstanding packets.
 */
int debugnet_npolls = 2000;
SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
    &debugnet_npolls, 0,
    "Number of times to poll before assuming packet loss (0.5ms per poll)");
/* Bound on retransmit rounds before debugnet_send() returns ETIMEDOUT. */
int debugnet_nretries = 10;
SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
    &debugnet_nretries, 0,
    "Number of retransmit attempts before giving up");
/* FIB number handed to the route lookup in debugnet_connect(). */
int debugnet_fib = RT_DEFAULT_FIB;
SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
    &debugnet_fib, 0,
    "Fib to use when sending dump");

/*
 * The single, statically allocated connection block.  g_debugnet_pcb_inuse
 * is set by debugnet_connect() and cleared by debugnet_free(); a connect
 * attempt while it is set fails with EBUSY.
 */
static bool g_debugnet_pcb_inuse;
static struct debugnet_pcb g_dnet_pcb;
104 
105 /*
106  * Simple accessors for opaque PCB.
107  */
108 const unsigned char *
109 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
110 {
111 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
112 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
113 	return (pcb->dp_gw_mac.octet);
114 }
115 
116 const in_addr_t *
117 debugnet_get_server_addr(const struct debugnet_pcb *pcb)
118 {
119 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
120 	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);
121 	return (&pcb->dp_server);
122 }
123 
124 const uint16_t
125 debugnet_get_server_port(const struct debugnet_pcb *pcb)
126 {
127 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
128 	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);
129 	return (pcb->dp_server_port);
130 }
131 
132 /*
133  * Start of network primitives, beginning with output primitives.
134  */
135 
136 /*
137  * Handles creation of the ethernet header, then places outgoing packets into
138  * the tx buffer for the NIC
139  *
140  * Parameters:
141  *	m	The mbuf containing the packet to be sent (will be freed by
142  *		this function or the NIC driver)
143  *	ifp	The interface to send on
144  *	dst	The destination ethernet address (source address will be looked
145  *		up using ifp)
146  *	etype	The ETHERTYPE_* value for the protocol that is being sent
147  *
148  * Returns:
149  *	int	see errno.h, 0 for success
150  */
151 int
152 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
153     u_short etype)
154 {
155 	struct ether_header *eh;
156 
157 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
158 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
159 		if_printf(ifp, "%s: interface isn't up\n", __func__);
160 		m_freem(m);
161 		return (ENETDOWN);
162 	}
163 
164 	/* Fill in the ethernet header. */
165 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
166 	if (m == NULL) {
167 		printf("%s: out of mbufs\n", __func__);
168 		return (ENOBUFS);
169 	}
170 	eh = mtod(m, struct ether_header *);
171 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
172 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
173 	eh->ether_type = htons(etype);
174 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
175 }
176 
177 /*
178  * Unreliable transmission of an mbuf chain to the debugnet server
179  * Note: can't handle fragmentation; fails if the packet is larger than
180  *	 ifp->if_mtu after adding the UDP/IP headers
181  *
182  * Parameters:
183  *	pcb	The debugnet context block
184  *	m	mbuf chain
185  *
186  * Returns:
187  *	int	see errno.h, 0 for success
188  */
189 static int
190 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
191 {
192 	struct udphdr *udp;
193 
194 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
195 
196 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
197 	if (m == NULL) {
198 		printf("%s: out of mbufs\n", __func__);
199 		return (ENOBUFS);
200 	}
201 
202 	udp = mtod(m, struct udphdr *);
203 	udp->uh_ulen = htons(m->m_pkthdr.len);
204 	/* Use this src port so that the server can connect() the socket */
205 	udp->uh_sport = htons(pcb->dp_client_port);
206 	udp->uh_dport = htons(pcb->dp_server_port);
207 	/* Computed later (protocol-dependent). */
208 	udp->uh_sum = 0;
209 
210 	return (debugnet_ip_output(pcb, m));
211 }
212 
213 int
214 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
215 {
216 	struct debugnet_ack *dn_ack;
217 	struct mbuf *m;
218 
219 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
220 
221 	m = m_gethdr(M_NOWAIT, MT_DATA);
222 	if (m == NULL) {
223 		printf("%s: Out of mbufs\n", __func__);
224 		return (ENOBUFS);
225 	}
226 	m->m_len = sizeof(*dn_ack);
227 	m->m_pkthdr.len = sizeof(*dn_ack);
228 	MH_ALIGN(m, sizeof(*dn_ack));
229 	dn_ack = mtod(m, struct debugnet_ack *);
230 	dn_ack->da_seqno = seqno;
231 
232 	return (debugnet_udp_output(pcb, m));
233 }
234 
/*
 * Dummy free function for debugnet clusters.
 *
 * debugnet_send() attaches the caller's data buffer to an mbuf as external
 * storage via MEXTADD() and registers this function as the free callback;
 * it intentionally does nothing.
 */
static void
debugnet_mbuf_free(struct mbuf *m __unused)
{
}
242 
243 /*
244  * Construct and reliably send a debugnet packet.  May fail from a resource
245  * shortage or extreme number of unacknowledged retransmissions.  Wait for
246  * an acknowledgement before returning.  Splits packets into chunks small
247  * enough to be sent without fragmentation (looks up the interface MTU)
248  *
249  * Parameters:
250  *	type	debugnet packet type (HERALD, FINISHED, ...)
251  *	data	data
252  *	datalen	data size (bytes)
253  *	auxdata	optional auxiliary information
254  *
255  * Returns:
256  *	int see errno.h, 0 for success
257  */
258 int
259 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
260     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
261 {
262 	struct debugnet_msg_hdr *dn_msg_hdr;
263 	struct mbuf *m, *m2;
264 	uint64_t want_acks;
265 	uint32_t i, pktlen, sent_so_far;
266 	int retries, polls, error;
267 
268 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
269 		return (ECONNRESET);
270 
271 	want_acks = 0;
272 	pcb->dp_rcvd_acks = 0;
273 	retries = 0;
274 
275 retransmit:
276 	/* Chunks can be too big to fit in packets. */
277 	for (i = sent_so_far = 0; sent_so_far < datalen ||
278 	    (i == 0 && datalen == 0); i++) {
279 		pktlen = datalen - sent_so_far;
280 
281 		/* Bound: the interface MTU (assume no IP options). */
282 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
283 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
284 
285 		/*
286 		 * Check if it is retransmitting and this has been ACKed
287 		 * already.
288 		 */
289 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
290 			sent_so_far += pktlen;
291 			continue;
292 		}
293 
294 		/*
295 		 * Get and fill a header mbuf, then chain data as an extended
296 		 * mbuf.
297 		 */
298 		m = m_gethdr(M_NOWAIT, MT_DATA);
299 		if (m == NULL) {
300 			printf("%s: Out of mbufs\n", __func__);
301 			return (ENOBUFS);
302 		}
303 		m->m_len = sizeof(struct debugnet_msg_hdr);
304 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
305 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
306 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
307 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
308 		dn_msg_hdr->mh_type = htonl(type);
309 		dn_msg_hdr->mh_len = htonl(pktlen);
310 
311 		if (auxdata != NULL) {
312 			dn_msg_hdr->mh_offset =
313 			    htobe64(auxdata->dp_offset_start + sent_so_far);
314 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
315 		} else {
316 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
317 			dn_msg_hdr->mh_aux2 = 0;
318 		}
319 
320 		if (pktlen != 0) {
321 			m2 = m_get(M_NOWAIT, MT_DATA);
322 			if (m2 == NULL) {
323 				m_freem(m);
324 				printf("%s: Out of mbufs\n", __func__);
325 				return (ENOBUFS);
326 			}
327 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
328 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
329 			    EXT_DISPOSABLE);
330 			m2->m_len = pktlen;
331 
332 			m_cat(m, m2);
333 			m->m_pkthdr.len += pktlen;
334 		}
335 		error = debugnet_udp_output(pcb, m);
336 		if (error != 0)
337 			return (error);
338 
339 		/* Note that we're waiting for this packet in the bitfield. */
340 		want_acks |= (1 << i);
341 		sent_so_far += pktlen;
342 	}
343 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
344 		printf("Warning: Sent more than %d packets (%d). "
345 		    "Acknowledgements will fail unless the size of "
346 		    "rcvd_acks/want_acks is increased.\n",
347 		    DEBUGNET_MAX_IN_FLIGHT, i);
348 
349 	/*
350 	 * Wait for acks.  A *real* window would speed things up considerably.
351 	 */
352 	polls = 0;
353 	while (pcb->dp_rcvd_acks != want_acks) {
354 		if (polls++ > debugnet_npolls) {
355 			if (retries++ > debugnet_nretries)
356 				return (ETIMEDOUT);
357 			printf(". ");
358 			goto retransmit;
359 		}
360 		debugnet_network_poll(pcb);
361 		DELAY(500);
362 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
363 			return (ECONNRESET);
364 	}
365 	pcb->dp_seqno += i;
366 	return (0);
367 }
368 
369 /*
370  * Network input primitives.
371  */
372 
373 /*
374  * Just introspect the header enough to fire off a seqno ack and validate
375  * length fits.
376  */
377 static void
378 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
379 {
380 	const struct debugnet_msg_hdr *dnh;
381 	struct mbuf *m;
382 	uint32_t hdr_type;
383 	uint32_t seqno;
384 	int error;
385 
386 	m = *mb;
387 
388 	if (m->m_pkthdr.len < sizeof(*dnh)) {
389 		DNETDEBUG("ignoring small debugnet_msg packet\n");
390 		return;
391 	}
392 
393 	/* Get ND header. */
394 	if (m->m_len < sizeof(*dnh)) {
395 		m = m_pullup(m, sizeof(*dnh));
396 		*mb = m;
397 		if (m == NULL) {
398 			DNETDEBUG("m_pullup failed\n");
399 			return;
400 		}
401 	}
402 
403 	dnh = mtod(m, const struct debugnet_msg_hdr *);
404 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
405 		DNETDEBUG("Dropping short packet.\n");
406 		return;
407 	}
408 
409 	hdr_type = ntohl(dnh->mh_type);
410 	if (hdr_type != DEBUGNET_DATA) {
411 		if (hdr_type == DEBUGNET_FINISHED) {
412 			printf("Remote shut down the connection on us!\n");
413 			pcb->dp_state = DN_STATE_REMOTE_CLOSED;
414 			if (pcb->dp_finish_handler != NULL) {
415 				pcb->dp_finish_handler();
416 			}
417 		} else {
418 			DNETDEBUG("Got unexpected debugnet message %u\n", hdr_type);
419 		}
420 		return;
421 	}
422 
423 	/*
424 	 * If the issue is transient (ENOBUFS), sender should resend.  If
425 	 * non-transient (like driver objecting to rx -> tx from the same
426 	 * thread), not much else we can do.
427 	 */
428 	seqno = dnh->mh_seqno; /* net endian */
429 	m_adj(m, sizeof(*dnh));
430 	dnh = NULL;
431 	error = pcb->dp_rx_handler(m);
432 	if (error != 0) {
433 		DNETDEBUG("RX handler was not able to accept message, error %d. "
434 		    "Skipping ack.\n", error);
435 		return;
436 	}
437 
438 	error = debugnet_ack_output(pcb, seqno);
439 	if (error != 0) {
440 		DNETDEBUG("Couldn't ACK rx packet %u; %d\n", ntohl(seqno), error);
441 	}
442 }
443 
444 static void
445 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
446 {
447 	const struct debugnet_ack *dn_ack;
448 	struct mbuf *m;
449 	uint32_t rcv_ackno;
450 
451 	m = *mb;
452 
453 	/* Get Ack. */
454 	if (m->m_len < sizeof(*dn_ack)) {
455 		m = m_pullup(m, sizeof(*dn_ack));
456 		*mb = m;
457 		if (m == NULL) {
458 			DNETDEBUG("m_pullup failed\n");
459 			return;
460 		}
461 	}
462 	dn_ack = mtod(m, const struct debugnet_ack *);
463 
464 	/* Debugnet processing. */
465 	/*
466 	 * Packet is meant for us.  Extract the ack sequence number and the
467 	 * port number if necessary.
468 	 */
469 	rcv_ackno = ntohl(dn_ack->da_seqno);
470 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
471 		pcb->dp_server_port = sport;
472 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
473 	}
474 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
475 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
476 	else if (rcv_ackno >= pcb->dp_seqno) {
477 		/* We're interested in this ack. Record it. */
478 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
479 	}
480 }
481 
482 void
483 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
484 {
485 	const struct udphdr *udp;
486 	struct mbuf *m;
487 	uint16_t sport, ulen;
488 
489 	/* UDP processing. */
490 
491 	m = *mb;
492 	if (m->m_pkthdr.len < sizeof(*udp)) {
493 		DNETDEBUG("ignoring small UDP packet\n");
494 		return;
495 	}
496 
497 	/* Get UDP headers. */
498 	if (m->m_len < sizeof(*udp)) {
499 		m = m_pullup(m, sizeof(*udp));
500 		*mb = m;
501 		if (m == NULL) {
502 			DNETDEBUG("m_pullup failed\n");
503 			return;
504 		}
505 	}
506 	udp = mtod(m, const struct udphdr *);
507 
508 	/* We expect to receive UDP packets on the configured client port. */
509 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
510 		DNETDEBUG("not on the expected port.\n");
511 		return;
512 	}
513 
514 	/* Check that ulen does not exceed actual size of data. */
515 	ulen = ntohs(udp->uh_ulen);
516 	if (m->m_pkthdr.len < ulen) {
517 		DNETDEBUG("ignoring runt UDP packet\n");
518 		return;
519 	}
520 
521 	sport = ntohs(udp->uh_sport);
522 
523 	m_adj(m, sizeof(*udp));
524 	ulen -= sizeof(*udp);
525 
526 	if (ulen == sizeof(struct debugnet_ack)) {
527 		debugnet_handle_ack(pcb, mb, sport);
528 		return;
529 	}
530 
531 	if (pcb->dp_rx_handler == NULL) {
532 		if (ulen < sizeof(struct debugnet_ack))
533 			DNETDEBUG("ignoring small ACK packet\n");
534 		else
535 			DNETDEBUG("ignoring unexpected non-ACK packet on "
536 			    "half-duplex connection.\n");
537 		return;
538 	}
539 
540 	debugnet_handle_rx_msg(pcb, mb);
541 }
542 
543 /*
544  * Handler for incoming packets directly from the network adapter
545  * Identifies the packet type (IP or ARP) and passes it along to one of the
546  * helper functions debugnet_handle_ip or debugnet_handle_arp.
547  *
548  * It needs to partially replicate the behaviour of ether_input() and
549  * ether_demux().
550  *
551  * Parameters:
552  *	ifp	the interface the packet came from
553  *	m	an mbuf containing the packet received
554  */
555 static void
556 debugnet_input_one(struct ifnet *ifp, struct mbuf *m)
557 {
558 	struct ifreq ifr;
559 	struct ether_header *eh;
560 	u_short etype;
561 
562 	/* Ethernet processing. */
563 	if ((m->m_flags & M_PKTHDR) == 0) {
564 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
565 		goto done;
566 	}
567 	if (m->m_len < ETHER_HDR_LEN) {
568 		DNETDEBUG_IF(ifp,
569 	    "discard frame without leading eth header (len %d pktlen %d)\n",
570 		    m->m_len, m->m_pkthdr.len);
571 		goto done;
572 	}
573 	eh = mtod(m, struct ether_header *);
574 	etype = ntohs(eh->ether_type);
575 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
576 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
577 		goto done;
578 	}
579 	if (if_gethwaddr(ifp, &ifr) != 0) {
580 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
581 		goto done;
582 	}
583 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
584 	    ETHER_ADDR_LEN) != 0 &&
585 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
586 		DNETDEBUG_IF(ifp,
587 		    "discard frame with incorrect destination addr\n");
588 		goto done;
589 	}
590 
591 	MPASS(g_debugnet_pcb_inuse);
592 
593 	/* Done ethernet processing. Strip off the ethernet header. */
594 	m_adj(m, ETHER_HDR_LEN);
595 	switch (etype) {
596 	case ETHERTYPE_ARP:
597 		debugnet_handle_arp(&g_dnet_pcb, &m);
598 		break;
599 	case ETHERTYPE_IP:
600 		debugnet_handle_ip(&g_dnet_pcb, &m);
601 		break;
602 	default:
603 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
604 		break;
605 	}
606 done:
607 	if (m != NULL)
608 		m_freem(m);
609 }
610 
611 static void
612 debugnet_input(struct ifnet *ifp, struct mbuf *m)
613 {
614 	struct mbuf *n;
615 
616 	do {
617 		n = m->m_nextpkt;
618 		m->m_nextpkt = NULL;
619 		debugnet_input_one(ifp, m);
620 		m = n;
621 	} while (m != NULL);
622 }
623 
624 /*
625  * Network polling primitive.
626  *
627  * Instead of assuming that most of the network stack is sane, we just poll the
628  * driver directly for packets.
629  */
630 void
631 debugnet_network_poll(struct debugnet_pcb *pcb)
632 {
633 	struct ifnet *ifp;
634 
635 	ifp = pcb->dp_ifp;
636 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
637 }
638 
639 /*
640  * Start of consumer API surface.
641  */
642 void
643 debugnet_free(struct debugnet_pcb *pcb)
644 {
645 	struct ifnet *ifp;
646 
647 	MPASS(pcb == &g_dnet_pcb);
648 	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);
649 
650 	ifp = pcb->dp_ifp;
651 	if (ifp != NULL) {
652 		if (pcb->dp_drv_input != NULL)
653 			ifp->if_input = pcb->dp_drv_input;
654 		if (pcb->dp_event_started)
655 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
656 	}
657 	debugnet_mbuf_finish();
658 
659 	g_debugnet_pcb_inuse = false;
660 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
661 }
662 
663 int
664 debugnet_connect(const struct debugnet_conn_params *dcp,
665     struct debugnet_pcb **pcb_out)
666 {
667 	struct debugnet_proto_aux herald_auxdata;
668 	struct debugnet_pcb *pcb;
669 	struct ifnet *ifp;
670 	int error;
671 
672 	if (g_debugnet_pcb_inuse) {
673 		printf("%s: Only one connection at a time.\n", __func__);
674 		return (EBUSY);
675 	}
676 
677 	pcb = &g_dnet_pcb;
678 	*pcb = (struct debugnet_pcb) {
679 		.dp_state = DN_STATE_INIT,
680 		.dp_client = dcp->dc_client,
681 		.dp_server = dcp->dc_server,
682 		.dp_gateway = dcp->dc_gateway,
683 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
684 		.dp_client_port = dcp->dc_client_port,
685 		.dp_seqno = 1,
686 		.dp_ifp = dcp->dc_ifp,
687 		.dp_rx_handler = dcp->dc_rx_handler,
688 		.dp_drv_input = NULL,
689 	};
690 
691 	/* Switch to the debugnet mbuf zones. */
692 	debugnet_mbuf_start();
693 
694 	/* At least one needed parameter is missing; infer it. */
695 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
696 	    pcb->dp_ifp == NULL) {
697 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
698 		struct ifnet *rt_ifp;
699 		struct nhop_object *nh;
700 
701 		memset(&dest_sin, 0, sizeof(dest_sin));
702 		dest_sin = (struct sockaddr_in) {
703 			.sin_len = sizeof(dest_sin),
704 			.sin_family = AF_INET,
705 			.sin_addr.s_addr = pcb->dp_server,
706 		};
707 
708 		CURVNET_SET(vnet0);
709 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
710 		    NHR_NONE);
711 		CURVNET_RESTORE();
712 
713 		if (nh == NULL) {
714 			printf("%s: Could not get route for that server.\n",
715 			    __func__);
716 			error = ENOENT;
717 			goto cleanup;
718 		}
719 
720 		/* TODO support AF_INET6 */
721 		if (nh->gw_sa.sa_family == AF_INET)
722 			gw_sin = &nh->gw4_sa;
723 		else {
724 			if (nh->gw_sa.sa_family == AF_LINK)
725 				DNETDEBUG("Destination address is on link.\n");
726 			gw_sin = NULL;
727 		}
728 
729 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
730 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
731 
732 		rt_ifp = nh->nh_ifp;
733 
734 		if (pcb->dp_client == INADDR_ANY)
735 			pcb->dp_client = local_sin->sin_addr.s_addr;
736 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
737 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
738 		if (pcb->dp_ifp == NULL)
739 			pcb->dp_ifp = rt_ifp;
740 	}
741 
742 	ifp = pcb->dp_ifp;
743 
744 	if (debugnet_debug > 0) {
745 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
746 		    gwbuf[INET_ADDRSTRLEN];
747 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
748 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
749 		if (pcb->dp_gateway != INADDR_ANY)
750 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
751 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
752 		    serbuf, pcb->dp_server_port,
753 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
754 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
755 		    clibuf, pcb->dp_client_port, if_name(ifp));
756 	}
757 
758 	/* Validate iface is online and supported. */
759 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
760 		printf("%s: interface '%s' does not support debugnet\n",
761 		    __func__, if_name(ifp));
762 		error = ENODEV;
763 		goto cleanup;
764 	}
765 	if ((if_getflags(ifp) & IFF_UP) == 0) {
766 		printf("%s: interface '%s' link is down\n", __func__,
767 		    if_name(ifp));
768 		error = ENXIO;
769 		goto cleanup;
770 	}
771 
772 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
773 	pcb->dp_event_started = true;
774 
775 	/*
776 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
777 	 * while the debugnet ifp's if_input is overridden with
778 	 * debugnet_input().
779 	 */
780 	g_debugnet_pcb_inuse = true;
781 
782 	/* Make the card use *our* receive callback. */
783 	pcb->dp_drv_input = ifp->if_input;
784 	ifp->if_input = debugnet_input;
785 
786 	printf("%s: searching for %s MAC...\n", __func__,
787 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
788 
789 	error = debugnet_arp_gw(pcb);
790 	if (error != 0) {
791 		printf("%s: failed to locate MAC address\n", __func__);
792 		goto cleanup;
793 	}
794 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
795 
796 	herald_auxdata = (struct debugnet_proto_aux) {
797 		.dp_offset_start = dcp->dc_herald_offset,
798 		.dp_aux2 = dcp->dc_herald_aux2,
799 	};
800 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
801 	    dcp->dc_herald_datalen, &herald_auxdata);
802 	if (error != 0) {
803 		printf("%s: failed to herald debugnet server\n", __func__);
804 		goto cleanup;
805 	}
806 
807 	*pcb_out = pcb;
808 	return (0);
809 
810 cleanup:
811 	debugnet_free(pcb);
812 	return (error);
813 }
814 
/*
 * Pre-allocated dump-time mbuf tracking.
 *
 * We just track the high water mark we've ever seen and allocate appropriately
 * for that iface/mtu combo.
 *
 * Each field holds the largest value passed to dn_maybe_reinit_mbufs() so
 * far; all three are read and updated under dn_hwm_lk.
 */
static struct {
	int nmbuf;	/* largest mbuf count requested */
	int ncl;	/* largest cluster count requested */
	int clsize;	/* largest cluster size requested */
} dn_hwm;
static struct mtx dn_hwm_lk;
MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
828 
829 static void
830 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
831 {
832 	bool any;
833 
834 	any = false;
835 	mtx_lock(&dn_hwm_lk);
836 
837 	if (nmbuf > dn_hwm.nmbuf) {
838 		any = true;
839 		dn_hwm.nmbuf = nmbuf;
840 	} else
841 		nmbuf = dn_hwm.nmbuf;
842 
843 	if (ncl > dn_hwm.ncl) {
844 		any = true;
845 		dn_hwm.ncl = ncl;
846 	} else
847 		ncl = dn_hwm.ncl;
848 
849 	if (clsize > dn_hwm.clsize) {
850 		any = true;
851 		dn_hwm.clsize = clsize;
852 	} else
853 		clsize = dn_hwm.clsize;
854 
855 	mtx_unlock(&dn_hwm_lk);
856 
857 	if (any)
858 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
859 }
860 
861 void
862 debugnet_any_ifnet_update(struct ifnet *ifp)
863 {
864 	int clsize, nmbuf, ncl, nrxr;
865 
866 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
867 		return;
868 
869 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
870 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
871 
872 	/*
873 	 * We need two headers per message on the transmit side. Multiply by
874 	 * four to give us some breathing room.
875 	 */
876 	nmbuf = ncl * (4 + nrxr);
877 	ncl *= nrxr;
878 
879 	/*
880 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
881 	 * dn_init method is available.
882 	 */
883 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
884 #ifndef INVARIANTS
885 		if (bootverbose)
886 #endif
887 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
888 		    __func__, if_name(ifp), ifp);
889 		return;
890 	}
891 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
892 }
893 
894 /*
895  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
896  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
897  *
898  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
899  * because the driver is still in attach.  Since we cannot use down interfaces,
900  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
901  * with vtnet and dhcpclient that event just never occurs.
902  *
903  * So that's how I've landed on the lower level ifnet_link_event.
904  */
905 
906 static void
907 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
908 {
909 	if (link_state == LINK_STATE_UP)
910 		debugnet_any_ifnet_update(ifp);
911 }
912 
/* Tag returned by the ifnet_link_event registration below. */
static eventhandler_tag dn_attach_cookie;
/*
 * SYSINIT hook: register dn_ifnet_event() as an ifnet_link_event handler.
 */
static void
dn_evh_init(void *ctx __unused)
{
	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
921 
922 /*
923  * DDB parsing helpers for debugnet(4) consumers.
924  */
925 #ifdef DDB
/*
 * Parse state for one IPv4-address option of the DDB command line.
 */
struct my_inet_opt {
	bool has_opt;		/* set once the address parsed successfully */
	const char *printname;	/* option name used in error messages */
	in_addr_t *result;	/* where the parsed address is stored */
};
931 
932 static int
933 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
934 {
935 	in_addr_t tmp;
936 	unsigned octet;
937 	int t;
938 
939 	tmp = 0;
940 	for (octet = 0; octet < 4; octet++) {
941 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
942 		if (t != tNUMBER) {
943 			db_printf("%s:%s: octet %u expected number; found %d\n",
944 			    __func__, opt->printname, octet, t);
945 			return (EINVAL);
946 		}
947 		/*
948 		 * db_lex lexes '-' distinctly from the number itself, but
949 		 * let's document that invariant.
950 		 */
951 		MPASS(db_tok_number >= 0);
952 
953 		if (db_tok_number > UINT8_MAX) {
954 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
955 			    opt->printname, octet, (intmax_t)db_tok_number);
956 			return (EDOM);
957 		}
958 
959 		/* Constructed host-endian and converted to network later. */
960 		tmp = (tmp << 8) | db_tok_number;
961 
962 		if (octet < 3) {
963 			t = db_read_token_flags(DRT_WSPACE);
964 			if (t != tDOT) {
965 				db_printf("%s:%s: octet %u expected '.'; found"
966 				    " %d\n", __func__, opt->printname, octet,
967 				    t);
968 				return (EINVAL);
969 			}
970 		}
971 	}
972 
973 	*opt->result = htonl(tmp);
974 	opt->has_opt = true;
975 	return (0);
976 }
977 
978 int
979 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
980 {
981 	struct ifnet *ifp;
982 	int t, error;
983 	bool want_ifp;
984 	char ch;
985 
986 	struct my_inet_opt opt_client = {
987 		.printname = "client",
988 		.result = &result->dd_client,
989 	},
990 	opt_server = {
991 		.printname = "server",
992 		.result = &result->dd_server,
993 	},
994 	opt_gateway = {
995 		.printname = "gateway",
996 		.result = &result->dd_gateway,
997 	},
998 	*cur_inet_opt;
999 
1000 	ifp = NULL;
1001 	memset(result, 0, sizeof(*result));
1002 
1003 	/*
1004 	 * command [space] [-] [opt] [[space] [optarg]] ...
1005 	 *
1006 	 * db_command has already lexed 'command' for us.
1007 	 */
1008 	t = db_read_token_flags(DRT_WSPACE);
1009 	if (t == tWSPACE)
1010 		t = db_read_token_flags(DRT_WSPACE);
1011 
1012 	while (t != tEOL) {
1013 		if (t != tMINUS) {
1014 			db_printf("%s: Bad syntax; expected '-', got %d\n",
1015 			    cmd, t);
1016 			goto usage;
1017 		}
1018 
1019 		t = db_read_token_flags(DRT_WSPACE);
1020 		if (t != tIDENT) {
1021 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
1022 			    cmd, t);
1023 			goto usage;
1024 		}
1025 
1026 		if (strlen(db_tok_string) > 1) {
1027 			db_printf("%s: Bad syntax; expected single option "
1028 			    "flag, got '%s'\n", cmd, db_tok_string);
1029 			goto usage;
1030 		}
1031 
1032 		want_ifp = false;
1033 		cur_inet_opt = NULL;
1034 		switch ((ch = db_tok_string[0])) {
1035 		default:
1036 			DNETDEBUG("Unexpected: '%c'\n", ch);
1037 			/* FALLTHROUGH */
1038 		case 'h':
1039 			goto usage;
1040 		case 'c':
1041 			cur_inet_opt = &opt_client;
1042 			break;
1043 		case 'g':
1044 			cur_inet_opt = &opt_gateway;
1045 			break;
1046 		case 's':
1047 			cur_inet_opt = &opt_server;
1048 			break;
1049 		case 'i':
1050 			want_ifp = true;
1051 			break;
1052 		}
1053 
1054 		t = db_read_token_flags(DRT_WSPACE);
1055 		if (t != tWSPACE) {
1056 			db_printf("%s: Bad syntax; expected space after "
1057 			    "flag %c, got %d\n", cmd, ch, t);
1058 			goto usage;
1059 		}
1060 
1061 		if (want_ifp) {
1062 			t = db_read_token_flags(DRT_WSPACE);
1063 			if (t != tIDENT) {
1064 				db_printf("%s: Expected interface but got %d\n",
1065 				    cmd, t);
1066 				goto usage;
1067 			}
1068 
1069 			CURVNET_SET(vnet0);
1070 			/*
1071 			 * We *don't* take a ref here because the only current
1072 			 * consumer, db_netdump_cmd, does not need it.  It
1073 			 * (somewhat redundantly) extracts the if_name(),
1074 			 * re-lookups the ifp, and takes its own reference.
1075 			 */
1076 			ifp = ifunit(db_tok_string);
1077 			CURVNET_RESTORE();
1078 			if (ifp == NULL) {
1079 				db_printf("Could not locate interface %s\n",
1080 				    db_tok_string);
1081 				error = ENOENT;
1082 				goto cleanup;
1083 			}
1084 		} else {
1085 			MPASS(cur_inet_opt != NULL);
1086 			/* Assume IPv4 for now. */
1087 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1088 			if (error != 0)
1089 				goto cleanup;
1090 		}
1091 
1092 		/* Skip (mandatory) whitespace after option, if not EOL. */
1093 		t = db_read_token_flags(DRT_WSPACE);
1094 		if (t == tEOL)
1095 			break;
1096 		if (t != tWSPACE) {
1097 			db_printf("%s: Bad syntax; expected space after "
1098 			    "flag %c option; got %d\n", cmd, ch, t);
1099 			goto usage;
1100 		}
1101 		t = db_read_token_flags(DRT_WSPACE);
1102 	}
1103 
1104 	if (!opt_server.has_opt) {
1105 		db_printf("%s: need a destination server address\n", cmd);
1106 		goto usage;
1107 	}
1108 
1109 	result->dd_has_client = opt_client.has_opt;
1110 	result->dd_has_gateway = opt_gateway.has_opt;
1111 	result->dd_ifp = ifp;
1112 
1113 	/* We parsed the full line to tEOL already, or bailed with an error. */
1114 	return (0);
1115 
1116 usage:
1117 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1118 	    "-i <interface>]\n", cmd);
1119 	error = EINVAL;
1120 	/* FALLTHROUGH */
1121 cleanup:
1122 	db_skip_to_eol();
1123 	return (error);
1124 }
1125 #endif /* DDB */
1126