xref: /freebsd/sys/net/debugnet.c (revision 66fd12cf4896eb08ad8e7a2627537f84ead84dd3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/endian.h>
40 #include <sys/errno.h>
41 #include <sys/eventhandler.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 
48 #ifdef DDB
49 #include <ddb/ddb.h>
50 #include <ddb/db_lex.h>
51 #endif
52 
53 #include <net/ethernet.h>
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/if_dl.h>
57 #include <net/if_types.h>
58 #include <net/if_var.h>
59 #include <net/if_private.h>
60 #include <net/vnet.h>
61 #include <net/route.h>
62 #include <net/route/nhop.h>
63 
64 #include <netinet/in.h>
65 #include <netinet/in_fib.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/in_var.h>
68 #include <netinet/ip.h>
69 #include <netinet/ip_var.h>
70 #include <netinet/ip_options.h>
71 #include <netinet/udp.h>
72 #include <netinet/udp_var.h>
73 
74 #include <machine/in_cksum.h>
75 #include <machine/pcb.h>
76 
77 #include <net/debugnet.h>
78 #define	DEBUGNET_INTERNAL
79 #include <net/debugnet_int.h>
80 
81 FEATURE(debugnet, "Debugnet support");
82 
83 SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
84     "debugnet parameters");
85 
86 unsigned debugnet_debug;
87 SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
88     &debugnet_debug, 0,
89     "Debug message verbosity (0: off; 1: on; 2: verbose)");
90 
91 int debugnet_npolls = 2000;
92 SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
93     &debugnet_npolls, 0,
94     "Number of times to poll before assuming packet loss (0.5ms per poll)");
95 int debugnet_nretries = 10;
96 SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
97     &debugnet_nretries, 0,
98     "Number of retransmit attempts before giving up");
99 int debugnet_fib = RT_DEFAULT_FIB;
100 SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
101     &debugnet_fib, 0,
102     "Fib to use when sending dump");
103 
104 static bool g_debugnet_pcb_inuse;
105 static struct debugnet_pcb g_dnet_pcb;
106 
107 /*
108  * Simple accessors for opaque PCB.
109  */
110 const unsigned char *
111 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
112 {
113 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
114 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
115 	return (pcb->dp_gw_mac.octet);
116 }
117 
118 const in_addr_t *
119 debugnet_get_server_addr(const struct debugnet_pcb *pcb)
120 {
121 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
122 	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);
123 	return (&pcb->dp_server);
124 }
125 
126 const uint16_t
127 debugnet_get_server_port(const struct debugnet_pcb *pcb)
128 {
129 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
130 	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);
131 	return (pcb->dp_server_port);
132 }
133 
134 /*
135  * Start of network primitives, beginning with output primitives.
136  */
137 
138 /*
139  * Handles creation of the ethernet header, then places outgoing packets into
140  * the tx buffer for the NIC
141  *
142  * Parameters:
143  *	m	The mbuf containing the packet to be sent (will be freed by
144  *		this function or the NIC driver)
145  *	ifp	The interface to send on
146  *	dst	The destination ethernet address (source address will be looked
147  *		up using ifp)
148  *	etype	The ETHERTYPE_* value for the protocol that is being sent
149  *
150  * Returns:
151  *	int	see errno.h, 0 for success
152  */
153 int
154 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
155     u_short etype)
156 {
157 	struct ether_header *eh;
158 
159 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
160 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
161 		if_printf(ifp, "%s: interface isn't up\n", __func__);
162 		m_freem(m);
163 		return (ENETDOWN);
164 	}
165 
166 	/* Fill in the ethernet header. */
167 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
168 	if (m == NULL) {
169 		printf("%s: out of mbufs\n", __func__);
170 		return (ENOBUFS);
171 	}
172 	eh = mtod(m, struct ether_header *);
173 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
174 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
175 	eh->ether_type = htons(etype);
176 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
177 }
178 
179 /*
180  * Unreliable transmission of an mbuf chain to the debugnet server
181  * Note: can't handle fragmentation; fails if the packet is larger than
182  *	 ifp->if_mtu after adding the UDP/IP headers
183  *
184  * Parameters:
185  *	pcb	The debugnet context block
186  *	m	mbuf chain
187  *
188  * Returns:
189  *	int	see errno.h, 0 for success
190  */
191 static int
192 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
193 {
194 	struct udphdr *udp;
195 
196 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
197 
198 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
199 	if (m == NULL) {
200 		printf("%s: out of mbufs\n", __func__);
201 		return (ENOBUFS);
202 	}
203 
204 	udp = mtod(m, void *);
205 	udp->uh_ulen = htons(m->m_pkthdr.len);
206 	/* Use this src port so that the server can connect() the socket */
207 	udp->uh_sport = htons(pcb->dp_client_port);
208 	udp->uh_dport = htons(pcb->dp_server_port);
209 	/* Computed later (protocol-dependent). */
210 	udp->uh_sum = 0;
211 
212 	return (debugnet_ip_output(pcb, m));
213 }
214 
215 int
216 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
217 {
218 	struct debugnet_ack *dn_ack;
219 	struct mbuf *m;
220 
221 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
222 
223 	m = m_gethdr(M_NOWAIT, MT_DATA);
224 	if (m == NULL) {
225 		printf("%s: Out of mbufs\n", __func__);
226 		return (ENOBUFS);
227 	}
228 	m->m_len = sizeof(*dn_ack);
229 	m->m_pkthdr.len = sizeof(*dn_ack);
230 	MH_ALIGN(m, sizeof(*dn_ack));
231 	dn_ack = mtod(m, void *);
232 	dn_ack->da_seqno = seqno;
233 
234 	return (debugnet_udp_output(pcb, m));
235 }
236 
237 /*
238  * Dummy free function for debugnet clusters.
239  */
240 static void
241 debugnet_mbuf_free(struct mbuf *m __unused)
242 {
243 }
244 
245 /*
246  * Construct and reliably send a debugnet packet.  May fail from a resource
247  * shortage or extreme number of unacknowledged retransmissions.  Wait for
248  * an acknowledgement before returning.  Splits packets into chunks small
249  * enough to be sent without fragmentation (looks up the interface MTU)
250  *
251  * Parameters:
252  *	type	debugnet packet type (HERALD, FINISHED, ...)
253  *	data	data
254  *	datalen	data size (bytes)
255  *	auxdata	optional auxiliary information
256  *
257  * Returns:
258  *	int see errno.h, 0 for success
259  */
260 int
261 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
262     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
263 {
264 	struct debugnet_msg_hdr *dn_msg_hdr;
265 	struct mbuf *m, *m2;
266 	uint64_t want_acks;
267 	uint32_t i, pktlen, sent_so_far;
268 	int retries, polls, error;
269 
270 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
271 		return (ECONNRESET);
272 
273 	want_acks = 0;
274 	pcb->dp_rcvd_acks = 0;
275 	retries = 0;
276 
277 retransmit:
278 	/* Chunks can be too big to fit in packets. */
279 	for (i = sent_so_far = 0; sent_so_far < datalen ||
280 	    (i == 0 && datalen == 0); i++) {
281 		pktlen = datalen - sent_so_far;
282 
283 		/* Bound: the interface MTU (assume no IP options). */
284 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
285 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
286 
287 		/*
288 		 * Check if it is retransmitting and this has been ACKed
289 		 * already.
290 		 */
291 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
292 			sent_so_far += pktlen;
293 			continue;
294 		}
295 
296 		/*
297 		 * Get and fill a header mbuf, then chain data as an extended
298 		 * mbuf.
299 		 */
300 		m = m_gethdr(M_NOWAIT, MT_DATA);
301 		if (m == NULL) {
302 			printf("%s: Out of mbufs\n", __func__);
303 			return (ENOBUFS);
304 		}
305 		m->m_len = sizeof(struct debugnet_msg_hdr);
306 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
307 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
308 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
309 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
310 		dn_msg_hdr->mh_type = htonl(type);
311 		dn_msg_hdr->mh_len = htonl(pktlen);
312 
313 		if (auxdata != NULL) {
314 			dn_msg_hdr->mh_offset =
315 			    htobe64(auxdata->dp_offset_start + sent_so_far);
316 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
317 		} else {
318 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
319 			dn_msg_hdr->mh_aux2 = 0;
320 		}
321 
322 		if (pktlen != 0) {
323 			m2 = m_get(M_NOWAIT, MT_DATA);
324 			if (m2 == NULL) {
325 				m_freem(m);
326 				printf("%s: Out of mbufs\n", __func__);
327 				return (ENOBUFS);
328 			}
329 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
330 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
331 			    EXT_DISPOSABLE);
332 			m2->m_len = pktlen;
333 
334 			m_cat(m, m2);
335 			m->m_pkthdr.len += pktlen;
336 		}
337 		error = debugnet_udp_output(pcb, m);
338 		if (error != 0)
339 			return (error);
340 
341 		/* Note that we're waiting for this packet in the bitfield. */
342 		want_acks |= (1 << i);
343 		sent_so_far += pktlen;
344 	}
345 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
346 		printf("Warning: Sent more than %d packets (%d). "
347 		    "Acknowledgements will fail unless the size of "
348 		    "rcvd_acks/want_acks is increased.\n",
349 		    DEBUGNET_MAX_IN_FLIGHT, i);
350 
351 	/*
352 	 * Wait for acks.  A *real* window would speed things up considerably.
353 	 */
354 	polls = 0;
355 	while (pcb->dp_rcvd_acks != want_acks) {
356 		if (polls++ > debugnet_npolls) {
357 			if (retries++ > debugnet_nretries)
358 				return (ETIMEDOUT);
359 			printf(". ");
360 			goto retransmit;
361 		}
362 		debugnet_network_poll(pcb);
363 		DELAY(500);
364 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
365 			return (ECONNRESET);
366 	}
367 	pcb->dp_seqno += i;
368 	return (0);
369 }
370 
371 /*
372  * Network input primitives.
373  */
374 
375 /*
376  * Just introspect the header enough to fire off a seqno ack and validate
377  * length fits.
378  */
379 static void
380 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
381 {
382 	const struct debugnet_msg_hdr *dnh;
383 	struct mbuf *m;
384 	uint32_t hdr_type;
385 	uint32_t seqno;
386 	int error;
387 
388 	m = *mb;
389 
390 	if (m->m_pkthdr.len < sizeof(*dnh)) {
391 		DNETDEBUG("ignoring small debugnet_msg packet\n");
392 		return;
393 	}
394 
395 	/* Get ND header. */
396 	if (m->m_len < sizeof(*dnh)) {
397 		m = m_pullup(m, sizeof(*dnh));
398 		*mb = m;
399 		if (m == NULL) {
400 			DNETDEBUG("m_pullup failed\n");
401 			return;
402 		}
403 	}
404 
405 	dnh = mtod(m, const void *);
406 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
407 		DNETDEBUG("Dropping short packet.\n");
408 		return;
409 	}
410 
411 	hdr_type = ntohl(dnh->mh_type);
412 	if (hdr_type != DEBUGNET_DATA) {
413 		if (hdr_type == DEBUGNET_FINISHED) {
414 			printf("Remote shut down the connection on us!\n");
415 			pcb->dp_state = DN_STATE_REMOTE_CLOSED;
416 			if (pcb->dp_finish_handler != NULL) {
417 				pcb->dp_finish_handler();
418 			}
419 		} else {
420 			DNETDEBUG("Got unexpected debugnet message %u\n", hdr_type);
421 		}
422 		return;
423 	}
424 
425 	/*
426 	 * If the issue is transient (ENOBUFS), sender should resend.  If
427 	 * non-transient (like driver objecting to rx -> tx from the same
428 	 * thread), not much else we can do.
429 	 */
430 	seqno = dnh->mh_seqno; /* net endian */
431 	m_adj(m, sizeof(*dnh));
432 	dnh = NULL;
433 	error = pcb->dp_rx_handler(m);
434 	if (error != 0) {
435 		DNETDEBUG("RX handler was not able to accept message, error %d. "
436 		    "Skipping ack.\n", error);
437 		return;
438 	}
439 
440 	error = debugnet_ack_output(pcb, seqno);
441 	if (error != 0) {
442 		DNETDEBUG("Couldn't ACK rx packet %u; %d\n", ntohl(seqno), error);
443 	}
444 }
445 
446 static void
447 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
448 {
449 	const struct debugnet_ack *dn_ack;
450 	struct mbuf *m;
451 	uint32_t rcv_ackno;
452 
453 	m = *mb;
454 
455 	/* Get Ack. */
456 	if (m->m_len < sizeof(*dn_ack)) {
457 		m = m_pullup(m, sizeof(*dn_ack));
458 		*mb = m;
459 		if (m == NULL) {
460 			DNETDEBUG("m_pullup failed\n");
461 			return;
462 		}
463 	}
464 	dn_ack = mtod(m, const void *);
465 
466 	/* Debugnet processing. */
467 	/*
468 	 * Packet is meant for us.  Extract the ack sequence number and the
469 	 * port number if necessary.
470 	 */
471 	rcv_ackno = ntohl(dn_ack->da_seqno);
472 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
473 		pcb->dp_server_port = sport;
474 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
475 	}
476 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
477 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
478 	else if (rcv_ackno >= pcb->dp_seqno) {
479 		/* We're interested in this ack. Record it. */
480 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
481 	}
482 }
483 
484 void
485 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
486 {
487 	const struct udphdr *udp;
488 	struct mbuf *m;
489 	uint16_t sport, ulen;
490 
491 	/* UDP processing. */
492 
493 	m = *mb;
494 	if (m->m_pkthdr.len < sizeof(*udp)) {
495 		DNETDEBUG("ignoring small UDP packet\n");
496 		return;
497 	}
498 
499 	/* Get UDP headers. */
500 	if (m->m_len < sizeof(*udp)) {
501 		m = m_pullup(m, sizeof(*udp));
502 		*mb = m;
503 		if (m == NULL) {
504 			DNETDEBUG("m_pullup failed\n");
505 			return;
506 		}
507 	}
508 	udp = mtod(m, const void *);
509 
510 	/* We expect to receive UDP packets on the configured client port. */
511 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
512 		DNETDEBUG("not on the expected port.\n");
513 		return;
514 	}
515 
516 	/* Check that ulen does not exceed actual size of data. */
517 	ulen = ntohs(udp->uh_ulen);
518 	if (m->m_pkthdr.len < ulen) {
519 		DNETDEBUG("ignoring runt UDP packet\n");
520 		return;
521 	}
522 
523 	sport = ntohs(udp->uh_sport);
524 
525 	m_adj(m, sizeof(*udp));
526 	ulen -= sizeof(*udp);
527 
528 	if (ulen == sizeof(struct debugnet_ack)) {
529 		debugnet_handle_ack(pcb, mb, sport);
530 		return;
531 	}
532 
533 	if (pcb->dp_rx_handler == NULL) {
534 		if (ulen < sizeof(struct debugnet_ack))
535 			DNETDEBUG("ignoring small ACK packet\n");
536 		else
537 			DNETDEBUG("ignoring unexpected non-ACK packet on "
538 			    "half-duplex connection.\n");
539 		return;
540 	}
541 
542 	debugnet_handle_rx_msg(pcb, mb);
543 }
544 
545 /*
546  * Handler for incoming packets directly from the network adapter
547  * Identifies the packet type (IP or ARP) and passes it along to one of the
548  * helper functions debugnet_handle_ip or debugnet_handle_arp.
549  *
550  * It needs to partially replicate the behaviour of ether_input() and
551  * ether_demux().
552  *
553  * Parameters:
554  *	ifp	the interface the packet came from
555  *	m	an mbuf containing the packet received
556  */
557 static void
558 debugnet_input_one(struct ifnet *ifp, struct mbuf *m)
559 {
560 	struct ifreq ifr;
561 	struct ether_header *eh;
562 	u_short etype;
563 
564 	/* Ethernet processing. */
565 	if ((m->m_flags & M_PKTHDR) == 0) {
566 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
567 		goto done;
568 	}
569 	if (m->m_len < ETHER_HDR_LEN) {
570 		DNETDEBUG_IF(ifp,
571 	    "discard frame without leading eth header (len %u pktlen %u)\n",
572 		    m->m_len, m->m_pkthdr.len);
573 		goto done;
574 	}
575 	if ((m->m_flags & M_HASFCS) != 0) {
576 		m_adj(m, -ETHER_CRC_LEN);
577 		m->m_flags &= ~M_HASFCS;
578 	}
579 	eh = mtod(m, struct ether_header *);
580 	etype = ntohs(eh->ether_type);
581 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
582 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
583 		goto done;
584 	}
585 	if (if_gethwaddr(ifp, &ifr) != 0) {
586 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
587 		goto done;
588 	}
589 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
590 	    ETHER_ADDR_LEN) != 0 &&
591 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
592 		DNETDEBUG_IF(ifp,
593 		    "discard frame with incorrect destination addr\n");
594 		goto done;
595 	}
596 
597 	MPASS(g_debugnet_pcb_inuse);
598 
599 	/* Done ethernet processing. Strip off the ethernet header. */
600 	m_adj(m, ETHER_HDR_LEN);
601 	switch (etype) {
602 	case ETHERTYPE_ARP:
603 		debugnet_handle_arp(&g_dnet_pcb, &m);
604 		break;
605 	case ETHERTYPE_IP:
606 		debugnet_handle_ip(&g_dnet_pcb, &m);
607 		break;
608 	default:
609 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
610 		break;
611 	}
612 done:
613 	if (m != NULL)
614 		m_freem(m);
615 }
616 
617 static void
618 debugnet_input(struct ifnet *ifp, struct mbuf *m)
619 {
620 	struct mbuf *n;
621 
622 	do {
623 		n = m->m_nextpkt;
624 		m->m_nextpkt = NULL;
625 		debugnet_input_one(ifp, m);
626 		m = n;
627 	} while (m != NULL);
628 }
629 
630 /*
631  * Network polling primitive.
632  *
633  * Instead of assuming that most of the network stack is sane, we just poll the
634  * driver directly for packets.
635  */
636 void
637 debugnet_network_poll(struct debugnet_pcb *pcb)
638 {
639 	struct ifnet *ifp;
640 
641 	ifp = pcb->dp_ifp;
642 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
643 }
644 
645 /*
646  * Start of consumer API surface.
647  */
648 void
649 debugnet_free(struct debugnet_pcb *pcb)
650 {
651 	struct ifnet *ifp;
652 
653 	MPASS(pcb == &g_dnet_pcb);
654 	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);
655 
656 	ifp = pcb->dp_ifp;
657 	if (ifp != NULL) {
658 		if (pcb->dp_drv_input != NULL)
659 			ifp->if_input = pcb->dp_drv_input;
660 		if (pcb->dp_event_started)
661 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
662 	}
663 	debugnet_mbuf_finish();
664 
665 	g_debugnet_pcb_inuse = false;
666 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
667 }
668 
669 int
670 debugnet_connect(const struct debugnet_conn_params *dcp,
671     struct debugnet_pcb **pcb_out)
672 {
673 	struct debugnet_proto_aux herald_auxdata;
674 	struct debugnet_pcb *pcb;
675 	struct ifnet *ifp;
676 	int error;
677 
678 	if (g_debugnet_pcb_inuse) {
679 		printf("%s: Only one connection at a time.\n", __func__);
680 		return (EBUSY);
681 	}
682 
683 	pcb = &g_dnet_pcb;
684 	*pcb = (struct debugnet_pcb) {
685 		.dp_state = DN_STATE_INIT,
686 		.dp_client = dcp->dc_client,
687 		.dp_server = dcp->dc_server,
688 		.dp_gateway = dcp->dc_gateway,
689 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
690 		.dp_client_port = dcp->dc_client_port,
691 		.dp_seqno = 1,
692 		.dp_ifp = dcp->dc_ifp,
693 		.dp_rx_handler = dcp->dc_rx_handler,
694 		.dp_drv_input = NULL,
695 	};
696 
697 	/* Switch to the debugnet mbuf zones. */
698 	debugnet_mbuf_start();
699 
700 	/* At least one needed parameter is missing; infer it. */
701 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
702 	    pcb->dp_ifp == NULL) {
703 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
704 		struct ifnet *rt_ifp;
705 		struct nhop_object *nh;
706 
707 		memset(&dest_sin, 0, sizeof(dest_sin));
708 		dest_sin = (struct sockaddr_in) {
709 			.sin_len = sizeof(dest_sin),
710 			.sin_family = AF_INET,
711 			.sin_addr.s_addr = pcb->dp_server,
712 		};
713 
714 		CURVNET_SET(vnet0);
715 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
716 		    NHR_NONE);
717 		CURVNET_RESTORE();
718 
719 		if (nh == NULL) {
720 			printf("%s: Could not get route for that server.\n",
721 			    __func__);
722 			error = ENOENT;
723 			goto cleanup;
724 		}
725 
726 		/* TODO support AF_INET6 */
727 		if (nh->gw_sa.sa_family == AF_INET)
728 			gw_sin = &nh->gw4_sa;
729 		else {
730 			if (nh->gw_sa.sa_family == AF_LINK)
731 				DNETDEBUG("Destination address is on link.\n");
732 			gw_sin = NULL;
733 		}
734 
735 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
736 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
737 
738 		rt_ifp = nh->nh_ifp;
739 
740 		if (pcb->dp_client == INADDR_ANY)
741 			pcb->dp_client = local_sin->sin_addr.s_addr;
742 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
743 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
744 		if (pcb->dp_ifp == NULL)
745 			pcb->dp_ifp = rt_ifp;
746 	}
747 
748 	ifp = pcb->dp_ifp;
749 
750 	if (debugnet_debug > 0) {
751 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
752 		    gwbuf[INET_ADDRSTRLEN];
753 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
754 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
755 		if (pcb->dp_gateway != INADDR_ANY)
756 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
757 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
758 		    serbuf, pcb->dp_server_port,
759 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
760 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
761 		    clibuf, pcb->dp_client_port, if_name(ifp));
762 	}
763 
764 	/* Validate iface is online and supported. */
765 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
766 		printf("%s: interface '%s' does not support debugnet\n",
767 		    __func__, if_name(ifp));
768 		error = ENODEV;
769 		goto cleanup;
770 	}
771 	if ((if_getflags(ifp) & IFF_UP) == 0) {
772 		printf("%s: interface '%s' link is down\n", __func__,
773 		    if_name(ifp));
774 		error = ENXIO;
775 		goto cleanup;
776 	}
777 
778 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
779 	pcb->dp_event_started = true;
780 
781 	/*
782 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
783 	 * while the debugnet ifp's if_input is overridden with
784 	 * debugnet_input().
785 	 */
786 	g_debugnet_pcb_inuse = true;
787 
788 	/* Make the card use *our* receive callback. */
789 	pcb->dp_drv_input = ifp->if_input;
790 	ifp->if_input = debugnet_input;
791 
792 	printf("%s: searching for %s MAC...\n", __func__,
793 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
794 
795 	error = debugnet_arp_gw(pcb);
796 	if (error != 0) {
797 		printf("%s: failed to locate MAC address\n", __func__);
798 		goto cleanup;
799 	}
800 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
801 
802 	herald_auxdata = (struct debugnet_proto_aux) {
803 		.dp_offset_start = dcp->dc_herald_offset,
804 		.dp_aux2 = dcp->dc_herald_aux2,
805 	};
806 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
807 	    dcp->dc_herald_datalen, &herald_auxdata);
808 	if (error != 0) {
809 		printf("%s: failed to herald debugnet server\n", __func__);
810 		goto cleanup;
811 	}
812 
813 	*pcb_out = pcb;
814 	return (0);
815 
816 cleanup:
817 	debugnet_free(pcb);
818 	return (error);
819 }
820 
821 /*
822  * Pre-allocated dump-time mbuf tracking.
823  *
824  * We just track the high water mark we've ever seen and allocate appropriately
825  * for that iface/mtu combo.
826  */
827 static struct {
828 	int nmbuf;
829 	int ncl;
830 	int clsize;
831 } dn_hwm;
832 static struct mtx dn_hwm_lk;
833 MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
834 
835 static void
836 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
837 {
838 	bool any;
839 
840 	any = false;
841 	mtx_lock(&dn_hwm_lk);
842 
843 	if (nmbuf > dn_hwm.nmbuf) {
844 		any = true;
845 		dn_hwm.nmbuf = nmbuf;
846 	} else
847 		nmbuf = dn_hwm.nmbuf;
848 
849 	if (ncl > dn_hwm.ncl) {
850 		any = true;
851 		dn_hwm.ncl = ncl;
852 	} else
853 		ncl = dn_hwm.ncl;
854 
855 	if (clsize > dn_hwm.clsize) {
856 		any = true;
857 		dn_hwm.clsize = clsize;
858 	} else
859 		clsize = dn_hwm.clsize;
860 
861 	mtx_unlock(&dn_hwm_lk);
862 
863 	if (any)
864 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
865 }
866 
867 void
868 debugnet_any_ifnet_update(struct ifnet *ifp)
869 {
870 	int clsize, nmbuf, ncl, nrxr;
871 
872 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
873 		return;
874 
875 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
876 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
877 
878 	/*
879 	 * We need two headers per message on the transmit side. Multiply by
880 	 * four to give us some breathing room.
881 	 */
882 	nmbuf = ncl * (4 + nrxr);
883 	ncl *= nrxr;
884 
885 	/*
886 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
887 	 * dn_init method is available.
888 	 */
889 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
890 #ifndef INVARIANTS
891 		if (bootverbose)
892 #endif
893 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
894 		    __func__, if_name(ifp), ifp);
895 		return;
896 	}
897 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
898 }
899 
900 /*
901  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
902  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
903  *
904  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
905  * because the driver is still in attach.  Since we cannot use down interfaces,
906  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
907  * with vtnet and dhcpclient that event just never occurs.
908  *
909  * So that's how I've landed on the lower level ifnet_link_event.
910  */
911 
912 static void
913 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
914 {
915 	if (link_state == LINK_STATE_UP)
916 		debugnet_any_ifnet_update(ifp);
917 }
918 
919 static eventhandler_tag dn_attach_cookie;
920 static void
921 dn_evh_init(void *ctx __unused)
922 {
923 	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
924 	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
925 }
926 SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
927 
928 /*
929  * DDB parsing helpers for debugnet(4) consumers.
930  */
931 #ifdef DDB
932 struct my_inet_opt {
933 	bool has_opt;
934 	const char *printname;
935 	in_addr_t *result;
936 };
937 
938 static int
939 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
940 {
941 	in_addr_t tmp;
942 	unsigned octet;
943 	int t;
944 
945 	tmp = 0;
946 	for (octet = 0; octet < 4; octet++) {
947 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
948 		if (t != tNUMBER) {
949 			db_printf("%s:%s: octet %u expected number; found %d\n",
950 			    __func__, opt->printname, octet, t);
951 			return (EINVAL);
952 		}
953 		/*
954 		 * db_lex lexes '-' distinctly from the number itself, but
955 		 * let's document that invariant.
956 		 */
957 		MPASS(db_tok_number >= 0);
958 
959 		if (db_tok_number > UINT8_MAX) {
960 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
961 			    opt->printname, octet, (intmax_t)db_tok_number);
962 			return (EDOM);
963 		}
964 
965 		/* Constructed host-endian and converted to network later. */
966 		tmp = (tmp << 8) | db_tok_number;
967 
968 		if (octet < 3) {
969 			t = db_read_token_flags(DRT_WSPACE);
970 			if (t != tDOT) {
971 				db_printf("%s:%s: octet %u expected '.'; found"
972 				    " %d\n", __func__, opt->printname, octet,
973 				    t);
974 				return (EINVAL);
975 			}
976 		}
977 	}
978 
979 	*opt->result = htonl(tmp);
980 	opt->has_opt = true;
981 	return (0);
982 }
983 
984 int
985 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
986 {
987 	struct ifnet *ifp;
988 	int t, error;
989 	bool want_ifp;
990 	char ch;
991 
992 	struct my_inet_opt opt_client = {
993 		.printname = "client",
994 		.result = &result->dd_client,
995 	},
996 	opt_server = {
997 		.printname = "server",
998 		.result = &result->dd_server,
999 	},
1000 	opt_gateway = {
1001 		.printname = "gateway",
1002 		.result = &result->dd_gateway,
1003 	},
1004 	*cur_inet_opt;
1005 
1006 	ifp = NULL;
1007 	memset(result, 0, sizeof(*result));
1008 
1009 	/*
1010 	 * command [space] [-] [opt] [[space] [optarg]] ...
1011 	 *
1012 	 * db_command has already lexed 'command' for us.
1013 	 */
1014 	t = db_read_token_flags(DRT_WSPACE);
1015 	if (t == tWSPACE)
1016 		t = db_read_token_flags(DRT_WSPACE);
1017 
1018 	while (t != tEOL) {
1019 		if (t != tMINUS) {
1020 			db_printf("%s: Bad syntax; expected '-', got %d\n",
1021 			    cmd, t);
1022 			goto usage;
1023 		}
1024 
1025 		t = db_read_token_flags(DRT_WSPACE);
1026 		if (t != tIDENT) {
1027 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
1028 			    cmd, t);
1029 			goto usage;
1030 		}
1031 
1032 		if (strlen(db_tok_string) > 1) {
1033 			db_printf("%s: Bad syntax; expected single option "
1034 			    "flag, got '%s'\n", cmd, db_tok_string);
1035 			goto usage;
1036 		}
1037 
1038 		want_ifp = false;
1039 		cur_inet_opt = NULL;
1040 		switch ((ch = db_tok_string[0])) {
1041 		default:
1042 			DNETDEBUG("Unexpected: '%c'\n", ch);
1043 			/* FALLTHROUGH */
1044 		case 'h':
1045 			goto usage;
1046 		case 'c':
1047 			cur_inet_opt = &opt_client;
1048 			break;
1049 		case 'g':
1050 			cur_inet_opt = &opt_gateway;
1051 			break;
1052 		case 's':
1053 			cur_inet_opt = &opt_server;
1054 			break;
1055 		case 'i':
1056 			want_ifp = true;
1057 			break;
1058 		}
1059 
1060 		t = db_read_token_flags(DRT_WSPACE);
1061 		if (t != tWSPACE) {
1062 			db_printf("%s: Bad syntax; expected space after "
1063 			    "flag %c, got %d\n", cmd, ch, t);
1064 			goto usage;
1065 		}
1066 
1067 		if (want_ifp) {
1068 			t = db_read_token_flags(DRT_WSPACE);
1069 			if (t != tIDENT) {
1070 				db_printf("%s: Expected interface but got %d\n",
1071 				    cmd, t);
1072 				goto usage;
1073 			}
1074 
1075 			CURVNET_SET(vnet0);
1076 			/*
1077 			 * We *don't* take a ref here because the only current
1078 			 * consumer, db_netdump_cmd, does not need it.  It
1079 			 * (somewhat redundantly) extracts the if_name(),
1080 			 * re-lookups the ifp, and takes its own reference.
1081 			 */
1082 			ifp = ifunit(db_tok_string);
1083 			CURVNET_RESTORE();
1084 			if (ifp == NULL) {
1085 				db_printf("Could not locate interface %s\n",
1086 				    db_tok_string);
1087 				error = ENOENT;
1088 				goto cleanup;
1089 			}
1090 		} else {
1091 			MPASS(cur_inet_opt != NULL);
1092 			/* Assume IPv4 for now. */
1093 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1094 			if (error != 0)
1095 				goto cleanup;
1096 		}
1097 
1098 		/* Skip (mandatory) whitespace after option, if not EOL. */
1099 		t = db_read_token_flags(DRT_WSPACE);
1100 		if (t == tEOL)
1101 			break;
1102 		if (t != tWSPACE) {
1103 			db_printf("%s: Bad syntax; expected space after "
1104 			    "flag %c option; got %d\n", cmd, ch, t);
1105 			goto usage;
1106 		}
1107 		t = db_read_token_flags(DRT_WSPACE);
1108 	}
1109 
1110 	if (!opt_server.has_opt) {
1111 		db_printf("%s: need a destination server address\n", cmd);
1112 		goto usage;
1113 	}
1114 
1115 	result->dd_has_client = opt_client.has_opt;
1116 	result->dd_has_gateway = opt_gateway.has_opt;
1117 	result->dd_ifp = ifp;
1118 
1119 	/* We parsed the full line to tEOL already, or bailed with an error. */
1120 	return (0);
1121 
1122 usage:
1123 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1124 	    "-i <interface>]\n", cmd);
1125 	error = EINVAL;
1126 	/* FALLTHROUGH */
1127 cleanup:
1128 	db_skip_to_eol();
1129 	return (error);
1130 }
1131 #endif /* DDB */
1132