xref: /freebsd/sys/net/debugnet.c (revision 21b492ed51aa6ff8008a8aa83333b1de30288a15)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/endian.h>
40 #include <sys/errno.h>
41 #include <sys/eventhandler.h>
42 #include <sys/socket.h>
43 #include <sys/sysctl.h>
44 
45 #ifdef DDB
46 #include <ddb/ddb.h>
47 #include <ddb/db_lex.h>
48 #endif
49 
50 #include <net/ethernet.h>
51 #include <net/if.h>
52 #include <net/if_arp.h>
53 #include <net/if_dl.h>
54 #include <net/if_types.h>
55 #include <net/if_var.h>
56 #include <net/route.h>
57 #include <net/route/nhop.h>
58 
59 #include <netinet/in.h>
60 #include <netinet/in_fib.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in_var.h>
63 #include <netinet/ip.h>
64 #include <netinet/ip_var.h>
65 #include <netinet/ip_options.h>
66 #include <netinet/udp.h>
67 #include <netinet/udp_var.h>
68 
69 #include <machine/in_cksum.h>
70 #include <machine/pcb.h>
71 
72 #include <net/debugnet.h>
73 #define	DEBUGNET_INTERNAL
74 #include <net/debugnet_int.h>
75 
76 FEATURE(debugnet, "Debugnet support");
77 
78 SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
79     "debugnet parameters");
80 
81 unsigned debugnet_debug;
82 SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
83     &debugnet_debug, 0,
84     "Debug message verbosity (0: off; 1: on; 2: verbose)");
85 
86 int debugnet_npolls = 2000;
87 SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
88     &debugnet_npolls, 0,
89     "Number of times to poll before assuming packet loss (0.5ms per poll)");
90 int debugnet_nretries = 10;
91 SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
92     &debugnet_nretries, 0,
93     "Number of retransmit attempts before giving up");
94 int debugnet_fib = RT_DEFAULT_FIB;
95 SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
96     &debugnet_fib, 0,
97     "Fib to use when sending dump");
98 
99 static bool g_debugnet_pcb_inuse;
100 static struct debugnet_pcb g_dnet_pcb;
101 
102 /*
103  * Simple accessors for opaque PCB.
104  */
105 const unsigned char *
106 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
107 {
108 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
109 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
110 	return (pcb->dp_gw_mac.octet);
111 }
112 
113 /*
114  * Start of network primitives, beginning with output primitives.
115  */
116 
117 /*
118  * Handles creation of the ethernet header, then places outgoing packets into
119  * the tx buffer for the NIC
120  *
121  * Parameters:
122  *	m	The mbuf containing the packet to be sent (will be freed by
123  *		this function or the NIC driver)
124  *	ifp	The interface to send on
125  *	dst	The destination ethernet address (source address will be looked
126  *		up using ifp)
127  *	etype	The ETHERTYPE_* value for the protocol that is being sent
128  *
129  * Returns:
130  *	int	see errno.h, 0 for success
131  */
132 int
133 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
134     u_short etype)
135 {
136 	struct ether_header *eh;
137 
138 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
139 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
140 		if_printf(ifp, "%s: interface isn't up\n", __func__);
141 		m_freem(m);
142 		return (ENETDOWN);
143 	}
144 
145 	/* Fill in the ethernet header. */
146 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
147 	if (m == NULL) {
148 		printf("%s: out of mbufs\n", __func__);
149 		return (ENOBUFS);
150 	}
151 	eh = mtod(m, struct ether_header *);
152 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
153 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
154 	eh->ether_type = htons(etype);
155 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
156 }
157 
158 /*
159  * Unreliable transmission of an mbuf chain to the debugnet server
160  * Note: can't handle fragmentation; fails if the packet is larger than
161  *	 ifp->if_mtu after adding the UDP/IP headers
162  *
163  * Parameters:
164  *	pcb	The debugnet context block
165  *	m	mbuf chain
166  *
167  * Returns:
168  *	int	see errno.h, 0 for success
169  */
170 static int
171 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
172 {
173 	struct udphdr *udp;
174 
175 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
176 
177 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
178 	if (m == NULL) {
179 		printf("%s: out of mbufs\n", __func__);
180 		return (ENOBUFS);
181 	}
182 
183 	udp = mtod(m, void *);
184 	udp->uh_ulen = htons(m->m_pkthdr.len);
185 	/* Use this src port so that the server can connect() the socket */
186 	udp->uh_sport = htons(pcb->dp_client_port);
187 	udp->uh_dport = htons(pcb->dp_server_port);
188 	/* Computed later (protocol-dependent). */
189 	udp->uh_sum = 0;
190 
191 	return (debugnet_ip_output(pcb, m));
192 }
193 
194 int
195 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
196 {
197 	struct debugnet_ack *dn_ack;
198 	struct mbuf *m;
199 
200 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
201 
202 	m = m_gethdr(M_NOWAIT, MT_DATA);
203 	if (m == NULL) {
204 		printf("%s: Out of mbufs\n", __func__);
205 		return (ENOBUFS);
206 	}
207 	m->m_len = sizeof(*dn_ack);
208 	m->m_pkthdr.len = sizeof(*dn_ack);
209 	MH_ALIGN(m, sizeof(*dn_ack));
210 	dn_ack = mtod(m, void *);
211 	dn_ack->da_seqno = seqno;
212 
213 	return (debugnet_udp_output(pcb, m));
214 }
215 
216 /*
217  * Dummy free function for debugnet clusters.
218  */
219 static void
220 debugnet_mbuf_free(struct mbuf *m __unused)
221 {
222 }
223 
224 /*
225  * Construct and reliably send a debugnet packet.  May fail from a resource
226  * shortage or extreme number of unacknowledged retransmissions.  Wait for
227  * an acknowledgement before returning.  Splits packets into chunks small
228  * enough to be sent without fragmentation (looks up the interface MTU)
229  *
230  * Parameters:
231  *	type	debugnet packet type (HERALD, FINISHED, ...)
232  *	data	data
233  *	datalen	data size (bytes)
234  *	auxdata	optional auxiliary information
235  *
236  * Returns:
237  *	int see errno.h, 0 for success
238  */
239 int
240 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
241     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
242 {
243 	struct debugnet_msg_hdr *dn_msg_hdr;
244 	struct mbuf *m, *m2;
245 	uint64_t want_acks;
246 	uint32_t i, pktlen, sent_so_far;
247 	int retries, polls, error;
248 
249 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
250 		return (ECONNRESET);
251 
252 	want_acks = 0;
253 	pcb->dp_rcvd_acks = 0;
254 	retries = 0;
255 
256 retransmit:
257 	/* Chunks can be too big to fit in packets. */
258 	for (i = sent_so_far = 0; sent_so_far < datalen ||
259 	    (i == 0 && datalen == 0); i++) {
260 		pktlen = datalen - sent_so_far;
261 
262 		/* Bound: the interface MTU (assume no IP options). */
263 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
264 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
265 
266 		/*
267 		 * Check if it is retransmitting and this has been ACKed
268 		 * already.
269 		 */
270 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
271 			sent_so_far += pktlen;
272 			continue;
273 		}
274 
275 		/*
276 		 * Get and fill a header mbuf, then chain data as an extended
277 		 * mbuf.
278 		 */
279 		m = m_gethdr(M_NOWAIT, MT_DATA);
280 		if (m == NULL) {
281 			printf("%s: Out of mbufs\n", __func__);
282 			return (ENOBUFS);
283 		}
284 		m->m_len = sizeof(struct debugnet_msg_hdr);
285 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
286 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
287 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
288 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
289 		dn_msg_hdr->mh_type = htonl(type);
290 		dn_msg_hdr->mh_len = htonl(pktlen);
291 
292 		if (auxdata != NULL) {
293 			dn_msg_hdr->mh_offset =
294 			    htobe64(auxdata->dp_offset_start + sent_so_far);
295 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
296 		} else {
297 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
298 			dn_msg_hdr->mh_aux2 = 0;
299 		}
300 
301 		if (pktlen != 0) {
302 			m2 = m_get(M_NOWAIT, MT_DATA);
303 			if (m2 == NULL) {
304 				m_freem(m);
305 				printf("%s: Out of mbufs\n", __func__);
306 				return (ENOBUFS);
307 			}
308 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
309 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
310 			    EXT_DISPOSABLE);
311 			m2->m_len = pktlen;
312 
313 			m_cat(m, m2);
314 			m->m_pkthdr.len += pktlen;
315 		}
316 		error = debugnet_udp_output(pcb, m);
317 		if (error != 0)
318 			return (error);
319 
320 		/* Note that we're waiting for this packet in the bitfield. */
321 		want_acks |= (1 << i);
322 		sent_so_far += pktlen;
323 	}
324 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
325 		printf("Warning: Sent more than %d packets (%d). "
326 		    "Acknowledgements will fail unless the size of "
327 		    "rcvd_acks/want_acks is increased.\n",
328 		    DEBUGNET_MAX_IN_FLIGHT, i);
329 
330 	/*
331 	 * Wait for acks.  A *real* window would speed things up considerably.
332 	 */
333 	polls = 0;
334 	while (pcb->dp_rcvd_acks != want_acks) {
335 		if (polls++ > debugnet_npolls) {
336 			if (retries++ > debugnet_nretries)
337 				return (ETIMEDOUT);
338 			printf(". ");
339 			goto retransmit;
340 		}
341 		debugnet_network_poll(pcb);
342 		DELAY(500);
343 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
344 			return (ECONNRESET);
345 	}
346 	pcb->dp_seqno += i;
347 	return (0);
348 }
349 
350 /*
351  * Network input primitives.
352  */
353 
354 /*
355  * Just introspect the header enough to fire off a seqno ack and validate
356  * length fits.
357  */
358 static void
359 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
360 {
361 	const struct debugnet_msg_hdr *dnh;
362 	struct mbuf *m;
363 	int error;
364 
365 	m = *mb;
366 
367 	if (m->m_pkthdr.len < sizeof(*dnh)) {
368 		DNETDEBUG("ignoring small debugnet_msg packet\n");
369 		return;
370 	}
371 
372 	/* Get ND header. */
373 	if (m->m_len < sizeof(*dnh)) {
374 		m = m_pullup(m, sizeof(*dnh));
375 		*mb = m;
376 		if (m == NULL) {
377 			DNETDEBUG("m_pullup failed\n");
378 			return;
379 		}
380 	}
381 	dnh = mtod(m, const void *);
382 
383 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
384 		DNETDEBUG("Dropping short packet.\n");
385 		return;
386 	}
387 
388 	/*
389 	 * If the issue is transient (ENOBUFS), sender should resend.  If
390 	 * non-transient (like driver objecting to rx -> tx from the same
391 	 * thread), not much else we can do.
392 	 */
393 	error = debugnet_ack_output(pcb, dnh->mh_seqno);
394 	if (error != 0)
395 		return;
396 
397 	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
398 		printf("Remote shut down the connection on us!\n");
399 		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
400 
401 		/*
402 		 * Continue through to the user handler so they are signalled
403 		 * not to wait for further rx.
404 		 */
405 	}
406 
407 	pcb->dp_rx_handler(pcb, mb);
408 }
409 
410 static void
411 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
412 {
413 	const struct debugnet_ack *dn_ack;
414 	struct mbuf *m;
415 	uint32_t rcv_ackno;
416 
417 	m = *mb;
418 
419 	/* Get Ack. */
420 	if (m->m_len < sizeof(*dn_ack)) {
421 		m = m_pullup(m, sizeof(*dn_ack));
422 		*mb = m;
423 		if (m == NULL) {
424 			DNETDEBUG("m_pullup failed\n");
425 			return;
426 		}
427 	}
428 	dn_ack = mtod(m, const void *);
429 
430 	/* Debugnet processing. */
431 	/*
432 	 * Packet is meant for us.  Extract the ack sequence number and the
433 	 * port number if necessary.
434 	 */
435 	rcv_ackno = ntohl(dn_ack->da_seqno);
436 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
437 		pcb->dp_server_port = sport;
438 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
439 	}
440 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
441 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
442 	else if (rcv_ackno >= pcb->dp_seqno) {
443 		/* We're interested in this ack. Record it. */
444 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
445 	}
446 }
447 
448 void
449 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
450 {
451 	const struct udphdr *udp;
452 	struct mbuf *m;
453 	uint16_t sport, ulen;
454 
455 	/* UDP processing. */
456 
457 	m = *mb;
458 	if (m->m_pkthdr.len < sizeof(*udp)) {
459 		DNETDEBUG("ignoring small UDP packet\n");
460 		return;
461 	}
462 
463 	/* Get UDP headers. */
464 	if (m->m_len < sizeof(*udp)) {
465 		m = m_pullup(m, sizeof(*udp));
466 		*mb = m;
467 		if (m == NULL) {
468 			DNETDEBUG("m_pullup failed\n");
469 			return;
470 		}
471 	}
472 	udp = mtod(m, const void *);
473 
474 	/* We expect to receive UDP packets on the configured client port. */
475 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
476 		DNETDEBUG("not on the expected port.\n");
477 		return;
478 	}
479 
480 	/* Check that ulen does not exceed actual size of data. */
481 	ulen = ntohs(udp->uh_ulen);
482 	if (m->m_pkthdr.len < ulen) {
483 		DNETDEBUG("ignoring runt UDP packet\n");
484 		return;
485 	}
486 
487 	sport = ntohs(udp->uh_sport);
488 
489 	m_adj(m, sizeof(*udp));
490 	ulen -= sizeof(*udp);
491 
492 	if (ulen == sizeof(struct debugnet_ack)) {
493 		debugnet_handle_ack(pcb, mb, sport);
494 		return;
495 	}
496 
497 	if (pcb->dp_rx_handler == NULL) {
498 		if (ulen < sizeof(struct debugnet_ack))
499 			DNETDEBUG("ignoring small ACK packet\n");
500 		else
501 			DNETDEBUG("ignoring unexpected non-ACK packet on "
502 			    "half-duplex connection.\n");
503 		return;
504 	}
505 
506 	debugnet_handle_rx_msg(pcb, mb);
507 }
508 
509 /*
510  * Handler for incoming packets directly from the network adapter
511  * Identifies the packet type (IP or ARP) and passes it along to one of the
512  * helper functions debugnet_handle_ip or debugnet_handle_arp.
513  *
514  * It needs to partially replicate the behaviour of ether_input() and
515  * ether_demux().
516  *
517  * Parameters:
518  *	ifp	the interface the packet came from
519  *	m	an mbuf containing the packet received
520  */
521 static void
522 debugnet_pkt_in(struct ifnet *ifp, struct mbuf *m)
523 {
524 	struct ifreq ifr;
525 	struct ether_header *eh;
526 	u_short etype;
527 
528 	/* Ethernet processing. */
529 	if ((m->m_flags & M_PKTHDR) == 0) {
530 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
531 		goto done;
532 	}
533 	if (m->m_len < ETHER_HDR_LEN) {
534 		DNETDEBUG_IF(ifp,
535 	    "discard frame without leading eth header (len %u pktlen %u)\n",
536 		    m->m_len, m->m_pkthdr.len);
537 		goto done;
538 	}
539 	if ((m->m_flags & M_HASFCS) != 0) {
540 		m_adj(m, -ETHER_CRC_LEN);
541 		m->m_flags &= ~M_HASFCS;
542 	}
543 	eh = mtod(m, struct ether_header *);
544 	etype = ntohs(eh->ether_type);
545 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
546 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
547 		goto done;
548 	}
549 	if (if_gethwaddr(ifp, &ifr) != 0) {
550 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
551 		goto done;
552 	}
553 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
554 	    ETHER_ADDR_LEN) != 0 &&
555 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
556 		DNETDEBUG_IF(ifp,
557 		    "discard frame with incorrect destination addr\n");
558 		goto done;
559 	}
560 
561 	MPASS(g_debugnet_pcb_inuse);
562 
563 	/* Done ethernet processing. Strip off the ethernet header. */
564 	m_adj(m, ETHER_HDR_LEN);
565 	switch (etype) {
566 	case ETHERTYPE_ARP:
567 		debugnet_handle_arp(&g_dnet_pcb, &m);
568 		break;
569 	case ETHERTYPE_IP:
570 		debugnet_handle_ip(&g_dnet_pcb, &m);
571 		break;
572 	default:
573 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
574 		break;
575 	}
576 done:
577 	if (m != NULL)
578 		m_freem(m);
579 }
580 
581 /*
582  * Network polling primitive.
583  *
584  * Instead of assuming that most of the network stack is sane, we just poll the
585  * driver directly for packets.
586  */
587 void
588 debugnet_network_poll(struct debugnet_pcb *pcb)
589 {
590 	struct ifnet *ifp;
591 
592 	ifp = pcb->dp_ifp;
593 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
594 }
595 
596 /*
597  * Start of consumer API surface.
598  */
599 void
600 debugnet_free(struct debugnet_pcb *pcb)
601 {
602 	struct ifnet *ifp;
603 
604 	MPASS(g_debugnet_pcb_inuse);
605 	MPASS(pcb == &g_dnet_pcb);
606 
607 	ifp = pcb->dp_ifp;
608 	if (ifp != NULL) {
609 		if (pcb->dp_drv_input != NULL)
610 			ifp->if_input = pcb->dp_drv_input;
611 		if (pcb->dp_event_started)
612 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
613 	}
614 	debugnet_mbuf_finish();
615 
616 	g_debugnet_pcb_inuse = false;
617 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
618 }
619 
620 int
621 debugnet_connect(const struct debugnet_conn_params *dcp,
622     struct debugnet_pcb **pcb_out)
623 {
624 	struct debugnet_proto_aux herald_auxdata;
625 	struct debugnet_pcb *pcb;
626 	struct ifnet *ifp;
627 	int error;
628 
629 	if (g_debugnet_pcb_inuse) {
630 		printf("%s: Only one connection at a time.\n", __func__);
631 		return (EBUSY);
632 	}
633 
634 	pcb = &g_dnet_pcb;
635 	*pcb = (struct debugnet_pcb) {
636 		.dp_state = DN_STATE_INIT,
637 		.dp_client = dcp->dc_client,
638 		.dp_server = dcp->dc_server,
639 		.dp_gateway = dcp->dc_gateway,
640 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
641 		.dp_client_port = dcp->dc_client_port,
642 		.dp_seqno = 1,
643 		.dp_ifp = dcp->dc_ifp,
644 		.dp_rx_handler = dcp->dc_rx_handler,
645 	};
646 
647 	/* Switch to the debugnet mbuf zones. */
648 	debugnet_mbuf_start();
649 
650 	/* At least one needed parameter is missing; infer it. */
651 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
652 	    pcb->dp_ifp == NULL) {
653 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
654 		struct ifnet *rt_ifp;
655 		struct nhop_object *nh;
656 
657 		memset(&dest_sin, 0, sizeof(dest_sin));
658 		dest_sin = (struct sockaddr_in) {
659 			.sin_len = sizeof(dest_sin),
660 			.sin_family = AF_INET,
661 			.sin_addr.s_addr = pcb->dp_server,
662 		};
663 
664 		CURVNET_SET(vnet0);
665 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
666 		    NHR_NONE);
667 		CURVNET_RESTORE();
668 
669 		if (nh == NULL) {
670 			printf("%s: Could not get route for that server.\n",
671 			    __func__);
672 			error = ENOENT;
673 			goto cleanup;
674 		}
675 
676 		/* TODO support AF_INET6 */
677 		if (nh->gw_sa.sa_family == AF_INET)
678 			gw_sin = &nh->gw4_sa;
679 		else {
680 			if (nh->gw_sa.sa_family == AF_LINK)
681 				DNETDEBUG("Destination address is on link.\n");
682 			gw_sin = NULL;
683 		}
684 
685 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
686 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
687 
688 		rt_ifp = nh->nh_ifp;
689 
690 		if (pcb->dp_client == INADDR_ANY)
691 			pcb->dp_client = local_sin->sin_addr.s_addr;
692 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
693 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
694 		if (pcb->dp_ifp == NULL)
695 			pcb->dp_ifp = rt_ifp;
696 	}
697 
698 	ifp = pcb->dp_ifp;
699 
700 	if (debugnet_debug > 0) {
701 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
702 		    gwbuf[INET_ADDRSTRLEN];
703 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
704 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
705 		if (pcb->dp_gateway != INADDR_ANY)
706 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
707 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
708 		    serbuf, pcb->dp_server_port,
709 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
710 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
711 		    clibuf, pcb->dp_client_port, if_name(ifp));
712 	}
713 
714 	/* Validate iface is online and supported. */
715 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
716 		printf("%s: interface '%s' does not support debugnet\n",
717 		    __func__, if_name(ifp));
718 		error = ENODEV;
719 		goto cleanup;
720 	}
721 	if ((if_getflags(ifp) & IFF_UP) == 0) {
722 		printf("%s: interface '%s' link is down\n", __func__,
723 		    if_name(ifp));
724 		error = ENXIO;
725 		goto cleanup;
726 	}
727 
728 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
729 	pcb->dp_event_started = true;
730 
731 	/*
732 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
733 	 * while the debugnet ifp's if_input is overridden with
734 	 * debugnet_pkt_in.
735 	 */
736 	g_debugnet_pcb_inuse = true;
737 
738 	/* Make the card use *our* receive callback. */
739 	pcb->dp_drv_input = ifp->if_input;
740 	ifp->if_input = debugnet_pkt_in;
741 
742 	printf("%s: searching for %s MAC...\n", __func__,
743 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
744 
745 	error = debugnet_arp_gw(pcb);
746 	if (error != 0) {
747 		printf("%s: failed to locate MAC address\n", __func__);
748 		goto cleanup;
749 	}
750 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
751 
752 	herald_auxdata = (struct debugnet_proto_aux) {
753 		.dp_offset_start = dcp->dc_herald_offset,
754 		.dp_aux2 = dcp->dc_herald_aux2,
755 	};
756 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
757 	    dcp->dc_herald_datalen, &herald_auxdata);
758 	if (error != 0) {
759 		printf("%s: failed to herald debugnet server\n", __func__);
760 		goto cleanup;
761 	}
762 
763 	*pcb_out = pcb;
764 	return (0);
765 
766 cleanup:
767 	debugnet_free(pcb);
768 	return (error);
769 }
770 
771 /*
772  * Pre-allocated dump-time mbuf tracking.
773  *
774  * We just track the high water mark we've ever seen and allocate appropriately
775  * for that iface/mtu combo.
776  */
777 static struct {
778 	int nmbuf;
779 	int ncl;
780 	int clsize;
781 } dn_hwm;
782 static struct mtx dn_hwm_lk;
783 MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
784 
785 static void
786 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
787 {
788 	bool any;
789 
790 	any = false;
791 	mtx_lock(&dn_hwm_lk);
792 
793 	if (nmbuf > dn_hwm.nmbuf) {
794 		any = true;
795 		dn_hwm.nmbuf = nmbuf;
796 	} else
797 		nmbuf = dn_hwm.nmbuf;
798 
799 	if (ncl > dn_hwm.ncl) {
800 		any = true;
801 		dn_hwm.ncl = ncl;
802 	} else
803 		ncl = dn_hwm.ncl;
804 
805 	if (clsize > dn_hwm.clsize) {
806 		any = true;
807 		dn_hwm.clsize = clsize;
808 	} else
809 		clsize = dn_hwm.clsize;
810 
811 	mtx_unlock(&dn_hwm_lk);
812 
813 	if (any)
814 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
815 }
816 
817 void
818 debugnet_any_ifnet_update(struct ifnet *ifp)
819 {
820 	int clsize, nmbuf, ncl, nrxr;
821 
822 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
823 		return;
824 
825 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
826 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
827 
828 	/*
829 	 * We need two headers per message on the transmit side. Multiply by
830 	 * four to give us some breathing room.
831 	 */
832 	nmbuf = ncl * (4 + nrxr);
833 	ncl *= nrxr;
834 
835 	/*
836 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
837 	 * dn_init method is available.
838 	 */
839 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
840 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
841 		    __func__, if_name(ifp), ifp);
842 		return;
843 	}
844 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
845 }
846 
847 /*
848  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
849  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
850  *
851  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
852  * because the driver is still in attach.  Since we cannot use down interfaces,
853  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
854  * with vtnet and dhcpclient that event just never occurs.
855  *
856  * So that's how I've landed on the lower level ifnet_link_event.
857  */
858 
859 static void
860 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
861 {
862 	if (link_state == LINK_STATE_UP)
863 		debugnet_any_ifnet_update(ifp);
864 }
865 
866 static eventhandler_tag dn_attach_cookie;
867 static void
868 dn_evh_init(void *ctx __unused)
869 {
870 	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
871 	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
872 }
873 SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
874 
875 /*
876  * DDB parsing helpers for debugnet(4) consumers.
877  */
878 #ifdef DDB
879 struct my_inet_opt {
880 	bool has_opt;
881 	const char *printname;
882 	in_addr_t *result;
883 };
884 
885 static int
886 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
887 {
888 	in_addr_t tmp;
889 	unsigned octet;
890 	int t;
891 
892 	tmp = 0;
893 	for (octet = 0; octet < 4; octet++) {
894 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
895 		if (t != tNUMBER) {
896 			db_printf("%s:%s: octet %u expected number; found %d\n",
897 			    __func__, opt->printname, octet, t);
898 			return (EINVAL);
899 		}
900 		/*
901 		 * db_lex lexes '-' distinctly from the number itself, but
902 		 * let's document that invariant.
903 		 */
904 		MPASS(db_tok_number >= 0);
905 
906 		if (db_tok_number > UINT8_MAX) {
907 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
908 			    opt->printname, octet, (intmax_t)db_tok_number);
909 			return (EDOM);
910 		}
911 
912 		/* Constructed host-endian and converted to network later. */
913 		tmp = (tmp << 8) | db_tok_number;
914 
915 		if (octet < 3) {
916 			t = db_read_token_flags(DRT_WSPACE);
917 			if (t != tDOT) {
918 				db_printf("%s:%s: octet %u expected '.'; found"
919 				    " %d\n", __func__, opt->printname, octet,
920 				    t);
921 				return (EINVAL);
922 			}
923 		}
924 	}
925 
926 	*opt->result = htonl(tmp);
927 	opt->has_opt = true;
928 	return (0);
929 }
930 
931 int
932 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
933 {
934 	struct ifnet *ifp;
935 	int t, error;
936 	bool want_ifp;
937 	char ch;
938 
939 	struct my_inet_opt opt_client = {
940 		.printname = "client",
941 		.result = &result->dd_client,
942 	},
943 	opt_server = {
944 		.printname = "server",
945 		.result = &result->dd_server,
946 	},
947 	opt_gateway = {
948 		.printname = "gateway",
949 		.result = &result->dd_gateway,
950 	},
951 	*cur_inet_opt;
952 
953 	ifp = NULL;
954 	memset(result, 0, sizeof(*result));
955 
956 	/*
957 	 * command [space] [-] [opt] [[space] [optarg]] ...
958 	 *
959 	 * db_command has already lexed 'command' for us.
960 	 */
961 	t = db_read_token_flags(DRT_WSPACE);
962 	if (t == tWSPACE)
963 		t = db_read_token_flags(DRT_WSPACE);
964 
965 	while (t != tEOL) {
966 		if (t != tMINUS) {
967 			db_printf("%s: Bad syntax; expected '-', got %d\n",
968 			    cmd, t);
969 			goto usage;
970 		}
971 
972 		t = db_read_token_flags(DRT_WSPACE);
973 		if (t != tIDENT) {
974 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
975 			    cmd, t);
976 			goto usage;
977 		}
978 
979 		if (strlen(db_tok_string) > 1) {
980 			db_printf("%s: Bad syntax; expected single option "
981 			    "flag, got '%s'\n", cmd, db_tok_string);
982 			goto usage;
983 		}
984 
985 		want_ifp = false;
986 		cur_inet_opt = NULL;
987 		switch ((ch = db_tok_string[0])) {
988 		default:
989 			DNETDEBUG("Unexpected: '%c'\n", ch);
990 			/* FALLTHROUGH */
991 		case 'h':
992 			goto usage;
993 		case 'c':
994 			cur_inet_opt = &opt_client;
995 			break;
996 		case 'g':
997 			cur_inet_opt = &opt_gateway;
998 			break;
999 		case 's':
1000 			cur_inet_opt = &opt_server;
1001 			break;
1002 		case 'i':
1003 			want_ifp = true;
1004 			break;
1005 		}
1006 
1007 		t = db_read_token_flags(DRT_WSPACE);
1008 		if (t != tWSPACE) {
1009 			db_printf("%s: Bad syntax; expected space after "
1010 			    "flag %c, got %d\n", cmd, ch, t);
1011 			goto usage;
1012 		}
1013 
1014 		if (want_ifp) {
1015 			t = db_read_token_flags(DRT_WSPACE);
1016 			if (t != tIDENT) {
1017 				db_printf("%s: Expected interface but got %d\n",
1018 				    cmd, t);
1019 				goto usage;
1020 			}
1021 
1022 			CURVNET_SET(vnet0);
1023 			/*
1024 			 * We *don't* take a ref here because the only current
1025 			 * consumer, db_netdump_cmd, does not need it.  It
1026 			 * (somewhat redundantly) extracts the if_name(),
1027 			 * re-lookups the ifp, and takes its own reference.
1028 			 */
1029 			ifp = ifunit(db_tok_string);
1030 			CURVNET_RESTORE();
1031 			if (ifp == NULL) {
1032 				db_printf("Could not locate interface %s\n",
1033 				    db_tok_string);
1034 				goto cleanup;
1035 			}
1036 		} else {
1037 			MPASS(cur_inet_opt != NULL);
1038 			/* Assume IPv4 for now. */
1039 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1040 			if (error != 0)
1041 				goto cleanup;
1042 		}
1043 
1044 		/* Skip (mandatory) whitespace after option, if not EOL. */
1045 		t = db_read_token_flags(DRT_WSPACE);
1046 		if (t == tEOL)
1047 			break;
1048 		if (t != tWSPACE) {
1049 			db_printf("%s: Bad syntax; expected space after "
1050 			    "flag %c option; got %d\n", cmd, ch, t);
1051 			goto usage;
1052 		}
1053 		t = db_read_token_flags(DRT_WSPACE);
1054 	}
1055 
1056 	if (!opt_server.has_opt) {
1057 		db_printf("%s: need a destination server address\n", cmd);
1058 		goto usage;
1059 	}
1060 
1061 	result->dd_has_client = opt_client.has_opt;
1062 	result->dd_has_gateway = opt_gateway.has_opt;
1063 	result->dd_ifp = ifp;
1064 
1065 	/* We parsed the full line to tEOL already, or bailed with an error. */
1066 	return (0);
1067 
1068 usage:
1069 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1070 	    "-i <interface>]\n", cmd);
1071 	error = EINVAL;
1072 	/* FALLTHROUGH */
1073 cleanup:
1074 	db_skip_to_eol();
1075 	return (error);
1076 }
1077 #endif /* DDB */
1078