xref: /freebsd/sys/netinet/in_pcb.c (revision 59c7ad52aaa5b26e503871334672af0f58f9c2e8)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
3  *	The Regents of the University of California.
4  * Copyright (c) 2007-2009 Robert N. M. Watson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_ddb.h"
38 #include "opt_ipsec.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/callout.h>
47 #include <sys/domain.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/priv.h>
52 #include <sys/proc.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/sysctl.h>
56 
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60 
61 #include <vm/uma.h>
62 
63 #include <net/if.h>
64 #include <net/if_types.h>
65 #include <net/route.h>
66 #include <net/vnet.h>
67 
68 #if defined(INET) || defined(INET6)
69 #include <netinet/in.h>
70 #include <netinet/in_pcb.h>
71 #include <netinet/ip_var.h>
72 #include <netinet/tcp_var.h>
73 #include <netinet/udp.h>
74 #include <netinet/udp_var.h>
75 #endif
76 #ifdef INET
77 #include <netinet/in_var.h>
78 #endif
79 #ifdef INET6
80 #include <netinet/ip6.h>
81 #include <netinet6/in6_pcb.h>
82 #include <netinet6/in6_var.h>
83 #include <netinet6/ip6_var.h>
84 #endif /* INET6 */
85 
86 
87 #ifdef IPSEC
88 #include <netipsec/ipsec.h>
89 #include <netipsec/key.h>
90 #endif /* IPSEC */
91 
92 #include <security/mac/mac_framework.h>
93 
/*
 * NOTE(review): presumably the callout that periodically runs ipport_tick();
 * the code that arms it is not in this part of the file -- confirm.
 */
static struct callout	ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * Three ranges are defined (low, default and high); in_pcb_lport() picks
 * one of them based on the INP_LOWPORT / INP_HIGHPORT flags of the inpcb.
 * Note that the low range is declared "backwards" (first > last);
 * in_pcb_lport() swaps the bounds before use.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_setup() requires PRIV_NETINET_RESERVEDPORT for an explicit
 * bind to any port in [ipport_reservedlow, ipport_reservedhigh].
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);	/* no explicit initializer */

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
/* Bumped by in_pcb_lport() for every non-UDP port allocation. */
VNET_DEFINE(int, ipport_tcpallocs);
/*
 * NOTE(review): presumably the ipport_tcpallocs value sampled by the
 * previous ipport_tick() run; its users are outside this section -- confirm.
 */
static VNET_DEFINE(int, ipport_tcplastcount);

#define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)

/* Forward declaration; the definition is not in this part of the file. */
static void	in_pcbremlists(struct inpcb *inp);
126 
127 #ifdef INET
128 #define RANGECHK(var, min, max) \
129 	if ((var) < (min)) { (var) = (min); } \
130 	else if ((var) > (max)) { (var) = (max); }
131 
132 static int
133 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
134 {
135 	int error;
136 
137 #ifdef VIMAGE
138 	error = vnet_sysctl_handle_int(oidp, arg1, arg2, req);
139 #else
140 	error = sysctl_handle_int(oidp, arg1, arg2, req);
141 #endif
142 	if (error == 0) {
143 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
144 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
145 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
146 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
147 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
148 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
149 	}
150 	return (error);
151 }
152 
153 #undef RANGECHK
154 
155 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
156 
157 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
158 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
159 	&sysctl_net_ipport_check, "I", "");
160 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
161 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
162 	&sysctl_net_ipport_check, "I", "");
163 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
164 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
165 	&sysctl_net_ipport_check, "I", "");
166 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
167 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
168 	&sysctl_net_ipport_check, "I", "");
169 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
170 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
171 	&sysctl_net_ipport_check, "I", "");
172 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
173 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
174 	&sysctl_net_ipport_check, "I", "");
175 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
176 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
177 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
178 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
179 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
180 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
181 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
182 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
183 	"allocations before switching to a sequental one");
184 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
185 	&VNET_NAME(ipport_randomtime), 0,
186 	"Minimum time to keep sequental port "
187 	"allocation before switching to a random one");
188 #endif
189 
190 /*
191  * in_pcb.c: manage the Protocol Control Blocks.
192  *
193  * NOTE: It is assumed that most of these functions will be called with
194  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
195  * functions often modify hash chains or addresses in pcbs.
196  */
197 
198 /*
199  * Initialize an inpcbinfo -- we should be able to reduce the number of
200  * arguments in time.
201  */
202 void
203 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
204     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
205     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
206     uint32_t inpcbzone_flags)
207 {
208 
209 	INP_INFO_LOCK_INIT(pcbinfo, name);
210 #ifdef VIMAGE
211 	pcbinfo->ipi_vnet = curvnet;
212 #endif
213 	pcbinfo->ipi_listhead = listhead;
214 	LIST_INIT(pcbinfo->ipi_listhead);
215 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
216 	    &pcbinfo->ipi_hashmask);
217 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
218 	    &pcbinfo->ipi_porthashmask);
219 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
220 	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
221 	    inpcbzone_flags);
222 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
223 }
224 
225 /*
226  * Destroy an inpcbinfo.
227  */
228 void
229 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
230 {
231 
232 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
233 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
234 	    pcbinfo->ipi_porthashmask);
235 	uma_zdestroy(pcbinfo->ipi_zone);
236 	INP_INFO_LOCK_DESTROY(pcbinfo);
237 }
238 
239 /*
240  * Allocate a PCB and associate it with the socket.
241  * On success return with the PCB locked.
242  */
243 int
244 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
245 {
246 	struct inpcb *inp;
247 	int error;
248 
249 	INP_INFO_WLOCK_ASSERT(pcbinfo);
250 	error = 0;
251 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
252 	if (inp == NULL)
253 		return (ENOBUFS);
254 	bzero(inp, inp_zero_size);
255 	inp->inp_pcbinfo = pcbinfo;
256 	inp->inp_socket = so;
257 	inp->inp_cred = crhold(so->so_cred);
258 	inp->inp_inc.inc_fibnum = so->so_fibnum;
259 #ifdef MAC
260 	error = mac_inpcb_init(inp, M_NOWAIT);
261 	if (error != 0)
262 		goto out;
263 	mac_inpcb_create(so, inp);
264 #endif
265 #ifdef IPSEC
266 	error = ipsec_init_policy(so, &inp->inp_sp);
267 	if (error != 0) {
268 #ifdef MAC
269 		mac_inpcb_destroy(inp);
270 #endif
271 		goto out;
272 	}
273 #endif /*IPSEC*/
274 #ifdef INET6
275 	if (INP_SOCKAF(so) == AF_INET6) {
276 		inp->inp_vflag |= INP_IPV6PROTO;
277 		if (V_ip6_v6only)
278 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
279 	}
280 #endif
281 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
282 	pcbinfo->ipi_count++;
283 	so->so_pcb = (caddr_t)inp;
284 #ifdef INET6
285 	if (V_ip6_auto_flowlabel)
286 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
287 #endif
288 	INP_WLOCK(inp);
289 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
290 	inp->inp_refcount = 1;	/* Reference from the inpcbinfo */
291 #if defined(IPSEC) || defined(MAC)
292 out:
293 	if (error != 0) {
294 		crfree(inp->inp_cred);
295 		uma_zfree(pcbinfo->ipi_zone, inp);
296 	}
297 #endif
298 	return (error);
299 }
300 
301 #ifdef INET
302 int
303 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
304 {
305 	int anonport, error;
306 
307 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
308 	INP_WLOCK_ASSERT(inp);
309 
310 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
311 		return (EINVAL);
312 	anonport = inp->inp_lport == 0 && (nam == NULL ||
313 	    ((struct sockaddr_in *)nam)->sin_port == 0);
314 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
315 	    &inp->inp_lport, cred);
316 	if (error)
317 		return (error);
318 	if (in_pcbinshash(inp) != 0) {
319 		inp->inp_laddr.s_addr = INADDR_ANY;
320 		inp->inp_lport = 0;
321 		return (EAGAIN);
322 	}
323 	if (anonport)
324 		inp->inp_flags |= INP_ANONPORT;
325 	return (0);
326 }
327 #endif
328 
329 #if defined(INET) || defined(INET6)
330 int
331 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
332     struct ucred *cred, int wild)
333 {
334 	struct inpcbinfo *pcbinfo;
335 	struct inpcb *tmpinp;
336 	unsigned short *lastport;
337 	int count, dorandom, error;
338 	u_short aux, first, last, lport;
339 #ifdef INET
340 	struct in_addr laddr;
341 #endif
342 
343 	pcbinfo = inp->inp_pcbinfo;
344 
345 	/*
346 	 * Because no actual state changes occur here, a global write lock on
347 	 * the pcbinfo isn't required.
348 	 */
349 	INP_INFO_LOCK_ASSERT(pcbinfo);
350 	INP_LOCK_ASSERT(inp);
351 
352 	if (inp->inp_flags & INP_HIGHPORT) {
353 		first = V_ipport_hifirstauto;	/* sysctl */
354 		last  = V_ipport_hilastauto;
355 		lastport = &pcbinfo->ipi_lasthi;
356 	} else if (inp->inp_flags & INP_LOWPORT) {
357 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
358 		if (error)
359 			return (error);
360 		first = V_ipport_lowfirstauto;	/* 1023 */
361 		last  = V_ipport_lowlastauto;	/* 600 */
362 		lastport = &pcbinfo->ipi_lastlow;
363 	} else {
364 		first = V_ipport_firstauto;	/* sysctl */
365 		last  = V_ipport_lastauto;
366 		lastport = &pcbinfo->ipi_lastport;
367 	}
368 	/*
369 	 * For UDP, use random port allocation as long as the user
370 	 * allows it.  For TCP (and as of yet unknown) connections,
371 	 * use random port allocation only if the user allows it AND
372 	 * ipport_tick() allows it.
373 	 */
374 	if (V_ipport_randomized &&
375 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
376 		dorandom = 1;
377 	else
378 		dorandom = 0;
379 	/*
380 	 * It makes no sense to do random port allocation if
381 	 * we have the only port available.
382 	 */
383 	if (first == last)
384 		dorandom = 0;
385 	/* Make sure to not include UDP packets in the count. */
386 	if (pcbinfo != &V_udbinfo)
387 		V_ipport_tcpallocs++;
388 	/*
389 	 * Instead of having two loops further down counting up or down
390 	 * make sure that first is always <= last and go with only one
391 	 * code path implementing all logic.
392 	 */
393 	if (first > last) {
394 		aux = first;
395 		first = last;
396 		last = aux;
397 	}
398 
399 #ifdef INET
400 	/* Make the compiler happy. */
401 	laddr.s_addr = 0;
402 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
403 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
404 		    __func__, inp));
405 		laddr = *laddrp;
406 	}
407 #endif
408 	tmpinp = NULL;	/* Make compiler happy. */
409 	lport = *lportp;
410 
411 	if (dorandom)
412 		*lastport = first + (arc4random() % (last - first));
413 
414 	count = last - first;
415 
416 	do {
417 		if (count-- < 0)	/* completely used? */
418 			return (EADDRNOTAVAIL);
419 		++*lastport;
420 		if (*lastport < first || *lastport > last)
421 			*lastport = first;
422 		lport = htons(*lastport);
423 
424 #ifdef INET6
425 		if ((inp->inp_vflag & INP_IPV6) != 0)
426 			tmpinp = in6_pcblookup_local(pcbinfo,
427 			    &inp->in6p_laddr, lport, wild, cred);
428 #endif
429 #if defined(INET) && defined(INET6)
430 		else
431 #endif
432 #ifdef INET
433 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
434 			    lport, wild, cred);
435 #endif
436 	} while (tmpinp != NULL);
437 
438 #ifdef INET
439 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
440 		laddrp->s_addr = laddr.s_addr;
441 #endif
442 	*lportp = lport;
443 
444 	return (0);
445 }
446 #endif /* INET || INET6 */
447 
448 #ifdef INET
449 /*
450  * Set up a bind operation on a PCB, performing port allocation
451  * as required, but do not actually modify the PCB. Callers can
452  * either complete the bind by setting inp_laddr/inp_lport and
453  * calling in_pcbinshash(), or they can just use the resulting
454  * port and address to authorise the sending of a once-off packet.
455  *
456  * On error, the values of *laddrp and *lportp are not changed.
457  */
458 int
459 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
460     u_short *lportp, struct ucred *cred)
461 {
462 	struct socket *so = inp->inp_socket;
463 	struct sockaddr_in *sin;
464 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
465 	struct in_addr laddr;
466 	u_short lport = 0;
467 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
468 	int error;
469 
470 	/*
471 	 * Because no actual state changes occur here, a global write lock on
472 	 * the pcbinfo isn't required.
473 	 */
474 	INP_INFO_LOCK_ASSERT(pcbinfo);
475 	INP_LOCK_ASSERT(inp);
476 
477 	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
478 		return (EADDRNOTAVAIL);
479 	laddr.s_addr = *laddrp;
480 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
481 		return (EINVAL);
482 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
483 		wild = INPLOOKUP_WILDCARD;
484 	if (nam == NULL) {
485 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
486 			return (error);
487 	} else {
488 		sin = (struct sockaddr_in *)nam;
489 		if (nam->sa_len != sizeof (*sin))
490 			return (EINVAL);
491 #ifdef notdef
492 		/*
493 		 * We should check the family, but old programs
494 		 * incorrectly fail to initialize it.
495 		 */
496 		if (sin->sin_family != AF_INET)
497 			return (EAFNOSUPPORT);
498 #endif
499 		error = prison_local_ip4(cred, &sin->sin_addr);
500 		if (error)
501 			return (error);
502 		if (sin->sin_port != *lportp) {
503 			/* Don't allow the port to change. */
504 			if (*lportp != 0)
505 				return (EINVAL);
506 			lport = sin->sin_port;
507 		}
508 		/* NB: lport is left as 0 if the port isn't being changed. */
509 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
510 			/*
511 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
512 			 * allow complete duplication of binding if
513 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
514 			 * and a multicast address is bound on both
515 			 * new and duplicated sockets.
516 			 */
517 			if (so->so_options & SO_REUSEADDR)
518 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
519 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
520 			sin->sin_port = 0;		/* yech... */
521 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
522 			/*
523 			 * Is the address a local IP address?
524 			 * If INP_BINDANY is set, then the socket may be bound
525 			 * to any endpoint address, local or not.
526 			 */
527 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
528 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
529 				return (EADDRNOTAVAIL);
530 		}
531 		laddr = sin->sin_addr;
532 		if (lport) {
533 			struct inpcb *t;
534 			struct tcptw *tw;
535 
536 			/* GROSS */
537 			if (ntohs(lport) <= V_ipport_reservedhigh &&
538 			    ntohs(lport) >= V_ipport_reservedlow &&
539 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
540 			    0))
541 				return (EACCES);
542 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
543 			    priv_check_cred(inp->inp_cred,
544 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
545 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
546 				    lport, INPLOOKUP_WILDCARD, cred);
547 	/*
548 	 * XXX
549 	 * This entire block sorely needs a rewrite.
550 	 */
551 				if (t &&
552 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
553 				    (so->so_type != SOCK_STREAM ||
554 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
555 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
556 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
557 				     (t->inp_socket->so_options &
558 					 SO_REUSEPORT) == 0) &&
559 				    (inp->inp_cred->cr_uid !=
560 				     t->inp_cred->cr_uid))
561 					return (EADDRINUSE);
562 			}
563 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
564 			    lport, wild, cred);
565 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
566 				/*
567 				 * XXXRW: If an incpb has had its timewait
568 				 * state recycled, we treat the address as
569 				 * being in use (for now).  This is better
570 				 * than a panic, but not desirable.
571 				 */
572 				tw = intotw(inp);
573 				if (tw == NULL ||
574 				    (reuseport & tw->tw_so_options) == 0)
575 					return (EADDRINUSE);
576 			} else if (t &&
577 			    (reuseport & t->inp_socket->so_options) == 0) {
578 #ifdef INET6
579 				if (ntohl(sin->sin_addr.s_addr) !=
580 				    INADDR_ANY ||
581 				    ntohl(t->inp_laddr.s_addr) !=
582 				    INADDR_ANY ||
583 				    INP_SOCKAF(so) ==
584 				    INP_SOCKAF(t->inp_socket))
585 #endif
586 				return (EADDRINUSE);
587 			}
588 		}
589 	}
590 	if (*lportp != 0)
591 		lport = *lportp;
592 	if (lport == 0) {
593 		error = in_pcb_lport(inp, &laddr, &lport, cred, wild);
594 		if (error != 0)
595 			return (error);
596 
597 	}
598 	*laddrp = laddr.s_addr;
599 	*lportp = lport;
600 	return (0);
601 }
602 
603 /*
604  * Connect from a socket to a specified address.
605  * Both address and port must be specified in argument sin.
606  * If don't have a local address for this socket yet,
607  * then pick one.
608  */
609 int
610 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
611 {
612 	u_short lport, fport;
613 	in_addr_t laddr, faddr;
614 	int anonport, error;
615 
616 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
617 	INP_WLOCK_ASSERT(inp);
618 
619 	lport = inp->inp_lport;
620 	laddr = inp->inp_laddr.s_addr;
621 	anonport = (lport == 0);
622 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
623 	    NULL, cred);
624 	if (error)
625 		return (error);
626 
627 	/* Do the initial binding of the local address if required. */
628 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
629 		inp->inp_lport = lport;
630 		inp->inp_laddr.s_addr = laddr;
631 		if (in_pcbinshash(inp) != 0) {
632 			inp->inp_laddr.s_addr = INADDR_ANY;
633 			inp->inp_lport = 0;
634 			return (EAGAIN);
635 		}
636 	}
637 
638 	/* Commit the remaining changes. */
639 	inp->inp_lport = lport;
640 	inp->inp_laddr.s_addr = laddr;
641 	inp->inp_faddr.s_addr = faddr;
642 	inp->inp_fport = fport;
643 	in_pcbrehash(inp);
644 
645 	if (anonport)
646 		inp->inp_flags |= INP_ANONPORT;
647 	return (0);
648 }
649 
650 /*
651  * Do proper source address selection on an unbound socket in case
652  * of connect. Take jails into account as well.
653  */
654 static int
655 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
656     struct ucred *cred)
657 {
658 	struct ifaddr *ifa;
659 	struct sockaddr *sa;
660 	struct sockaddr_in *sin;
661 	struct route sro;
662 	int error;
663 
664 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
665 
666 	/*
667 	 * Bypass source address selection and use the primary jail IP
668 	 * if requested.
669 	 */
670 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
671 		return (0);
672 
673 	error = 0;
674 	bzero(&sro, sizeof(sro));
675 
676 	sin = (struct sockaddr_in *)&sro.ro_dst;
677 	sin->sin_family = AF_INET;
678 	sin->sin_len = sizeof(struct sockaddr_in);
679 	sin->sin_addr.s_addr = faddr->s_addr;
680 
681 	/*
682 	 * If route is known our src addr is taken from the i/f,
683 	 * else punt.
684 	 *
685 	 * Find out route to destination.
686 	 */
687 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
688 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
689 
690 	/*
691 	 * If we found a route, use the address corresponding to
692 	 * the outgoing interface.
693 	 *
694 	 * Otherwise assume faddr is reachable on a directly connected
695 	 * network and try to find a corresponding interface to take
696 	 * the source address from.
697 	 */
698 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
699 		struct in_ifaddr *ia;
700 		struct ifnet *ifp;
701 
702 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
703 		if (ia == NULL)
704 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
705 		if (ia == NULL) {
706 			error = ENETUNREACH;
707 			goto done;
708 		}
709 
710 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
711 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
712 			ifa_free(&ia->ia_ifa);
713 			goto done;
714 		}
715 
716 		ifp = ia->ia_ifp;
717 		ifa_free(&ia->ia_ifa);
718 		ia = NULL;
719 		IF_ADDR_LOCK(ifp);
720 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
721 
722 			sa = ifa->ifa_addr;
723 			if (sa->sa_family != AF_INET)
724 				continue;
725 			sin = (struct sockaddr_in *)sa;
726 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
727 				ia = (struct in_ifaddr *)ifa;
728 				break;
729 			}
730 		}
731 		if (ia != NULL) {
732 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
733 			IF_ADDR_UNLOCK(ifp);
734 			goto done;
735 		}
736 		IF_ADDR_UNLOCK(ifp);
737 
738 		/* 3. As a last resort return the 'default' jail address. */
739 		error = prison_get_ip4(cred, laddr);
740 		goto done;
741 	}
742 
743 	/*
744 	 * If the outgoing interface on the route found is not
745 	 * a loopback interface, use the address from that interface.
746 	 * In case of jails do those three steps:
747 	 * 1. check if the interface address belongs to the jail. If so use it.
748 	 * 2. check if we have any address on the outgoing interface
749 	 *    belonging to this jail. If so use it.
750 	 * 3. as a last resort return the 'default' jail address.
751 	 */
752 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
753 		struct in_ifaddr *ia;
754 		struct ifnet *ifp;
755 
756 		/* If not jailed, use the default returned. */
757 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
758 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
759 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
760 			goto done;
761 		}
762 
763 		/* Jailed. */
764 		/* 1. Check if the iface address belongs to the jail. */
765 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
766 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
767 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
768 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
769 			goto done;
770 		}
771 
772 		/*
773 		 * 2. Check if we have any address on the outgoing interface
774 		 *    belonging to this jail.
775 		 */
776 		ia = NULL;
777 		ifp = sro.ro_rt->rt_ifp;
778 		IF_ADDR_LOCK(ifp);
779 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
780 			sa = ifa->ifa_addr;
781 			if (sa->sa_family != AF_INET)
782 				continue;
783 			sin = (struct sockaddr_in *)sa;
784 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
785 				ia = (struct in_ifaddr *)ifa;
786 				break;
787 			}
788 		}
789 		if (ia != NULL) {
790 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
791 			IF_ADDR_UNLOCK(ifp);
792 			goto done;
793 		}
794 		IF_ADDR_UNLOCK(ifp);
795 
796 		/* 3. As a last resort return the 'default' jail address. */
797 		error = prison_get_ip4(cred, laddr);
798 		goto done;
799 	}
800 
801 	/*
802 	 * The outgoing interface is marked with 'loopback net', so a route
803 	 * to ourselves is here.
804 	 * Try to find the interface of the destination address and then
805 	 * take the address from there. That interface is not necessarily
806 	 * a loopback interface.
807 	 * In case of jails, check that it is an address of the jail
808 	 * and if we cannot find, fall back to the 'default' jail address.
809 	 */
810 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
811 		struct sockaddr_in sain;
812 		struct in_ifaddr *ia;
813 
814 		bzero(&sain, sizeof(struct sockaddr_in));
815 		sain.sin_family = AF_INET;
816 		sain.sin_len = sizeof(struct sockaddr_in);
817 		sain.sin_addr.s_addr = faddr->s_addr;
818 
819 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
820 		if (ia == NULL)
821 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
822 		if (ia == NULL)
823 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
824 
825 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
826 			if (ia == NULL) {
827 				error = ENETUNREACH;
828 				goto done;
829 			}
830 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
831 			ifa_free(&ia->ia_ifa);
832 			goto done;
833 		}
834 
835 		/* Jailed. */
836 		if (ia != NULL) {
837 			struct ifnet *ifp;
838 
839 			ifp = ia->ia_ifp;
840 			ifa_free(&ia->ia_ifa);
841 			ia = NULL;
842 			IF_ADDR_LOCK(ifp);
843 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
844 
845 				sa = ifa->ifa_addr;
846 				if (sa->sa_family != AF_INET)
847 					continue;
848 				sin = (struct sockaddr_in *)sa;
849 				if (prison_check_ip4(cred,
850 				    &sin->sin_addr) == 0) {
851 					ia = (struct in_ifaddr *)ifa;
852 					break;
853 				}
854 			}
855 			if (ia != NULL) {
856 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
857 				IF_ADDR_UNLOCK(ifp);
858 				goto done;
859 			}
860 			IF_ADDR_UNLOCK(ifp);
861 		}
862 
863 		/* 3. As a last resort return the 'default' jail address. */
864 		error = prison_get_ip4(cred, laddr);
865 		goto done;
866 	}
867 
868 done:
869 	if (sro.ro_rt != NULL)
870 		RTFREE(sro.ro_rt);
871 	return (error);
872 }
873 
874 /*
875  * Set up for a connect from a socket to the specified address.
876  * On entry, *laddrp and *lportp should contain the current local
877  * address and port for the PCB; these are updated to the values
878  * that should be placed in inp_laddr and inp_lport to complete
879  * the connect.
880  *
881  * On success, *faddrp and *fportp will be set to the remote address
882  * and port. These are not updated in the error case.
883  *
884  * If the operation fails because the connection already exists,
885  * *oinpp will be set to the PCB of that connection so that the
886  * caller can decide to override it. In all other cases, *oinpp
887  * is set to NULL.
888  */
889 int
890 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
891     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
892     struct inpcb **oinpp, struct ucred *cred)
893 {
894 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
895 	struct in_ifaddr *ia;
896 	struct inpcb *oinp;
897 	struct in_addr laddr, faddr;
898 	u_short lport, fport;
899 	int error;
900 
901 	/*
902 	 * Because a global state change doesn't actually occur here, a read
903 	 * lock is sufficient.
904 	 */
905 	INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
906 	INP_LOCK_ASSERT(inp);
907 
908 	if (oinpp != NULL)
909 		*oinpp = NULL;
910 	if (nam->sa_len != sizeof (*sin))
911 		return (EINVAL);
912 	if (sin->sin_family != AF_INET)
913 		return (EAFNOSUPPORT);
914 	if (sin->sin_port == 0)
915 		return (EADDRNOTAVAIL);
916 	laddr.s_addr = *laddrp;
917 	lport = *lportp;
918 	faddr = sin->sin_addr;
919 	fport = sin->sin_port;
920 
921 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
922 		/*
923 		 * If the destination address is INADDR_ANY,
924 		 * use the primary local address.
925 		 * If the supplied address is INADDR_BROADCAST,
926 		 * and the primary interface supports broadcast,
927 		 * choose the broadcast address for that interface.
928 		 */
929 		if (faddr.s_addr == INADDR_ANY) {
930 			IN_IFADDR_RLOCK();
931 			faddr =
932 			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
933 			IN_IFADDR_RUNLOCK();
934 			if (cred != NULL &&
935 			    (error = prison_get_ip4(cred, &faddr)) != 0)
936 				return (error);
937 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
938 			IN_IFADDR_RLOCK();
939 			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
940 			    IFF_BROADCAST)
941 				faddr = satosin(&TAILQ_FIRST(
942 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
943 			IN_IFADDR_RUNLOCK();
944 		}
945 	}
946 	if (laddr.s_addr == INADDR_ANY) {
947 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
948 		/*
949 		 * If the destination address is multicast and an outgoing
950 		 * interface has been set as a multicast option, prefer the
951 		 * address of that interface as our source address.
952 		 */
953 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
954 		    inp->inp_moptions != NULL) {
955 			struct ip_moptions *imo;
956 			struct ifnet *ifp;
957 
958 			imo = inp->inp_moptions;
959 			if (imo->imo_multicast_ifp != NULL) {
960 				ifp = imo->imo_multicast_ifp;
961 				IN_IFADDR_RLOCK();
962 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
963 					if ((ia->ia_ifp == ifp) &&
964 					    (cred == NULL ||
965 					    prison_check_ip4(cred,
966 					    &ia->ia_addr.sin_addr) == 0))
967 						break;
968 				}
969 				if (ia == NULL)
970 					error = EADDRNOTAVAIL;
971 				else {
972 					laddr = ia->ia_addr.sin_addr;
973 					error = 0;
974 				}
975 				IN_IFADDR_RUNLOCK();
976 			}
977 		}
978 		if (error)
979 			return (error);
980 	}
981 	oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
982 	    0, NULL);
983 	if (oinp != NULL) {
984 		if (oinpp != NULL)
985 			*oinpp = oinp;
986 		return (EADDRINUSE);
987 	}
988 	if (lport == 0) {
989 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
990 		    cred);
991 		if (error)
992 			return (error);
993 	}
994 	*laddrp = laddr.s_addr;
995 	*lportp = lport;
996 	*faddrp = faddr.s_addr;
997 	*fportp = fport;
998 	return (0);
999 }
1000 
1001 void
1002 in_pcbdisconnect(struct inpcb *inp)
1003 {
1004 
1005 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
1006 	INP_WLOCK_ASSERT(inp);
1007 
1008 	inp->inp_faddr.s_addr = INADDR_ANY;
1009 	inp->inp_fport = 0;
1010 	in_pcbrehash(inp);
1011 }
1012 #endif
1013 
1014 /*
1015  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1016  * For most protocols, this will be invoked immediately prior to calling
1017  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1018  * socket, in which case in_pcbfree() is deferred.
1019  */
1020 void
1021 in_pcbdetach(struct inpcb *inp)
1022 {
1023 
1024 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1025 
1026 	inp->inp_socket->so_pcb = NULL;
1027 	inp->inp_socket = NULL;
1028 }
1029 
1030 /*
1031  * in_pcbfree_internal() frees an inpcb that has been detached from its
1032  * socket, and whose reference count has reached 0.  It will also remove the
1033  * inpcb from any global lists it might remain on.
1034  */
1035 static void
1036 in_pcbfree_internal(struct inpcb *inp)
1037 {
1038 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
1039 
1040 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1041 	KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
1042 
1043 	INP_INFO_WLOCK_ASSERT(ipi);
1044 	INP_WLOCK_ASSERT(inp);
1045 
1046 #ifdef IPSEC
1047 	if (inp->inp_sp != NULL)
1048 		ipsec_delete_pcbpolicy(inp);
1049 #endif /* IPSEC */
1050 	inp->inp_gencnt = ++ipi->ipi_gencnt;
1051 	in_pcbremlists(inp);
1052 #ifdef INET6
1053 	if (inp->inp_vflag & INP_IPV6PROTO) {
1054 		ip6_freepcbopts(inp->in6p_outputopts);
1055 		if (inp->in6p_moptions != NULL)
1056 			ip6_freemoptions(inp->in6p_moptions);
1057 	}
1058 #endif
1059 	if (inp->inp_options)
1060 		(void)m_free(inp->inp_options);
1061 #ifdef INET
1062 	if (inp->inp_moptions != NULL)
1063 		inp_freemoptions(inp->inp_moptions);
1064 #endif
1065 	inp->inp_vflag = 0;
1066 	crfree(inp->inp_cred);
1067 
1068 #ifdef MAC
1069 	mac_inpcb_destroy(inp);
1070 #endif
1071 	INP_WUNLOCK(inp);
1072 	uma_zfree(ipi->ipi_zone, inp);
1073 }
1074 
1075 /*
1076  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1077  * stability of an inpcb pointer despite the inpcb lock being released.  This
1078  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1079  * but where the inpcb lock is already held.
1080  *
1081  * While the inpcb will not be freed, releasing the inpcb lock means that the
1082  * connection's state may change, so the caller should be careful to
1083  * revalidate any cached state on reacquiring the lock.  Drop the reference
1084  * using in_pcbrele().
1085  */
1086 void
1087 in_pcbref(struct inpcb *inp)
1088 {
1089 
1090 	INP_WLOCK_ASSERT(inp);
1091 
1092 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1093 
1094 	inp->inp_refcount++;
1095 }
1096 
1097 /*
1098  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1099  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1100  * return a flag indicating whether or not the inpcb remains valid.  If it is
1101  * valid, we return with the inpcb lock held.
1102  */
1103 int
1104 in_pcbrele(struct inpcb *inp)
1105 {
1106 #ifdef INVARIANTS
1107 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
1108 #endif
1109 
1110 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1111 
1112 	INP_INFO_WLOCK_ASSERT(ipi);
1113 	INP_WLOCK_ASSERT(inp);
1114 
1115 	inp->inp_refcount--;
1116 	if (inp->inp_refcount > 0)
1117 		return (0);
1118 	in_pcbfree_internal(inp);
1119 	return (1);
1120 }
1121 
1122 /*
1123  * Unconditionally schedule an inpcb to be freed by decrementing its
1124  * reference count, which should occur only after the inpcb has been detached
1125  * from its socket.  If another thread holds a temporary reference (acquired
1126  * using in_pcbref()) then the free is deferred until that reference is
1127  * released using in_pcbrele(), but the inpcb is still unlocked.
1128  */
1129 void
1130 in_pcbfree(struct inpcb *inp)
1131 {
1132 #ifdef INVARIANTS
1133 	struct inpcbinfo *ipi = inp->inp_pcbinfo;
1134 #endif
1135 
1136 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
1137 	    __func__));
1138 
1139 	INP_INFO_WLOCK_ASSERT(ipi);
1140 	INP_WLOCK_ASSERT(inp);
1141 
1142 	if (!in_pcbrele(inp))
1143 		INP_WUNLOCK(inp);
1144 }
1145 
1146 /*
1147  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1148  * port reservation, and preventing it from being returned by inpcb lookups.
1149  *
1150  * It is used by TCP to mark an inpcb as unused and avoid future packet
1151  * delivery or event notification when a socket remains open but TCP has
1152  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1153  * or a RST on the wire, and allows the port binding to be reused while still
1154  * maintaining the invariant that so_pcb always points to a valid inpcb until
1155  * in_pcbdetach().
1156  *
1157  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1158  * in_pcbnotifyall() and in_pcbpurgeif0()?
1159  */
1160 void
1161 in_pcbdrop(struct inpcb *inp)
1162 {
1163 
1164 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
1165 	INP_WLOCK_ASSERT(inp);
1166 
1167 	inp->inp_flags |= INP_DROPPED;
1168 	if (inp->inp_flags & INP_INHASHLIST) {
1169 		struct inpcbport *phd = inp->inp_phd;
1170 
1171 		LIST_REMOVE(inp, inp_hash);
1172 		LIST_REMOVE(inp, inp_portlist);
1173 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1174 			LIST_REMOVE(phd, phd_hash);
1175 			free(phd, M_PCB);
1176 		}
1177 		inp->inp_flags &= ~INP_INHASHLIST;
1178 	}
1179 }
1180 
1181 #ifdef INET
1182 /*
1183  * Common routines to return the socket addresses associated with inpcbs.
1184  */
1185 struct sockaddr *
1186 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1187 {
1188 	struct sockaddr_in *sin;
1189 
1190 	sin = malloc(sizeof *sin, M_SONAME,
1191 		M_WAITOK | M_ZERO);
1192 	sin->sin_family = AF_INET;
1193 	sin->sin_len = sizeof(*sin);
1194 	sin->sin_addr = *addr_p;
1195 	sin->sin_port = port;
1196 
1197 	return (struct sockaddr *)sin;
1198 }
1199 
1200 int
1201 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1202 {
1203 	struct inpcb *inp;
1204 	struct in_addr addr;
1205 	in_port_t port;
1206 
1207 	inp = sotoinpcb(so);
1208 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1209 
1210 	INP_RLOCK(inp);
1211 	port = inp->inp_lport;
1212 	addr = inp->inp_laddr;
1213 	INP_RUNLOCK(inp);
1214 
1215 	*nam = in_sockaddr(port, &addr);
1216 	return 0;
1217 }
1218 
1219 int
1220 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1221 {
1222 	struct inpcb *inp;
1223 	struct in_addr addr;
1224 	in_port_t port;
1225 
1226 	inp = sotoinpcb(so);
1227 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1228 
1229 	INP_RLOCK(inp);
1230 	port = inp->inp_fport;
1231 	addr = inp->inp_faddr;
1232 	INP_RUNLOCK(inp);
1233 
1234 	*nam = in_sockaddr(port, &addr);
1235 	return 0;
1236 }
1237 
1238 void
1239 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1240     struct inpcb *(*notify)(struct inpcb *, int))
1241 {
1242 	struct inpcb *inp, *inp_temp;
1243 
1244 	INP_INFO_WLOCK(pcbinfo);
1245 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1246 		INP_WLOCK(inp);
1247 #ifdef INET6
1248 		if ((inp->inp_vflag & INP_IPV4) == 0) {
1249 			INP_WUNLOCK(inp);
1250 			continue;
1251 		}
1252 #endif
1253 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
1254 		    inp->inp_socket == NULL) {
1255 			INP_WUNLOCK(inp);
1256 			continue;
1257 		}
1258 		if ((*notify)(inp, errno))
1259 			INP_WUNLOCK(inp);
1260 	}
1261 	INP_INFO_WUNLOCK(pcbinfo);
1262 }
1263 
1264 void
1265 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1266 {
1267 	struct inpcb *inp;
1268 	struct ip_moptions *imo;
1269 	int i, gap;
1270 
1271 	INP_INFO_RLOCK(pcbinfo);
1272 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1273 		INP_WLOCK(inp);
1274 		imo = inp->inp_moptions;
1275 		if ((inp->inp_vflag & INP_IPV4) &&
1276 		    imo != NULL) {
1277 			/*
1278 			 * Unselect the outgoing interface if it is being
1279 			 * detached.
1280 			 */
1281 			if (imo->imo_multicast_ifp == ifp)
1282 				imo->imo_multicast_ifp = NULL;
1283 
1284 			/*
1285 			 * Drop multicast group membership if we joined
1286 			 * through the interface being detached.
1287 			 */
1288 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
1289 			    i++) {
1290 				if (imo->imo_membership[i]->inm_ifp == ifp) {
1291 					in_delmulti(imo->imo_membership[i]);
1292 					gap++;
1293 				} else if (gap != 0)
1294 					imo->imo_membership[i - gap] =
1295 					    imo->imo_membership[i];
1296 			}
1297 			imo->imo_num_memberships -= gap;
1298 		}
1299 		INP_WUNLOCK(inp);
1300 	}
1301 	INP_INFO_RUNLOCK(pcbinfo);
1302 }
1303 
1304 /*
1305  * Lookup a PCB based on the local address and port.
1306  */
1307 #define INP_LOOKUP_MAPPED_PCB_COST	3
1308 struct inpcb *
1309 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1310     u_short lport, int wild_okay, struct ucred *cred)
1311 {
1312 	struct inpcb *inp;
1313 #ifdef INET6
1314 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1315 #else
1316 	int matchwild = 3;
1317 #endif
1318 	int wildcard;
1319 
1320 	INP_INFO_LOCK_ASSERT(pcbinfo);
1321 
1322 	if (!wild_okay) {
1323 		struct inpcbhead *head;
1324 		/*
1325 		 * Look for an unconnected (wildcard foreign addr) PCB that
1326 		 * matches the local address and port we're looking for.
1327 		 */
1328 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1329 		    0, pcbinfo->ipi_hashmask)];
1330 		LIST_FOREACH(inp, head, inp_hash) {
1331 #ifdef INET6
1332 			/* XXX inp locking */
1333 			if ((inp->inp_vflag & INP_IPV4) == 0)
1334 				continue;
1335 #endif
1336 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1337 			    inp->inp_laddr.s_addr == laddr.s_addr &&
1338 			    inp->inp_lport == lport) {
1339 				/*
1340 				 * Found?
1341 				 */
1342 				if (cred == NULL ||
1343 				    prison_equal_ip4(cred->cr_prison,
1344 					inp->inp_cred->cr_prison))
1345 					return (inp);
1346 			}
1347 		}
1348 		/*
1349 		 * Not found.
1350 		 */
1351 		return (NULL);
1352 	} else {
1353 		struct inpcbporthead *porthash;
1354 		struct inpcbport *phd;
1355 		struct inpcb *match = NULL;
1356 		/*
1357 		 * Best fit PCB lookup.
1358 		 *
1359 		 * First see if this local port is in use by looking on the
1360 		 * port hash list.
1361 		 */
1362 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1363 		    pcbinfo->ipi_porthashmask)];
1364 		LIST_FOREACH(phd, porthash, phd_hash) {
1365 			if (phd->phd_port == lport)
1366 				break;
1367 		}
1368 		if (phd != NULL) {
1369 			/*
1370 			 * Port is in use by one or more PCBs. Look for best
1371 			 * fit.
1372 			 */
1373 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1374 				wildcard = 0;
1375 				if (cred != NULL &&
1376 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
1377 					cred->cr_prison))
1378 					continue;
1379 #ifdef INET6
1380 				/* XXX inp locking */
1381 				if ((inp->inp_vflag & INP_IPV4) == 0)
1382 					continue;
1383 				/*
1384 				 * We never select the PCB that has
1385 				 * INP_IPV6 flag and is bound to :: if
1386 				 * we have another PCB which is bound
1387 				 * to 0.0.0.0.  If a PCB has the
1388 				 * INP_IPV6 flag, then we set its cost
1389 				 * higher than IPv4 only PCBs.
1390 				 *
1391 				 * Note that the case only happens
1392 				 * when a socket is bound to ::, under
1393 				 * the condition that the use of the
1394 				 * mapped address is allowed.
1395 				 */
1396 				if ((inp->inp_vflag & INP_IPV6) != 0)
1397 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1398 #endif
1399 				if (inp->inp_faddr.s_addr != INADDR_ANY)
1400 					wildcard++;
1401 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
1402 					if (laddr.s_addr == INADDR_ANY)
1403 						wildcard++;
1404 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
1405 						continue;
1406 				} else {
1407 					if (laddr.s_addr != INADDR_ANY)
1408 						wildcard++;
1409 				}
1410 				if (wildcard < matchwild) {
1411 					match = inp;
1412 					matchwild = wildcard;
1413 					if (matchwild == 0)
1414 						break;
1415 				}
1416 			}
1417 		}
1418 		return (match);
1419 	}
1420 }
1421 #undef INP_LOOKUP_MAPPED_PCB_COST
1422 
1423 /*
1424  * Lookup PCB in hash list.
1425  */
1426 struct inpcb *
1427 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1428     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
1429     struct ifnet *ifp)
1430 {
1431 	struct inpcbhead *head;
1432 	struct inpcb *inp, *tmpinp;
1433 	u_short fport = fport_arg, lport = lport_arg;
1434 
1435 	INP_INFO_LOCK_ASSERT(pcbinfo);
1436 
1437 	/*
1438 	 * First look for an exact match.
1439 	 */
1440 	tmpinp = NULL;
1441 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1442 	    pcbinfo->ipi_hashmask)];
1443 	LIST_FOREACH(inp, head, inp_hash) {
1444 #ifdef INET6
1445 		/* XXX inp locking */
1446 		if ((inp->inp_vflag & INP_IPV4) == 0)
1447 			continue;
1448 #endif
1449 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1450 		    inp->inp_laddr.s_addr == laddr.s_addr &&
1451 		    inp->inp_fport == fport &&
1452 		    inp->inp_lport == lport) {
1453 			/*
1454 			 * XXX We should be able to directly return
1455 			 * the inp here, without any checks.
1456 			 * Well unless both bound with SO_REUSEPORT?
1457 			 */
1458 			if (prison_flag(inp->inp_cred, PR_IP4))
1459 				return (inp);
1460 			if (tmpinp == NULL)
1461 				tmpinp = inp;
1462 		}
1463 	}
1464 	if (tmpinp != NULL)
1465 		return (tmpinp);
1466 
1467 	/*
1468 	 * Then look for a wildcard match, if requested.
1469 	 */
1470 	if (wildcard == INPLOOKUP_WILDCARD) {
1471 		struct inpcb *local_wild = NULL, *local_exact = NULL;
1472 #ifdef INET6
1473 		struct inpcb *local_wild_mapped = NULL;
1474 #endif
1475 		struct inpcb *jail_wild = NULL;
1476 		int injail;
1477 
1478 		/*
1479 		 * Order of socket selection - we always prefer jails.
1480 		 *      1. jailed, non-wild.
1481 		 *      2. jailed, wild.
1482 		 *      3. non-jailed, non-wild.
1483 		 *      4. non-jailed, wild.
1484 		 */
1485 
1486 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1487 		    0, pcbinfo->ipi_hashmask)];
1488 		LIST_FOREACH(inp, head, inp_hash) {
1489 #ifdef INET6
1490 			/* XXX inp locking */
1491 			if ((inp->inp_vflag & INP_IPV4) == 0)
1492 				continue;
1493 #endif
1494 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
1495 			    inp->inp_lport != lport)
1496 				continue;
1497 
1498 			/* XXX inp locking */
1499 			if (ifp && ifp->if_type == IFT_FAITH &&
1500 			    (inp->inp_flags & INP_FAITH) == 0)
1501 				continue;
1502 
1503 			injail = prison_flag(inp->inp_cred, PR_IP4);
1504 			if (injail) {
1505 				if (prison_check_ip4(inp->inp_cred,
1506 				    &laddr) != 0)
1507 					continue;
1508 			} else {
1509 				if (local_exact != NULL)
1510 					continue;
1511 			}
1512 
1513 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
1514 				if (injail)
1515 					return (inp);
1516 				else
1517 					local_exact = inp;
1518 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1519 #ifdef INET6
1520 				/* XXX inp locking, NULL check */
1521 				if (inp->inp_vflag & INP_IPV6PROTO)
1522 					local_wild_mapped = inp;
1523 				else
1524 #endif /* INET6 */
1525 					if (injail)
1526 						jail_wild = inp;
1527 					else
1528 						local_wild = inp;
1529 			}
1530 		} /* LIST_FOREACH */
1531 		if (jail_wild != NULL)
1532 			return (jail_wild);
1533 		if (local_exact != NULL)
1534 			return (local_exact);
1535 		if (local_wild != NULL)
1536 			return (local_wild);
1537 #ifdef INET6
1538 		if (local_wild_mapped != NULL)
1539 			return (local_wild_mapped);
1540 #endif /* defined(INET6) */
1541 	} /* if (wildcard == INPLOOKUP_WILDCARD) */
1542 
1543 	return (NULL);
1544 }
1545 #endif /* INET */
1546 
1547 /*
1548  * Insert PCB onto various hash lists.
1549  */
1550 int
1551 in_pcbinshash(struct inpcb *inp)
1552 {
1553 	struct inpcbhead *pcbhash;
1554 	struct inpcbporthead *pcbporthash;
1555 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1556 	struct inpcbport *phd;
1557 	u_int32_t hashkey_faddr;
1558 
1559 	INP_INFO_WLOCK_ASSERT(pcbinfo);
1560 	INP_WLOCK_ASSERT(inp);
1561 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
1562 	    ("in_pcbinshash: INP_INHASHLIST"));
1563 
1564 #ifdef INET6
1565 	if (inp->inp_vflag & INP_IPV6)
1566 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1567 	else
1568 #endif /* INET6 */
1569 	hashkey_faddr = inp->inp_faddr.s_addr;
1570 
1571 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1572 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1573 
1574 	pcbporthash = &pcbinfo->ipi_porthashbase[
1575 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
1576 
1577 	/*
1578 	 * Go through port list and look for a head for this lport.
1579 	 */
1580 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
1581 		if (phd->phd_port == inp->inp_lport)
1582 			break;
1583 	}
1584 	/*
1585 	 * If none exists, malloc one and tack it on.
1586 	 */
1587 	if (phd == NULL) {
1588 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1589 		if (phd == NULL) {
1590 			return (ENOBUFS); /* XXX */
1591 		}
1592 		phd->phd_port = inp->inp_lport;
1593 		LIST_INIT(&phd->phd_pcblist);
1594 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1595 	}
1596 	inp->inp_phd = phd;
1597 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1598 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1599 	inp->inp_flags |= INP_INHASHLIST;
1600 	return (0);
1601 }
1602 
1603 /*
1604  * Move PCB to the proper hash bucket when { faddr, fport } have  been
1605  * changed. NOTE: This does not handle the case of the lport changing (the
1606  * hashed port list would have to be updated as well), so the lport must
1607  * not change after in_pcbinshash() has been called.
1608  */
1609 void
1610 in_pcbrehash(struct inpcb *inp)
1611 {
1612 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1613 	struct inpcbhead *head;
1614 	u_int32_t hashkey_faddr;
1615 
1616 	INP_INFO_WLOCK_ASSERT(pcbinfo);
1617 	INP_WLOCK_ASSERT(inp);
1618 	KASSERT(inp->inp_flags & INP_INHASHLIST,
1619 	    ("in_pcbrehash: !INP_INHASHLIST"));
1620 
1621 #ifdef INET6
1622 	if (inp->inp_vflag & INP_IPV6)
1623 		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1624 	else
1625 #endif /* INET6 */
1626 	hashkey_faddr = inp->inp_faddr.s_addr;
1627 
1628 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1629 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1630 
1631 	LIST_REMOVE(inp, inp_hash);
1632 	LIST_INSERT_HEAD(head, inp, inp_hash);
1633 }
1634 
1635 /*
1636  * Remove PCB from various lists.
1637  */
1638 static void
1639 in_pcbremlists(struct inpcb *inp)
1640 {
1641 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1642 
1643 	INP_INFO_WLOCK_ASSERT(pcbinfo);
1644 	INP_WLOCK_ASSERT(inp);
1645 
1646 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1647 	if (inp->inp_flags & INP_INHASHLIST) {
1648 		struct inpcbport *phd = inp->inp_phd;
1649 
1650 		LIST_REMOVE(inp, inp_hash);
1651 		LIST_REMOVE(inp, inp_portlist);
1652 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1653 			LIST_REMOVE(phd, phd_hash);
1654 			free(phd, M_PCB);
1655 		}
1656 		inp->inp_flags &= ~INP_INHASHLIST;
1657 	}
1658 	LIST_REMOVE(inp, inp_list);
1659 	pcbinfo->ipi_count--;
1660 }
1661 
/*
 * A set label operation has occurred at the socket layer; propagate the
 * label change into the in_pcb for the socket.  The inpcb write lock is
 * taken before the socket lock.  Without MAC compiled in this function does
 * nothing.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
1682 
1683 /*
1684  * ipport_tick runs once per second, determining if random port allocation
1685  * should be continued.  If more than ipport_randomcps ports have been
1686  * allocated in the last second, then we return to sequential port
1687  * allocation. We return to random allocation only once we drop below
1688  * ipport_randomcps for at least ipport_randomtime seconds.
1689  */
1690 static void
1691 ipport_tick(void *xtp)
1692 {
1693 	VNET_ITERATOR_DECL(vnet_iter);
1694 
1695 	VNET_LIST_RLOCK_NOSLEEP();
1696 	VNET_FOREACH(vnet_iter) {
1697 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
1698 		if (V_ipport_tcpallocs <=
1699 		    V_ipport_tcplastcount + V_ipport_randomcps) {
1700 			if (V_ipport_stoprandom > 0)
1701 				V_ipport_stoprandom--;
1702 		} else
1703 			V_ipport_stoprandom = V_ipport_randomtime;
1704 		V_ipport_tcplastcount = V_ipport_tcpallocs;
1705 		CURVNET_RESTORE();
1706 	}
1707 	VNET_LIST_RUNLOCK_NOSLEEP();
1708 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
1709 }
1710 
1711 static void
1712 ip_fini(void *xtp)
1713 {
1714 
1715 	callout_stop(&ipport_tick_callout);
1716 }
1717 
1718 /*
1719  * The ipport_callout should start running at about the time we attach the
1720  * inet or inet6 domains.
1721  */
1722 static void
1723 ipport_tick_init(const void *unused __unused)
1724 {
1725 
1726 	/* Start ipport_tick. */
1727 	callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
1728 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
1729 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
1730 		SHUTDOWN_PRI_DEFAULT);
1731 }
1732 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
1733     ipport_tick_init, NULL);
1734 
/*
 * Function form of the INP_WLOCK() macro: write-lock the inpcb.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
1741 
/*
 * Function form of the INP_WUNLOCK() macro: release the inpcb write lock.
 */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
1748 
/*
 * Function form of the INP_RLOCK() macro: read-lock the inpcb.
 */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
1755 
/*
 * Function form of the INP_RUNLOCK() macro: release the inpcb read lock.
 */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
1762 
1763 #ifdef INVARIANTS
/*
 * Function form of the INP_WLOCK_ASSERT() macro.  Note that, despite the
 * generic name, it is the write lock that is asserted.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
1770 
/*
 * Function form of the INP_UNLOCK_ASSERT() macro.
 */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
1777 #endif
1778 
1779 void
1780 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
1781 {
1782 	struct inpcb *inp;
1783 
1784 	INP_INFO_RLOCK(&V_tcbinfo);
1785 	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
1786 		INP_WLOCK(inp);
1787 		func(inp, arg);
1788 		INP_WUNLOCK(inp);
1789 	}
1790 	INP_INFO_RUNLOCK(&V_tcbinfo);
1791 }
1792 
1793 struct socket *
1794 inp_inpcbtosocket(struct inpcb *inp)
1795 {
1796 
1797 	INP_WLOCK_ASSERT(inp);
1798 	return (inp->inp_socket);
1799 }
1800 
1801 struct tcpcb *
1802 inp_inpcbtotcpcb(struct inpcb *inp)
1803 {
1804 
1805 	INP_WLOCK_ASSERT(inp);
1806 	return ((struct tcpcb *)inp->inp_ppcb);
1807 }
1808 
1809 int
1810 inp_ip_tos_get(const struct inpcb *inp)
1811 {
1812 
1813 	return (inp->inp_ip_tos);
1814 }
1815 
1816 void
1817 inp_ip_tos_set(struct inpcb *inp, int val)
1818 {
1819 
1820 	inp->inp_ip_tos = val;
1821 }
1822 
1823 void
1824 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
1825     uint32_t *faddr, uint16_t *fp)
1826 {
1827 
1828 	INP_LOCK_ASSERT(inp);
1829 	*laddr = inp->inp_laddr.s_addr;
1830 	*faddr = inp->inp_faddr.s_addr;
1831 	*lp = inp->inp_lport;
1832 	*fp = inp->inp_fport;
1833 }
1834 
/*
 * Function form of sotoinpcb(): return the inpcb associated with a socket.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{

	return (sotoinpcb(so));
}
1841 
/*
 * Function form of sototcpcb(): return the tcpcb associated with a socket.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{

	return (sototcpcb(so));
}
1848 
1849 #ifdef DDB
/*
 * Emit `indent' space characters through db_printf().  Nothing is printed
 * for a zero or negative indent.
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
1858 
1859 static void
1860 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
1861 {
1862 	char faddr_str[48], laddr_str[48];
1863 
1864 	db_print_indent(indent);
1865 	db_printf("%s at %p\n", name, inc);
1866 
1867 	indent += 2;
1868 
1869 #ifdef INET6
1870 	if (inc->inc_flags & INC_ISIPV6) {
1871 		/* IPv6. */
1872 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
1873 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
1874 	} else {
1875 #endif
1876 		/* IPv4. */
1877 		inet_ntoa_r(inc->inc_laddr, laddr_str);
1878 		inet_ntoa_r(inc->inc_faddr, faddr_str);
1879 #ifdef INET6
1880 	}
1881 #endif
1882 	db_print_indent(indent);
1883 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
1884 	    ntohs(inc->inc_lport));
1885 	db_print_indent(indent);
1886 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
1887 	    ntohs(inc->inc_fport));
1888 }
1889 
1890 static void
1891 db_print_inpflags(int inp_flags)
1892 {
1893 	int comma;
1894 
1895 	comma = 0;
1896 	if (inp_flags & INP_RECVOPTS) {
1897 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
1898 		comma = 1;
1899 	}
1900 	if (inp_flags & INP_RECVRETOPTS) {
1901 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
1902 		comma = 1;
1903 	}
1904 	if (inp_flags & INP_RECVDSTADDR) {
1905 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
1906 		comma = 1;
1907 	}
1908 	if (inp_flags & INP_HDRINCL) {
1909 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
1910 		comma = 1;
1911 	}
1912 	if (inp_flags & INP_HIGHPORT) {
1913 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
1914 		comma = 1;
1915 	}
1916 	if (inp_flags & INP_LOWPORT) {
1917 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
1918 		comma = 1;
1919 	}
1920 	if (inp_flags & INP_ANONPORT) {
1921 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
1922 		comma = 1;
1923 	}
1924 	if (inp_flags & INP_RECVIF) {
1925 		db_printf("%sINP_RECVIF", comma ? ", " : "");
1926 		comma = 1;
1927 	}
1928 	if (inp_flags & INP_MTUDISC) {
1929 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
1930 		comma = 1;
1931 	}
1932 	if (inp_flags & INP_FAITH) {
1933 		db_printf("%sINP_FAITH", comma ? ", " : "");
1934 		comma = 1;
1935 	}
1936 	if (inp_flags & INP_RECVTTL) {
1937 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
1938 		comma = 1;
1939 	}
1940 	if (inp_flags & INP_DONTFRAG) {
1941 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
1942 		comma = 1;
1943 	}
1944 	if (inp_flags & IN6P_IPV6_V6ONLY) {
1945 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
1946 		comma = 1;
1947 	}
1948 	if (inp_flags & IN6P_PKTINFO) {
1949 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
1950 		comma = 1;
1951 	}
1952 	if (inp_flags & IN6P_HOPLIMIT) {
1953 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
1954 		comma = 1;
1955 	}
1956 	if (inp_flags & IN6P_HOPOPTS) {
1957 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
1958 		comma = 1;
1959 	}
1960 	if (inp_flags & IN6P_DSTOPTS) {
1961 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
1962 		comma = 1;
1963 	}
1964 	if (inp_flags & IN6P_RTHDR) {
1965 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
1966 		comma = 1;
1967 	}
1968 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
1969 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
1970 		comma = 1;
1971 	}
1972 	if (inp_flags & IN6P_TCLASS) {
1973 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
1974 		comma = 1;
1975 	}
1976 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
1977 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
1978 		comma = 1;
1979 	}
1980 	if (inp_flags & INP_TIMEWAIT) {
1981 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
1982 		comma  = 1;
1983 	}
1984 	if (inp_flags & INP_ONESBCAST) {
1985 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
1986 		comma  = 1;
1987 	}
1988 	if (inp_flags & INP_DROPPED) {
1989 		db_printf("%sINP_DROPPED", comma ? ", " : "");
1990 		comma  = 1;
1991 	}
1992 	if (inp_flags & INP_SOCKREF) {
1993 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
1994 		comma  = 1;
1995 	}
1996 	if (inp_flags & IN6P_RFC2292) {
1997 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
1998 		comma = 1;
1999 	}
2000 	if (inp_flags & IN6P_MTU) {
2001 		db_printf("IN6P_MTU%s", comma ? ", " : "");
2002 		comma = 1;
2003 	}
2004 }
2005 
2006 static void
2007 db_print_inpvflag(u_char inp_vflag)
2008 {
2009 	int comma;
2010 
2011 	comma = 0;
2012 	if (inp_vflag & INP_IPV4) {
2013 		db_printf("%sINP_IPV4", comma ? ", " : "");
2014 		comma  = 1;
2015 	}
2016 	if (inp_vflag & INP_IPV6) {
2017 		db_printf("%sINP_IPV6", comma ? ", " : "");
2018 		comma  = 1;
2019 	}
2020 	if (inp_vflag & INP_IPV6PROTO) {
2021 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
2022 		comma  = 1;
2023 	}
2024 }
2025 
2026 static void
2027 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
2028 {
2029 
2030 	db_print_indent(indent);
2031 	db_printf("%s at %p\n", name, inp);
2032 
2033 	indent += 2;
2034 
2035 	db_print_indent(indent);
2036 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
2037 
2038 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
2039 
2040 	db_print_indent(indent);
2041 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
2042 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
2043 
2044 	db_print_indent(indent);
2045 	db_printf("inp_label: %p   inp_flags: 0x%x (",
2046 	   inp->inp_label, inp->inp_flags);
2047 	db_print_inpflags(inp->inp_flags);
2048 	db_printf(")\n");
2049 
2050 	db_print_indent(indent);
2051 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
2052 	    inp->inp_vflag);
2053 	db_print_inpvflag(inp->inp_vflag);
2054 	db_printf(")\n");
2055 
2056 	db_print_indent(indent);
2057 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
2058 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
2059 
2060 	db_print_indent(indent);
2061 #ifdef INET6
2062 	if (inp->inp_vflag & INP_IPV6) {
2063 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
2064 		    "in6p_moptions: %p\n", inp->in6p_options,
2065 		    inp->in6p_outputopts, inp->in6p_moptions);
2066 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
2067 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
2068 		    inp->in6p_hops);
2069 	} else
2070 #endif
2071 	{
2072 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
2073 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
2074 		    inp->inp_options, inp->inp_moptions);
2075 	}
2076 
2077 	db_print_indent(indent);
2078 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
2079 	    (uintmax_t)inp->inp_gencnt);
2080 }
2081 
2082 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
2083 {
2084 	struct inpcb *inp;
2085 
2086 	if (!have_addr) {
2087 		db_printf("usage: show inpcb <addr>\n");
2088 		return;
2089 	}
2090 	inp = (struct inpcb *)addr;
2091 
2092 	db_print_inpcb(inp, "inpcb", 0);
2093 }
2094 #endif
2095