xref: /freebsd/sys/netinet/in_pcb.c (revision 024248c933c5741a21c17eda63092f330dd98337)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #include "opt_ddb.h"
41 #include "opt_ipsec.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ratelimit.h"
45 #include "opt_route.h"
46 #include "opt_rss.h"
47 
48 #include <sys/param.h>
49 #include <sys/hash.h>
50 #include <sys/systm.h>
51 #include <sys/libkern.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/eventhandler.h>
56 #include <sys/domain.h>
57 #include <sys/proc.h>
58 #include <sys/protosw.h>
59 #include <sys/smp.h>
60 #include <sys/smr.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <sys/priv.h>
65 #include <sys/proc.h>
66 #include <sys/refcount.h>
67 #include <sys/jail.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74 
75 #include <vm/uma.h>
76 #include <vm/vm.h>
77 
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/if_private.h>
81 #include <net/if_types.h>
82 #include <net/if_llatbl.h>
83 #include <net/route.h>
84 #include <net/rss_config.h>
85 #include <net/vnet.h>
86 
87 #if defined(INET) || defined(INET6)
88 #include <netinet/in.h>
89 #include <netinet/in_pcb.h>
90 #include <netinet/in_pcb_var.h>
91 #include <netinet/tcp.h>
92 #ifdef INET
93 #include <netinet/in_var.h>
94 #include <netinet/in_fib.h>
95 #endif
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/in6_pcb.h>
100 #include <netinet6/in6_var.h>
101 #include <netinet6/ip6_var.h>
102 #endif /* INET6 */
103 #include <net/route/nhop.h>
104 #endif
105 
106 #include <netipsec/ipsec_support.h>
107 
108 #include <security/mac/mac_framework.h>
109 
110 #define	INPCBLBGROUP_SIZMIN	8
111 #define	INPCBLBGROUP_SIZMAX	256
112 
113 #define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
114 #define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */
115 
116 /*
117  * These configure the range of local port addresses assigned to
118  * "unspecified" outgoing connections/packets/whatever.
119  */
120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
126 
127 /*
128  * Reserved ports accessible only to root. There are significant
129  * security considerations that must be accounted for when changing these,
130  * but the security benefits can be great. Please be careful.
131  */
132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
133 VNET_DEFINE(int, ipport_reservedlow);
134 
135 /* Enable random ephemeral port allocation by default. */
136 VNET_DEFINE(int, ipport_randomized) = 1;
137 
138 #ifdef INET
139 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
140 			    struct in_addr faddr, u_int fport_arg,
141 			    struct in_addr laddr, u_int lport_arg,
142 			    int lookupflags, uint8_t numa_domain);
143 
144 #define RANGECHK(var, min, max) \
145 	if ((var) < (min)) { (var) = (min); } \
146 	else if ((var) > (max)) { (var) = (max); }
147 
148 static int
149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
150 {
151 	int error;
152 
153 	error = sysctl_handle_int(oidp, arg1, arg2, req);
154 	if (error == 0) {
155 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
156 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
157 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
158 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
159 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
160 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
161 	}
162 	return (error);
163 }
164 
165 #undef RANGECHK
166 
167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
168     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
169     "IP Ports");
170 
171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
172     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
173     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
174     "");
175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
176     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
177     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
178     "");
179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
180     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
181     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
182     "");
183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
184     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
185     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
186     "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
188     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
189     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
190     "");
191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
192     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
193     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
194     "");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
196 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
197 	&VNET_NAME(ipport_reservedhigh), 0, "");
198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
199 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
201 	CTLFLAG_VNET | CTLFLAG_RW,
202 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
203 
204 #ifdef RATELIMIT
205 counter_u64_t rate_limit_new;
206 counter_u64_t rate_limit_chg;
207 counter_u64_t rate_limit_active;
208 counter_u64_t rate_limit_alloc_fail;
209 counter_u64_t rate_limit_set_ok;
210 
211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
212     "IP Rate Limiting");
213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
214     &rate_limit_active, "Active rate limited connections");
215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
216    &rate_limit_alloc_fail, "Rate limited connection failures");
217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
218    &rate_limit_set_ok, "Rate limited setting succeeded");
219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
220    &rate_limit_new, "Total Rate limit new attempts");
221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
222    &rate_limit_chg, "Total Rate limited change attempts");
223 #endif /* RATELIMIT */
224 
225 #endif /* INET */
226 
227 VNET_DEFINE(uint32_t, in_pcbhashseed);
/*
 * Seed the per-VNET inpcb hash with a random value.  Runs once per VNET
 * at SI_SUB_PROTO_DOMAIN / SI_ORDER_FIRST via the VNET_SYSINIT below.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
234 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
235     in_pcbhashseed_init, 0);
236 
237 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
238 #define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
239 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
240     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
241     "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
242 
243 static void in_pcbremhash(struct inpcb *);
244 
245 /*
246  * in_pcb.c: manage the Protocol Control Blocks.
247  *
248  * NOTE: It is assumed that most of these functions will be called with
249  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
250  * functions often modify hash chains or addresses in pcbs.
251  */
252 
253 static struct inpcblbgroup *
254 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
255     u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
256     uint8_t numa_domain)
257 {
258 	struct inpcblbgroup *grp;
259 	size_t bytes;
260 
261 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
262 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
263 	if (grp == NULL)
264 		return (NULL);
265 	grp->il_cred = crhold(cred);
266 	grp->il_vflag = vflag;
267 	grp->il_lport = port;
268 	grp->il_numa_domain = numa_domain;
269 	grp->il_dependladdr = *addr;
270 	grp->il_inpsiz = size;
271 	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
272 	return (grp);
273 }
274 
275 static void
276 in_pcblbgroup_free_deferred(epoch_context_t ctx)
277 {
278 	struct inpcblbgroup *grp;
279 
280 	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
281 	crfree(grp->il_cred);
282 	free(grp, M_PCB);
283 }
284 
/*
 * Unlink a load balance group from its hash chain and defer the actual
 * release to the end of the current network epoch, so that lockless
 * readers still traversing the chain never touch freed memory.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
292 
293 static struct inpcblbgroup *
294 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
295     struct inpcblbgroup *old_grp, int size)
296 {
297 	struct inpcblbgroup *grp;
298 	int i;
299 
300 	grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag,
301 	    old_grp->il_lport, &old_grp->il_dependladdr, size,
302 	    old_grp->il_numa_domain);
303 	if (grp == NULL)
304 		return (NULL);
305 
306 	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
307 	    ("invalid new local group size %d and old local group count %d",
308 	     grp->il_inpsiz, old_grp->il_inpcnt));
309 
310 	for (i = 0; i < old_grp->il_inpcnt; ++i)
311 		grp->il_inp[i] = old_grp->il_inp[i];
312 	grp->il_inpcnt = old_grp->il_inpcnt;
313 	in_pcblbgroup_free(old_grp);
314 	return (grp);
315 }
316 
317 /*
318  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
319  * and shrink group if possible.
320  */
321 static void
322 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
323     int i)
324 {
325 	struct inpcblbgroup *grp, *new_grp;
326 
327 	grp = *grpp;
328 	for (; i + 1 < grp->il_inpcnt; ++i)
329 		grp->il_inp[i] = grp->il_inp[i + 1];
330 	grp->il_inpcnt--;
331 
332 	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
333 	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
334 		/* Shrink this group. */
335 		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
336 		if (new_grp != NULL)
337 			*grpp = new_grp;
338 	}
339 }
340 
341 /*
342  * Add PCB to load balance group for SO_REUSEPORT_LB option.
343  */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	/* Rate limit for the "limit reached" console message below. */
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	/*
	 * Look for an existing group matching this PCB's jail, address
	 * family flags, local port, NUMA domain and local address.
	 */
	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain);
		if (grp == NULL)
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/*
			 * Group is full and already at maximum size: the
			 * PCB is not added (deliberately not an error) and
			 * we complain at most once per 'interval'.
			 */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));

	/* Append the PCB to the group and mark it as a member. */
	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	inp->inp_flags |= INP_INLBGROUP;
	return (0);
}
413 
414 /*
415  * Remove PCB from load balance group.
416  */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	/* Caller guarantees membership; see INP_INLBGROUP in insert path. */
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* Scan every group on this port's hash chain for the PCB. */
	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Pull up inpcbs, shrink group if possible. */
				in_pcblbgroup_reorder(hdr, &grp, i);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
	}
	/* INP_INLBGROUP was set, so the PCB must have been found above. */
	KASSERT(0, ("%s: did not find %p", __func__, inp));
}
451 
452 int
453 in_pcblbgroup_numa(struct inpcb *inp, int arg)
454 {
455 	struct inpcbinfo *pcbinfo;
456 	struct inpcblbgrouphead *hdr;
457 	struct inpcblbgroup *grp;
458 	int err, i;
459 	uint8_t numa_domain;
460 
461 	switch (arg) {
462 	case TCP_REUSPORT_LB_NUMA_NODOM:
463 		numa_domain = M_NODOM;
464 		break;
465 	case TCP_REUSPORT_LB_NUMA_CURDOM:
466 		numa_domain = PCPU_GET(domain);
467 		break;
468 	default:
469 		if (arg < 0 || arg >= vm_ndomains)
470 			return (EINVAL);
471 		numa_domain = arg;
472 	}
473 
474 	err = 0;
475 	pcbinfo = inp->inp_pcbinfo;
476 	INP_WLOCK_ASSERT(inp);
477 	INP_HASH_WLOCK(pcbinfo);
478 	hdr = &pcbinfo->ipi_lbgrouphashbase[
479 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
480 	CK_LIST_FOREACH(grp, hdr, il_list) {
481 		for (i = 0; i < grp->il_inpcnt; ++i) {
482 			if (grp->il_inp[i] != inp)
483 				continue;
484 
485 			if (grp->il_numa_domain == numa_domain) {
486 				goto abort_with_hash_wlock;
487 			}
488 
489 			/* Remove it from the old group. */
490 			in_pcbremlbgrouphash(inp);
491 
492 			/* Add it to the new group based on numa domain. */
493 			in_pcbinslbgrouphash(inp, numa_domain);
494 			goto abort_with_hash_wlock;
495 		}
496 	}
497 	err = ENOENT;
498 abort_with_hash_wlock:
499 	INP_HASH_WUNLOCK(pcbinfo);
500 	return (err);
501 }
502 
503 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
504 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
505 
506 /*
507  * Initialize an inpcbinfo - a per-VNET instance of connections db.
508  */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/*
	 * The exact and wild connection hashes are the same size and share
	 * ipi_hashmask: the second hashinit() stores an identical mask.
	 */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* No point in more port hash buckets than there are ports. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	/* Zones are owned by the protocol's inpcbstorage, not by us. */
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
535 
536 /*
537  * Destroy an inpcbinfo.
538  */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	/* Every PCB must have been freed before the info is torn down. */
	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Release the four hash tables allocated by in_pcbinfo_init(). */
	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
555 
556 /*
557  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
558  */
559 static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/*
	 * SMR-managed zone for inpcbs; ips_pcbinit is the protocol's init
	 * hook and inpcb_fini (defined elsewhere in this file) the fini.
	 */
	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* The port zone shares the inpcb zone's SMR domain. */
	uma_zone_set_smr(pcbstor->ips_portzone,
	    uma_zone_get_smr(pcbstor->ips_zone));
}
573 
574 /*
575  * Destroy a pcbstorage - used by unloadable protocols.
576  */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/* Tear down both UMA zones created by in_pcbstorage_init(). */
	uma_zdestroy(pcbstor->ips_zone);
	uma_zdestroy(pcbstor->ips_portzone);
}
585 
586 /*
587  * Allocate a PCB and associate it with the socket.
588  * On success return with the PCB locked.
589  */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* The zone keeps state before inp_start_zero; clear only the rest. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	/* Hold the socket's credential for the lifetime of the PCB. */
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	INP_WLOCK(inp);
	/* Publish the PCB on the global per-pcbinfo list. */
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	/* Success: return with the inpcb write lock held. */
	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* MAC/IPsec setup failed: drop the cred ref and the zone item. */
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
672 
673 #ifdef INET
674 int
675 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
676 {
677 	int anonport, error;
678 
679 	KASSERT(sin == NULL || sin->sin_family == AF_INET,
680 	    ("%s: invalid address family for %p", __func__, sin));
681 	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
682 	    ("%s: invalid address length for %p", __func__, sin));
683 	INP_WLOCK_ASSERT(inp);
684 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
685 
686 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
687 		return (EINVAL);
688 	anonport = sin == NULL || sin->sin_port == 0;
689 	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
690 	    &inp->inp_lport, cred);
691 	if (error)
692 		return (error);
693 	if (in_pcbinshash(inp) != 0) {
694 		inp->inp_laddr.s_addr = INADDR_ANY;
695 		inp->inp_lport = 0;
696 		return (EAGAIN);
697 	}
698 	if (anonport)
699 		inp->inp_flags |= INP_ANONPORT;
700 	return (0);
701 }
702 #endif
703 
704 #if defined(INET) || defined(INET6)
705 /*
706  * Assign a local port like in_pcb_lport(), but also used with connect()
707  * and a foreign address and port.  If fsa is non-NULL, choose a local port
708  * that is unused with those, otherwise one that is completely unused.
709  * lsa can be NULL for IPv6.
710  */
711 int
712 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
713     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
714 {
715 	struct inpcbinfo *pcbinfo;
716 	struct inpcb *tmpinp;
717 	unsigned short *lastport;
718 	int count, error;
719 	u_short aux, first, last, lport;
720 #ifdef INET
721 	struct in_addr laddr, faddr;
722 #endif
723 #ifdef INET6
724 	struct in6_addr *laddr6, *faddr6;
725 #endif
726 
727 	pcbinfo = inp->inp_pcbinfo;
728 
729 	/*
730 	 * Because no actual state changes occur here, a global write lock on
731 	 * the pcbinfo isn't required.
732 	 */
733 	INP_LOCK_ASSERT(inp);
734 	INP_HASH_LOCK_ASSERT(pcbinfo);
735 
736 	if (inp->inp_flags & INP_HIGHPORT) {
737 		first = V_ipport_hifirstauto;	/* sysctl */
738 		last  = V_ipport_hilastauto;
739 		lastport = &pcbinfo->ipi_lasthi;
740 	} else if (inp->inp_flags & INP_LOWPORT) {
741 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
742 		if (error)
743 			return (error);
744 		first = V_ipport_lowfirstauto;	/* 1023 */
745 		last  = V_ipport_lowlastauto;	/* 600 */
746 		lastport = &pcbinfo->ipi_lastlow;
747 	} else {
748 		first = V_ipport_firstauto;	/* sysctl */
749 		last  = V_ipport_lastauto;
750 		lastport = &pcbinfo->ipi_lastport;
751 	}
752 
753 	/*
754 	 * Instead of having two loops further down counting up or down
755 	 * make sure that first is always <= last and go with only one
756 	 * code path implementing all logic.
757 	 */
758 	if (first > last) {
759 		aux = first;
760 		first = last;
761 		last = aux;
762 	}
763 
764 #ifdef INET
765 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
766 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
767 		if (lsa != NULL)
768 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
769 		if (fsa != NULL)
770 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
771 	}
772 #endif
773 #ifdef INET6
774 	laddr6 = NULL;
775 	if ((inp->inp_vflag & INP_IPV6) != 0) {
776 		if (lsa != NULL)
777 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
778 		if (fsa != NULL)
779 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
780 	}
781 #endif
782 
783 	tmpinp = NULL;
784 	lport = *lportp;
785 
786 	if (V_ipport_randomized)
787 		*lastport = first + (arc4random() % (last - first));
788 
789 	count = last - first;
790 
791 	do {
792 		if (count-- < 0)	/* completely used? */
793 			return (EADDRNOTAVAIL);
794 		++*lastport;
795 		if (*lastport < first || *lastport > last)
796 			*lastport = first;
797 		lport = htons(*lastport);
798 
799 		if (fsa != NULL) {
800 #ifdef INET
801 			if (lsa->sa_family == AF_INET) {
802 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
803 				    faddr, fport, laddr, lport, lookupflags,
804 				    M_NODOM);
805 			}
806 #endif
807 #ifdef INET6
808 			if (lsa->sa_family == AF_INET6) {
809 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
810 				    faddr6, fport, laddr6, lport, lookupflags,
811 				    M_NODOM);
812 			}
813 #endif
814 		} else {
815 #ifdef INET6
816 			if ((inp->inp_vflag & INP_IPV6) != 0) {
817 				tmpinp = in6_pcblookup_local(pcbinfo,
818 				    &inp->in6p_laddr, lport, lookupflags, cred);
819 #ifdef INET
820 				if (tmpinp == NULL &&
821 				    (inp->inp_vflag & INP_IPV4))
822 					tmpinp = in_pcblookup_local(pcbinfo,
823 					    laddr, lport, lookupflags, cred);
824 #endif
825 			}
826 #endif
827 #if defined(INET) && defined(INET6)
828 			else
829 #endif
830 #ifdef INET
831 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
832 				    lport, lookupflags, cred);
833 #endif
834 		}
835 	} while (tmpinp != NULL);
836 
837 	*lportp = lport;
838 
839 	return (0);
840 }
841 
842 /*
843  * Select a local port (number) to use.
844  */
845 int
846 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
847     struct ucred *cred, int lookupflags)
848 {
849 	struct sockaddr_in laddr;
850 
851 	if (laddrp) {
852 		bzero(&laddr, sizeof(laddr));
853 		laddr.sin_family = AF_INET;
854 		laddr.sin_addr = *laddrp;
855 	}
856 	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
857 	    NULL, lportp, NULL, 0, cred, lookupflags));
858 }
859 #endif /* INET || INET6 */
860 
861 #ifdef INET
862 /*
863  * Set up a bind operation on a PCB, performing port allocation
864  * as required, but do not actually modify the PCB. Callers can
865  * either complete the bind by setting inp_laddr/inp_lport and
866  * calling in_pcbinshash(), or they can just use the resulting
867  * port and address to authorise the sending of a once-off packet.
868  *
869  * On error, the values of *laddrp and *lportp are not changed.
870  */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	/* A specific sockaddr is only accepted for a still-wild PCB. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, conflict checks must match wildcards. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		/*
		 * An explicit port was requested: enforce reserved-port
		 * privilege and check for conflicts with existing bindings.
		 */
		if (lport) {
			struct inpcb *t;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t != NULL &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_socket->so_options & SO_REUSEPORT) ||
				     (t->inp_socket->so_options & SO_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);
			}
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 &&
			    (reuseport_lb & t->inp_socket->so_options) == 0) {
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
						return (EADDRINUSE);
			}
		}
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: pick an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}
1002 
1003 /*
1004  * Connect from a socket to a specified address.
1005  * Both address and port must be specified in argument sin.
1006  * If don't have a local address for this socket yet,
1007  * then pick one.
1008  */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
    bool rehash __unused)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	/* Remember whether the port was chosen implicitly (lport == 0). */
	anonport = (lport == 0);
	/* Resolve the full 4-tuple; nothing is committed to the inp yet. */
	error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport,
	    cred);
	if (error)
		return (error);

	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/* Hash insertion failed: roll back both endpoints. */
			inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
			    INADDR_ANY;
			inp->inp_lport = inp->inp_fport = 0;
			return (EAGAIN);
		}
	} else {
		/* Already bound: move (or insert) the inp in the hash. */
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if ((inp->inp_flags & INP_INHASHLIST) != 0)
			in_pcbrehash(inp);
		else
			in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1056 
1057 /*
1058  * Do proper source address selection on an unbound socket in case
1059  * of connect. Take jails into account as well.
1060  */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	/* Build a sockaddr_in for the destination to feed the ifa lookups. */
	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* First try a point-to-point destination, then a subnet. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: use the interface address as-is. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: scan the chosen interface for any IPv4 address
		 * that belongs to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		/* Try p2p destination, then subnet, then exact match. */
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Scan that interface for a jail-owned address. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Selecting INADDR_ANY as the source is treated as a failure. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1267 
1268 /*
1269  * Set up for a connect from a socket to the specified address.
1270  * On entry, *laddrp and *lportp should contain the current local
1271  * address and port for the PCB; these are updated to the values
1272  * that should be placed in inp_laddr and inp_lport to complete
1273  * the connect.
1274  *
1275  * On success, *faddrp and *fportp will be set to the remote address
1276  * and port. These are not updated in the error case.
1277  */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* Connecting to port 0 is not permitted. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	/* Pre-compute the flow id used for multipath route selection. */
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	/* Pick a source address if the socket is not bound to one yet. */
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					/* Overrides any in_pcbladdr() error. */
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/* Bound port: refuse a 4-tuple that is already in use. */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM) != NULL)
			return (EADDRINUSE);
	} else {
		/* No local port yet: pick an ephemeral one. */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	/* Report the resolved 4-tuple back to the caller. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}
1396 
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	/*
	 * Clear both addresses and the foreign port; the local port
	 * (inp_lport) is not cleared here.
	 */
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1414 #endif /* INET */
1415 
1416 /*
1417  * inpcb hash lookups are protected by SMR section.
1418  *
1419  * Once desired pcb has been found, switching from SMR section to a pcb
1420  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1421  * here because SMR is a critical section.
1422  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1423  */
1424 void
1425 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1426 {
1427 
1428 	lock == INPLOOKUP_RLOCKPCB ?
1429 	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1430 }
1431 
1432 void
1433 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1434 {
1435 
1436 	lock == INPLOOKUP_RLOCKPCB ?
1437 	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1438 }
1439 
1440 int
1441 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1442 {
1443 
1444 	return (lock == INPLOOKUP_RLOCKPCB ?
1445 	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1446 }
1447 
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/*
	 * Fast path: the try-lock succeeded while still inside the SMR
	 * section.  Exit SMR and report success unless the inp carries one
	 * of the flags the caller asked to ignore (e.g. INP_FREED).
	 */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the inp with a reference, leave the SMR critical
	 * section (we may not sleep inside it) and acquire the lock the
	 * blocking way.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't
		 * go through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Refcount already dropped to zero: the inp is going away. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1486 
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.  Hence both INP_FREED and INP_DROPPED are passed as
	 * flags that make _inp_smr_lock() reject the inp.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}
1498 
1499 /*
1500  * inp_next() - inpcb hash/list traversal iterator
1501  *
1502  * Requires initialized struct inpcb_iterator for context.
1503  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1504  *
1505  * - Iterator can have either write-lock or read-lock semantics, that can not
1506  *   be changed later.
1507  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1508  *   a single hash slot.  Note: only rip_input() does the latter.
1509  * - Iterator may have optional bool matching function.  The matching function
1510  *   will be executed for each inpcb in the SMR context, so it can not acquire
1511  *   locks and can safely access only immutable fields of inpcb.
1512  *
1513  * A fresh initialized iterator has NULL inpcb in its context and that
1514  * means that inp_next() call would return the very first inpcb on the list
1515  * locked with desired semantic.  In all following calls the context pointer
1516  * shall hold the current inpcb pointer.  The KPI user is not supposed to
1517  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
1518  * and write NULL to its context.  After end of traversal an iterator can be
1519  * reused.
1520  *
1521  * List traversals have the following features/constraints:
1522  * - New entries won't be seen, as they are always added to the head of a list.
1523  * - Removed entries won't stop traversal as long as they are not added to
1524  *   a different list. This is violated by in_pcbrehash().
1525  */
1526 #define	II_LIST_FIRST(ipi, hash)					\
1527 		(((hash) == INP_ALL_LIST) ?				\
1528 		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
1529 		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
1530 #define	II_LIST_NEXT(inp, hash)						\
1531 		(((hash) == INP_ALL_LIST) ?				\
1532 		    CK_LIST_NEXT((inp), inp_list) :			\
1533 		    CK_LIST_NEXT((inp), inp_hash_exact))
1534 #define	II_LOCK_ASSERT(inp, lock)					\
1535 		rw_assert(&(inp)->inp_lock,				\
1536 		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited SMR;
				 * re-enter and restart from the list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: drop SMR; 'found' unlocks the previous inp. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous inp and hand the new (locked) one over. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1636 
1637 /*
1638  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1639  * stability of an inpcb pointer despite the inpcb lock being released or
1640  * SMR section exited.
1641  *
1642  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1643  */
1644 void
1645 in_pcbref(struct inpcb *inp)
1646 {
1647 	u_int old __diagused;
1648 
1649 	old = refcount_acquire(&inp->inp_refcount);
1650 	KASSERT(old > 0, ("%s: refcount 0", __func__));
1651 }
1652 
1653 /*
1654  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1655  * freeing the pcb, if the reference was very last.
1656  */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	/* Not the last reference: the inp stays, still read-locked. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() must have run already. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before handing the memory back to the SMR zone. */
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1676 
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Not the last reference: the inp stays, still write-locked. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() must have run already. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before handing the memory back to the SMR zone. */
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1696 
1697 bool
1698 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1699 {
1700 
1701 	return (lock == INPLOOKUP_RLOCKPCB ?
1702 	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1703 }
1704 
1705 /*
1706  * Unconditionally schedule an inpcb to be freed by decrementing its
1707  * reference count, which should occur only after the inpcb has been detached
1708  * from its socket.  If another thread holds a temporary reference (acquired
1709  * using in_pcbref()) then the free is deferred until that reference is
1710  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1711  *  Almost all work, including removal from global lists, is done in this
1712  * context, where the pcbinfo lock is held.
1713  */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Remove from the global pcb list under the pcbinfo lock. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark freed and detach from the socket. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Stash the moptions pointer; it is freed after the unlock below. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference.  If it wasn't the last one, the inp survives
	 * (another holder will free it) and must be unlocked here;
	 * otherwise in_pcbrele_wlocked() already unlocked and freed it.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1786 
1787 /*
1788  * Different protocols initialize their inpcbs differently - giving
1789  * different name to the lock.  But they all are disposed the same.
1790  */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	/* Dispose of the lock; counterpart of the per-protocol init. */
	INP_LOCK_DESTROY(inp);
}
1798 
1799 /*
1800  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1801  * port reservation, and preventing it from being returned by inpcb lookups.
1802  *
1803  * It is used by TCP to mark an inpcb as unused and avoid future packet
1804  * delivery or event notification when a socket remains open but TCP has
1805  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1806  * or a RST on the wire, and allows the port binding to be reused while still
1807  * maintaining the invariant that so_pcb always points to a valid inpcb until
1808  * in_pcbdetach().
1809  *
1810  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1811  * in_pcbpurgeif0()?
1812  */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Mark dropped first, then take the inp out of the lookup hashes. */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1823 
1824 #ifdef INET
1825 /*
1826  * Common routines to return the socket addresses associated with inpcbs.
1827  */
1828 int
1829 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1830 {
1831 	struct inpcb *inp;
1832 
1833 	inp = sotoinpcb(so);
1834 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1835 
1836 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1837 		.sin_len = sizeof(struct sockaddr_in),
1838 		.sin_family = AF_INET,
1839 		.sin_port = inp->inp_lport,
1840 		.sin_addr = inp->inp_laddr,
1841 	};
1842 
1843 	return (0);
1844 }
1845 
1846 int
1847 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1848 {
1849 	struct inpcb *inp;
1850 
1851 	inp = sotoinpcb(so);
1852 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1853 
1854 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1855 		.sin_len = sizeof(struct sockaddr_in),
1856 		.sin_family = AF_INET,
1857 		.sin_port = inp->inp_fport,
1858 		.sin_addr = inp->inp_faddr,
1859 	};
1860 
1861 	return (0);
1862 }
1863 
1864 static bool
1865 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1866 {
1867 
1868 	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1869 		return (true);
1870 	else
1871 		return (false);
1872 }
1873 
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		/* Non-NULL is guaranteed by inp_v4_multi_match(). */
		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Restart after each removal: the list was modified. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
1916 
1917 /*
1918  * Lookup a PCB based on the local address and port.  Caller must hold the
1919  * hash lock.  No inpcb locks or references are acquired.
1920  */
1921 #define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	/* Mapped-address pcbs cost more, so pure IPv4 pcbs win ties. */
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found a match; return it only if the
				 * jails are equivalent.
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.  Lower 'wildcard' score means a closer match.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				/* Each wildcarded component adds a penalty. */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					/* Exact match: stop searching. */
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
2035 #undef INP_LOOKUP_MAPPED_PCB_COST
2036 
2037 static bool
2038 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
2039 {
2040 	return (domain == M_NODOM || domain == grp->il_numa_domain);
2041 }
2042 
/*
 * Look up a load-balance group for the given local endpoint and, if one
 * matches, select a member PCB by hashing the foreign endpoint so that a
 * given remote peer consistently maps to the same member.  Returns NULL
 * if no group matches.  Called with the pcbinfo hash lock held.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		/* Skip IPv6-only groups; this is the IPv4 lookup path. */
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* A jailed group must be permitted to use laddr. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_numa_match(grp, domain))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_numa_match(grp, domain))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best candidate: jailed > exact > wildcard. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);
out:
	/* Hash the remote endpoint to pick a stable member of the group. */
	return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
	    grp->il_inpcnt]);
}
2115 
2116 static bool
2117 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2118     u_short fport, struct in_addr laddr, u_short lport)
2119 {
2120 #ifdef INET6
2121 	/* XXX inp locking */
2122 	if ((inp->inp_vflag & INP_IPV4) == 0)
2123 		return (false);
2124 #endif
2125 	if (inp->inp_faddr.s_addr == faddr.s_addr &&
2126 	    inp->inp_laddr.s_addr == laddr.s_addr &&
2127 	    inp->inp_fport == fport &&
2128 	    inp->inp_lport == lport)
2129 		return (true);
2130 	return (false);
2131 }
2132 
/*
 * Search the exact-match (connected) hash for an IPv4 PCB with the given
 * 4-tuple.  Returns the first match or NULL.  Hash lock held.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2150 
/*
 * How a PCB matched a wildcard lookup; used to rank candidates.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* not a candidate for this tuple */
	INPLOOKUP_MATCH_WILD = 1,	/* local address is INADDR_ANY */
	INPLOOKUP_MATCH_LADDR = 2,	/* bound to the exact local address */
} inp_lookup_match_t;
2156 
/*
 * Classify whether an unconnected PCB can receive a packet destined to
 * (laddr, lport), distinguishing an exact local-address match from a
 * wildcard one so the caller can rank candidates.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	/* Connected PCBs and other ports are never wildcard candidates. */
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}
2174 
2175 #define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)
2176 
/*
 * Wildcard lookup performed inside an SMR read section, without the hash
 * lock.  Only the first match on the chain is considered, since insertion
 * keeps the wild chains ordered best-candidate-first (see the comment
 * above _in_pcbinshash_wild()).  Returns the locked PCB, NULL if nothing
 * matched, or INP_LOOKUP_AGAIN if the candidate could not be locked and
 * revalidated, in which case the caller retries under the hash lock.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport,
    const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate now that the inp lock is held. */
			match = in_pcblookup_wild_match(inp, laddr, lport);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2214 
/*
 * Wildcard lookup with the hash lock held.  Unlike the SMR variant, the
 * whole chain is scanned and candidates are ranked explicitly.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* The jail must be permitted to use laddr. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* An exact non-jailed match already found is best. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			/* Jailed + exact is the top-ranked candidate. */
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	/* Fall back through the ranking; v4-mapped v6 wildcards rank last. */
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2287 
2288 /*
2289  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2290  * that the caller has either locked the hash list, which usually happens
2291  * for bind(2) operations, or is in SMR section, which happens when sorting
2292  * out incoming packets.
2293  */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* An exact (connected) 4-tuple match always wins. */
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* Load-balance groups take precedence over plain wildcards. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, faddr,
			    fport, laddr, lport);
		}
	}

	return (inp);
}
2325 
/*
 * Serialized lookup: find the PCB under the hash write lock and acquire
 * the inp lock requested by lookupflags.  If the inp lock cannot be taken
 * opportunistically, hold a reference across dropping the hash lock and
 * block on the inp lock; the PCB may be freed meanwhile, yielding NULL.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/* Keep the PCB alive while we sleep on its lock. */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2352 
/*
 * Lockless lookup through an SMR read section.  Any candidate found
 * without the hash lock is revalidated after its inp lock is taken; on
 * failure to lock or revalidate, fall back to the serialized
 * in_pcblookup_hash().  Note that smr_exit() is only called here on the
 * NULL path; the paths that return a locked PCB or fall back rely on
 * inp_smr_lock()/in_pcblookup_hash() to leave the SMR section.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us.  Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* Try LB groups first, then the ordered wild chain. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, faddr, fport,
			    laddr, lport, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain));
		}
	}

	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2415 
2416 /*
2417  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2418  * from which a pre-calculated hash value may be extracted.
2419  */
2420 struct inpcb *
2421 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2422     struct in_addr laddr, u_int lport, int lookupflags,
2423     struct ifnet *ifp __unused)
2424 {
2425 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2426 	    lookupflags, M_NODOM));
2427 }
2428 
2429 struct inpcb *
2430 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2431     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2432     struct ifnet *ifp __unused, struct mbuf *m)
2433 {
2434 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2435 	    lookupflags, m->m_pkthdr.numa_domain));
2436 }
2437 #endif /* INET */
2438 
/*
 * Is the PCB's credential subject to the given jail address-family
 * restriction flag (PR_IP4 or PR_IP6)?
 */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	return (prison_flag(inp->inp_cred, flag) != 0);
}
2444 
2445 /*
2446  * Insert the PCB into a hash chain using ordering rules which ensure that
2447  * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
2448  *
2449  * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2450  * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
2451  * always appear last no matter whether they are jailed.
2452  */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		/* Unbound v4-mapped v6 PCBs always go to the very tail. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* Empty chain. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip the leading run of jailed PCBs. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCBs go ahead of all non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard-bound PCBs follow the exactly-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* Insert before the first lower-ranked PCB found above. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2505 
2506 #ifdef INET6
2507 /*
2508  * See the comment above _in_pcbinshash_wild().
2509  */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip the leading run of jailed PCBs. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCBs go ahead of all non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard-bound PCBs follow the exactly-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* Insert before the first lower-ranked PCB found above. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2551 #endif
2552 
2553 /*
2554  * Insert PCB onto various hash lists.
2555  */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/* A specified foreign address selects the exact-match hash. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/* Roll back the lbgroup insertion done above. */
			if ((inp->inp_flags & INP_INLBGROUP) != 0)
				in_pcbremlbgrouphash(inp);
			return (ENOMEM);
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);

	/*
	 * The PCB may have been disconnected in the past.  Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wild chains are kept ordered; see _in_pcbinshash_wild(). */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2651 
/*
 * Remove the PCB from its connection hash chain, its lport chain, and any
 * load-balance group, freeing the per-port head if this was its last PCB.
 * Both the inp write lock and the pcbinfo hash write lock are held.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{
	struct inpcbport *phd = inp->inp_phd;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
	/* An unspecified faddr means the PCB sits on the wildcard chain. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	/* Free the per-port head once its PCB list drains. */
	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
		CK_LIST_REMOVE(phd, phd_hash);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
	}
	inp->inp_flags &= ~INP_INHASHLIST;
}
2684 
2685 static void
2686 in_pcbremhash(struct inpcb *inp)
2687 {
2688 	INP_HASH_WLOCK(inp->inp_pcbinfo);
2689 	in_pcbremhash_locked(inp);
2690 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2691 }
2692 
2693 /*
2694  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2695  * changed. NOTE: This does not handle the case of the lport changing (the
2696  * hashed port list would have to be updated as well), so the lport must
2697  * not change after in_pcbinshash() has been called.
2698  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* Recompute the hash and exact/wild placement from the new tuple. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2744 
2745 /*
2746  * Check for alternatives when higher level complains
2747  * about service problems.  For now, invalidate cached
2748  * routing information.  If the route was created dynamically
2749  * (by a redirect), time to try a default gateway again.
2750  */
2751 void
2752 in_losing(struct inpcb *inp)
2753 {
2754 
2755 	RO_INVALIDATE_CACHE(&inp->inp_route);
2756 	return;
2757 }
2758 
2759 /*
2760  * A set label operation has occurred at the socket layer, propagate the
2761  * label change into the in_pcb for the socket.
2762  */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Lock inp before the socket, then copy the label down. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2779 
/* Exported function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2786 
/* Exported function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2793 
/* Exported function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2800 
/* Exported function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2807 
2808 #ifdef INVARIANT_SUPPORT
/* Assert the inpcb is write-locked (INVARIANT_SUPPORT builds only). */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2815 
/* Assert the inpcb is unlocked (INVARIANT_SUPPORT builds only). */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2822 #endif
2823 
/*
 * Invoke func(inp, arg) on every PCB in pcbinfo, with each inp
 * write-locked around the callback.
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}
2835 
/* Return the socket owning the inpcb; inp must be write-locked. */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2843 
/*
 * Copy out the inpcb's 4-tuple (addresses and ports as stored in the
 * PCB).  inp must be locked.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
2855 
2856 /*
2857  * Create an external-format (``xinpcb'') structure using the information in
2858  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2859  * reduce the spew of irrelevant information over this interface, to isolate
2860  * user code from changes in the kernel structure, and potentially to provide
2861  * information-hiding if we decide that some of this information should be
2862  * hidden from users.
2863  */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	/* Zero first so unset fields and padding are not exported. */
	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
2887 
/*
 * Apply a socket option, described by a sockopt_parameters blob passed in
 * via sysctl, to the single PCB identified by sop_id (the inp gencnt).
 * SOL_SOCKET options go through sosetopt(); all others through the
 * protocol's ctloutput_set callback.  Returns ESRCH if no matching PCB
 * exists, ECONNRESET if it was already dropped.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* This handler is set-only and bounds the request size. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/* Embed the scope zone id into link-local addresses. */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/* A fully-specified tuple lets us restrict to one hash chain. */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Hold the socket across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
2967 
2968 #ifdef DDB
/*
 * Emit `indent' spaces; helper for the DDB pretty-printers below.
 */
static void
db_print_indent(int indent)
{
	while (indent-- > 0)
		db_printf(" ");
}
2977 
/*
 * Pretty-print an in_conninfo (local/foreign address and port) for DDB
 * at the given indentation level.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3007 
3008 static void
3009 db_print_inpflags(int inp_flags)
3010 {
3011 	int comma;
3012 
3013 	comma = 0;
3014 	if (inp_flags & INP_RECVOPTS) {
3015 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3016 		comma = 1;
3017 	}
3018 	if (inp_flags & INP_RECVRETOPTS) {
3019 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3020 		comma = 1;
3021 	}
3022 	if (inp_flags & INP_RECVDSTADDR) {
3023 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3024 		comma = 1;
3025 	}
3026 	if (inp_flags & INP_ORIGDSTADDR) {
3027 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3028 		comma = 1;
3029 	}
3030 	if (inp_flags & INP_HDRINCL) {
3031 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
3032 		comma = 1;
3033 	}
3034 	if (inp_flags & INP_HIGHPORT) {
3035 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3036 		comma = 1;
3037 	}
3038 	if (inp_flags & INP_LOWPORT) {
3039 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
3040 		comma = 1;
3041 	}
3042 	if (inp_flags & INP_ANONPORT) {
3043 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
3044 		comma = 1;
3045 	}
3046 	if (inp_flags & INP_RECVIF) {
3047 		db_printf("%sINP_RECVIF", comma ? ", " : "");
3048 		comma = 1;
3049 	}
3050 	if (inp_flags & INP_MTUDISC) {
3051 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
3052 		comma = 1;
3053 	}
3054 	if (inp_flags & INP_RECVTTL) {
3055 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
3056 		comma = 1;
3057 	}
3058 	if (inp_flags & INP_DONTFRAG) {
3059 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3060 		comma = 1;
3061 	}
3062 	if (inp_flags & INP_RECVTOS) {
3063 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
3064 		comma = 1;
3065 	}
3066 	if (inp_flags & IN6P_IPV6_V6ONLY) {
3067 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3068 		comma = 1;
3069 	}
3070 	if (inp_flags & IN6P_PKTINFO) {
3071 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3072 		comma = 1;
3073 	}
3074 	if (inp_flags & IN6P_HOPLIMIT) {
3075 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3076 		comma = 1;
3077 	}
3078 	if (inp_flags & IN6P_HOPOPTS) {
3079 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3080 		comma = 1;
3081 	}
3082 	if (inp_flags & IN6P_DSTOPTS) {
3083 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3084 		comma = 1;
3085 	}
3086 	if (inp_flags & IN6P_RTHDR) {
3087 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3088 		comma = 1;
3089 	}
3090 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
3091 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3092 		comma = 1;
3093 	}
3094 	if (inp_flags & IN6P_TCLASS) {
3095 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3096 		comma = 1;
3097 	}
3098 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
3099 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3100 		comma = 1;
3101 	}
3102 	if (inp_flags & INP_ONESBCAST) {
3103 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3104 		comma  = 1;
3105 	}
3106 	if (inp_flags & INP_DROPPED) {
3107 		db_printf("%sINP_DROPPED", comma ? ", " : "");
3108 		comma  = 1;
3109 	}
3110 	if (inp_flags & INP_SOCKREF) {
3111 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
3112 		comma  = 1;
3113 	}
3114 	if (inp_flags & IN6P_RFC2292) {
3115 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3116 		comma = 1;
3117 	}
3118 	if (inp_flags & IN6P_MTU) {
3119 		db_printf("IN6P_MTU%s", comma ? ", " : "");
3120 		comma = 1;
3121 	}
3122 }
3123 
3124 static void
3125 db_print_inpvflag(u_char inp_vflag)
3126 {
3127 	int comma;
3128 
3129 	comma = 0;
3130 	if (inp_vflag & INP_IPV4) {
3131 		db_printf("%sINP_IPV4", comma ? ", " : "");
3132 		comma  = 1;
3133 	}
3134 	if (inp_vflag & INP_IPV6) {
3135 		db_printf("%sINP_IPV6", comma ? ", " : "");
3136 		comma  = 1;
3137 	}
3138 	if (inp_vflag & INP_IPV6PROTO) {
3139 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3140 		comma  = 1;
3141 	}
3142 }
3143 
/*
 * Dump the interesting fields of an inpcb to the debugger console,
 * indenting nested structures by two additional spaces per level.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x\n", inp->inp_flow);

	/* Address/port 4-tuple. */
	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_label: %p   inp_flags: 0x%x (",
	   inp->inp_label, inp->inp_flags);
	db_print_inpflags(inp->inp_flags);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
	    inp->inp_vflag);
	db_print_inpvflag(inp->inp_vflag);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	db_print_indent(indent);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		/*
		 * NOTE(review): the second db_printf in this branch is not
		 * preceded by db_print_indent(), so the in6p_icmp6filt line
		 * prints at column 0 — presumably cosmetic only; confirm.
		 */
		db_printf("in6p_options: %p   in6p_outputopts: %p   "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
	    (uintmax_t)inp->inp_gencnt);
}
3195 
3196 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3197 {
3198 	struct inpcb *inp;
3199 
3200 	if (!have_addr) {
3201 		db_printf("usage: show inpcb <addr>\n");
3202 		return;
3203 	}
3204 	inp = (struct inpcb *)addr;
3205 
3206 	db_print_inpcb(inp, "inpcb", 0);
3207 }
3208 #endif /* DDB */
3209 
3210 #ifdef RATELIMIT
3211 /*
3212  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3213  * if any.
3214  */
3215 int
3216 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3217 {
3218 	union if_snd_tag_modify_params params = {
3219 		.rate_limit.max_rate = max_pacing_rate,
3220 		.rate_limit.flags = M_NOWAIT,
3221 	};
3222 	struct m_snd_tag *mst;
3223 	int error;
3224 
3225 	mst = inp->inp_snd_tag;
3226 	if (mst == NULL)
3227 		return (EINVAL);
3228 
3229 	if (mst->sw->snd_tag_modify == NULL) {
3230 		error = EOPNOTSUPP;
3231 	} else {
3232 		error = mst->sw->snd_tag_modify(mst, &params);
3233 	}
3234 	return (error);
3235 }
3236 
3237 /*
3238  * Query existing TX rate limit based on the existing
3239  * "inp->inp_snd_tag", if any.
3240  */
3241 int
3242 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3243 {
3244 	union if_snd_tag_query_params params = { };
3245 	struct m_snd_tag *mst;
3246 	int error;
3247 
3248 	mst = inp->inp_snd_tag;
3249 	if (mst == NULL)
3250 		return (EINVAL);
3251 
3252 	if (mst->sw->snd_tag_query == NULL) {
3253 		error = EOPNOTSUPP;
3254 	} else {
3255 		error = mst->sw->snd_tag_query(mst, &params);
3256 		if (error == 0 && p_max_pacing_rate != NULL)
3257 			*p_max_pacing_rate = params.rate_limit.max_rate;
3258 	}
3259 	return (error);
3260 }
3261 
3262 /*
3263  * Query existing TX queue level based on the existing
3264  * "inp->inp_snd_tag", if any.
3265  */
3266 int
3267 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3268 {
3269 	union if_snd_tag_query_params params = { };
3270 	struct m_snd_tag *mst;
3271 	int error;
3272 
3273 	mst = inp->inp_snd_tag;
3274 	if (mst == NULL)
3275 		return (EINVAL);
3276 
3277 	if (mst->sw->snd_tag_query == NULL)
3278 		return (EOPNOTSUPP);
3279 
3280 	error = mst->sw->snd_tag_query(mst, &params);
3281 	if (error == 0 && p_txqueue_level != NULL)
3282 		*p_txqueue_level = params.rate_limit.queue_level;
3283 	return (error);
3284 }
3285 
3286 /*
3287  * Allocate a new TX rate limit send tag from the network interface
3288  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3289  */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)

{
	/*
	 * A max_pacing_rate of -1U requests an "unlimited" tag (no rate
	 * cap); anything else requests a rate-limit tag capped at that
	 * rate.  Allocation must not sleep (M_NOWAIT).
	 */
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	/*
	 * Account the outcome: EOPNOTSUPP (driver lacks rate limiting)
	 * is deliberately not counted as an allocation failure.
	 */
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		  counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}
3326 
/*
 * Release a reference on a send tag and decrement the active rate-limit
 * accounting counter (the counterpart of the increment done in
 * in_pcbattach_txrtlmt()).
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3336 
3337 /*
3338  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3339  * if any:
3340  */
3341 void
3342 in_pcbdetach_txrtlmt(struct inpcb *inp)
3343 {
3344 	struct m_snd_tag *mst;
3345 
3346 	INP_WLOCK_ASSERT(inp);
3347 
3348 	mst = inp->inp_snd_tag;
3349 	inp->inp_snd_tag = NULL;
3350 
3351 	if (mst == NULL)
3352 		return;
3353 
3354 	m_snd_tag_rele(mst);
3355 #ifdef INET
3356 	counter_u64_add(rate_limit_active, -1);
3357 #endif
3358 }
3359 
/*
 * Attach, detach or modify the TX rate-limit send tag on "inp" so that
 * it matches the requested "max_pacing_rate" and the egress interface
 * "ifp" that mbuf "mb" will be transmitted on.  Returns zero on success
 * or an errno; EAGAIN indicates the mbuf lacks an RSS hash and the
 * caller should retry later.  The caller must hold the inpcb write lock
 * (in_pcbdetach_txrtlmt(), called below, asserts it).
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No rate requested and no tag held: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate limit; drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* Tag exists and matches ifp: just adjust its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	/*
	 * Success or "not supported" both settle the pending change;
	 * other errors leave CHANGED set so the fast path retries.
	 */
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3409 
3410 /*
3411  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3412  * is set in the fast path and will attach/detach/modify the TX rate
3413  * limit send tag based on the socket's so_max_pacing_rate value.
3414  */
3415 void
3416 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3417 {
3418 	struct socket *socket;
3419 	uint32_t max_pacing_rate;
3420 	bool did_upgrade;
3421 
3422 	if (inp == NULL)
3423 		return;
3424 
3425 	socket = inp->inp_socket;
3426 	if (socket == NULL)
3427 		return;
3428 
3429 	if (!INP_WLOCKED(inp)) {
3430 		/*
3431 		 * NOTE: If the write locking fails, we need to bail
3432 		 * out and use the non-ratelimited ring for the
3433 		 * transmit until there is a new chance to get the
3434 		 * write lock.
3435 		 */
3436 		if (!INP_TRY_UPGRADE(inp))
3437 			return;
3438 		did_upgrade = 1;
3439 	} else {
3440 		did_upgrade = 0;
3441 	}
3442 
3443 	/*
3444 	 * NOTE: The so_max_pacing_rate value is read unlocked,
3445 	 * because atomic updates are not required since the variable
3446 	 * is checked at every mbuf we send. It is assumed that the
3447 	 * variable read itself will be atomic.
3448 	 */
3449 	max_pacing_rate = socket->so_max_pacing_rate;
3450 
3451 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3452 
3453 	if (did_upgrade)
3454 		INP_DOWNGRADE(inp);
3455 }
3456 
3457 /*
3458  * Track route changes for TX rate limiting.
3459  */
3460 void
3461 in_pcboutput_eagain(struct inpcb *inp)
3462 {
3463 	bool did_upgrade;
3464 
3465 	if (inp == NULL)
3466 		return;
3467 
3468 	if (inp->inp_snd_tag == NULL)
3469 		return;
3470 
3471 	if (!INP_WLOCKED(inp)) {
3472 		/*
3473 		 * NOTE: If the write locking fails, we need to bail
3474 		 * out and use the non-ratelimited ring for the
3475 		 * transmit until there is a new chance to get the
3476 		 * write lock.
3477 		 */
3478 		if (!INP_TRY_UPGRADE(inp))
3479 			return;
3480 		did_upgrade = 1;
3481 	} else {
3482 		did_upgrade = 0;
3483 	}
3484 
3485 	/* detach rate limiting */
3486 	in_pcbdetach_txrtlmt(inp);
3487 
3488 	/* make sure new mbuf send tag allocation is made */
3489 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3490 
3491 	if (did_upgrade)
3492 		INP_DOWNGRADE(inp);
3493 }
3494 
3495 #ifdef INET
3496 static void
3497 rl_init(void *st)
3498 {
3499 	rate_limit_new = counter_u64_alloc(M_WAITOK);
3500 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
3501 	rate_limit_active = counter_u64_alloc(M_WAITOK);
3502 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3503 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3504 }
3505 
3506 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3507 #endif
3508 #endif /* RATELIMIT */
3509