xref: /freebsd/sys/netinet/in_pcb.c (revision cfec995c87f39e59c80554b85625b4aaa8ddf8db)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include "opt_ddb.h"
40 #include "opt_ipsec.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #include "opt_ratelimit.h"
44 #include "opt_rss.h"
45 
46 #include <sys/param.h>
47 #include <sys/hash.h>
48 #include <sys/systm.h>
49 #include <sys/libkern.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/eventhandler.h>
54 #include <sys/domain.h>
55 #include <sys/proc.h>
56 #include <sys/protosw.h>
57 #include <sys/smp.h>
58 #include <sys/smr.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <sys/priv.h>
63 #include <sys/proc.h>
64 #include <sys/refcount.h>
65 #include <sys/jail.h>
66 #include <sys/kernel.h>
67 #include <sys/sysctl.h>
68 
69 #ifdef DDB
70 #include <ddb/ddb.h>
71 #endif
72 
73 #include <vm/uma.h>
74 #include <vm/vm.h>
75 
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/if_private.h>
79 #include <net/if_types.h>
80 #include <net/if_llatbl.h>
81 #include <net/route.h>
82 #include <net/rss_config.h>
83 #include <net/vnet.h>
84 
85 #if defined(INET) || defined(INET6)
86 #include <netinet/in.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/in_pcb_var.h>
89 #include <netinet/tcp.h>
90 #ifdef INET
91 #include <netinet/in_var.h>
92 #include <netinet/in_fib.h>
93 #endif
94 #include <netinet/ip_var.h>
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/in6_pcb.h>
98 #include <netinet6/in6_var.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101 #include <net/route/nhop.h>
102 #endif
103 
104 #include <netipsec/ipsec_support.h>
105 
106 #include <security/mac/mac_framework.h>
107 
108 #define	INPCBLBGROUP_SIZMIN	8
109 #define	INPCBLBGROUP_SIZMAX	256
110 
111 #define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
112 #define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */
113 
/*
 * These configure the ranges of local port numbers assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * Three ranges exist: a low one (lowfirst..lowlast), the default one
 * (first..last) and a high one (hifirst..hilast).  in_pcb_lport_dest()
 * picks the range from the INP_LOWPORT/INP_HIGHPORT flags of the PCB and
 * swaps the bounds itself if "first" is greater than "last", so either
 * ordering is acceptable here (the low range is deliberately stored
 * high-to-low).
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_avail() requires PRIV_NETINET_RESERVEDPORT for any explicit
 * bind to a port within [ipport_reservedlow, ipport_reservedhigh].
 * ipport_reservedlow has no initializer here; presumably it starts out as
 * 0 -- confirm against the VNET_DEFINE() implementation.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Enable random ephemeral port allocation by default. */
VNET_DEFINE(int, ipport_randomized) = 1;
135 
136 #ifdef INET
137 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
138 			    struct in_addr faddr, u_int fport_arg,
139 			    struct in_addr laddr, u_int lport_arg,
140 			    int lookupflags, uint8_t numa_domain, int fib);
141 
142 #define RANGECHK(var, min, max) \
143 	if ((var) < (min)) { (var) = (min); } \
144 	else if ((var) > (max)) { (var) = (max); }
145 
146 static int
147 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
148 {
149 	int error;
150 
151 	error = sysctl_handle_int(oidp, arg1, arg2, req);
152 	if (error == 0) {
153 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
154 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
155 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
156 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
157 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
158 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
159 	}
160 	return (error);
161 }
162 
163 #undef RANGECHK
164 
/*
 * net.inet.ip.portrange: tunables for automatic local port selection.
 */
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

/*
 * Bounds of the three automatic port ranges.  Each knob is a per-VNET
 * integer routed through sysctl_net_ipport_check(), which clamps all six
 * values into their legal intervals after a successful access.
 */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");
193 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
194 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
195 	&VNET_NAME(ipport_reservedhigh), 0, "");
196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
197 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
199 	CTLFLAG_VNET | CTLFLAG_RW,
200 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
201 
#ifdef RATELIMIT
/*
 * Rate limiting statistics, exported read-only under net.inet.ip.rl.
 * Only the counter handles and their sysctl bindings are defined here;
 * the code that allocates and bumps them is not in this part of the file.
 */
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
   &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
   &rate_limit_set_ok, "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
   &rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
   &rate_limit_chg, "Total Rate limited change attempts");
#endif /* RATELIMIT */
222 
223 #endif /* INET */
224 
/*
 * Per-VNET random value, presumably mixed into the PCB hash functions
 * (its consumers are not in this part of the file -- confirm).
 */
VNET_DEFINE(uint32_t, in_pcbhashseed);

/*
 * Draw a fresh random seed for the current vnet.  Registered below to run
 * at SI_SUB_PROTO_DOMAIN / SI_ORDER_FIRST for every vnet.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
234 
#ifdef INET
/*
 * net.inet.ip.connect_inaddr_wild: per-VNET switch, off by default, that
 * controls whether connect(2) to INADDR_ANY or INADDR_BROADCAST is
 * allowed.  The code acting on it is not in this part of the file.
 */
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
#define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
    "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif
242 
243 /*
244  * in_pcb.c: manage the Protocol Control Blocks.
245  *
246  * NOTE: It is assumed that most of these functions will be called with
247  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
248  * functions often modify hash chains or addresses in pcbs.
249  */
250 
251 static struct inpcblbgroup *
252 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
253     const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
254 {
255 	struct inpcblbgroup *grp;
256 	size_t bytes;
257 
258 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
259 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
260 	if (grp == NULL)
261 		return (NULL);
262 	LIST_INIT(&grp->il_pending);
263 	grp->il_cred = crhold(cred);
264 	grp->il_vflag = vflag;
265 	grp->il_lport = port;
266 	grp->il_numa_domain = numa_domain;
267 	grp->il_fibnum = fib;
268 	grp->il_dependladdr = *addr;
269 	grp->il_inpsiz = size;
270 	return (grp);
271 }
272 
273 static void
274 in_pcblbgroup_free_deferred(epoch_context_t ctx)
275 {
276 	struct inpcblbgroup *grp;
277 
278 	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
279 	crfree(grp->il_cred);
280 	free(grp, M_PCB);
281 }
282 
/*
 * Unlink a load-balance group from its hash chain and schedule it for
 * destruction.  The group must have no pending (not yet listening)
 * members left.  The memory is not released here: the actual free is
 * handed to NET_EPOCH_CALL() and performed by
 * in_pcblbgroup_free_deferred().
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
292 
293 static struct inpcblbgroup *
294 in_pcblbgroup_find(struct inpcb *inp)
295 {
296 	struct inpcbinfo *pcbinfo;
297 	struct inpcblbgroup *grp;
298 	struct inpcblbgrouphead *hdr;
299 
300 	INP_LOCK_ASSERT(inp);
301 
302 	pcbinfo = inp->inp_pcbinfo;
303 	INP_HASH_LOCK_ASSERT(pcbinfo);
304 
305 	hdr = &pcbinfo->ipi_lbgrouphashbase[
306 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
307 	CK_LIST_FOREACH(grp, hdr, il_list) {
308 		struct inpcb *inp1;
309 
310 		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
311 			if (inp == grp->il_inp[i])
312 				goto found;
313 		}
314 		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
315 			if (inp == inp1)
316 				goto found;
317 		}
318 	}
319 found:
320 	return (grp);
321 }
322 
323 static void
324 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
325 {
326 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
327 	    ("invalid local group size %d and count %d", grp->il_inpsiz,
328 	    grp->il_inpcnt));
329 	INP_WLOCK_ASSERT(inp);
330 
331 	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
332 	    !SOLISTENING(inp->inp_socket)) {
333 		/*
334 		 * If this is a TCP socket, it should not be visible to lbgroup
335 		 * lookups until listen() has been called.
336 		 */
337 		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
338 		grp->il_pendcnt++;
339 	} else {
340 		grp->il_inp[grp->il_inpcnt] = inp;
341 
342 		/*
343 		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
344 		 * don't expose a null slot to the lookup path.
345 		 */
346 		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
347 	}
348 
349 	inp->inp_flags |= INP_INLBGROUP;
350 }
351 
352 static struct inpcblbgroup *
353 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
354     struct inpcblbgroup *old_grp, int size)
355 {
356 	struct inpcblbgroup *grp;
357 	int i;
358 
359 	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
360 	    old_grp->il_lport, &old_grp->il_dependladdr, size,
361 	    old_grp->il_numa_domain, old_grp->il_fibnum);
362 	if (grp == NULL)
363 		return (NULL);
364 
365 	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
366 	    ("invalid new local group size %d and old local group count %d",
367 	     grp->il_inpsiz, old_grp->il_inpcnt));
368 
369 	for (i = 0; i < old_grp->il_inpcnt; ++i)
370 		grp->il_inp[i] = old_grp->il_inp[i];
371 	grp->il_inpcnt = old_grp->il_inpcnt;
372 	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
373 	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
374 	    inp_lbgroup_list);
375 	grp->il_pendcnt = old_grp->il_pendcnt;
376 	old_grp->il_pendcnt = 0;
377 	in_pcblbgroup_free(old_grp);
378 	return (grp);
379 }
380 
381 /*
382  * Add PCB to load balance group for SO_REUSEPORT_LB option.
383  */
384 static int
385 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
386 {
387 	const static struct timeval interval = { 60, 0 };
388 	static struct timeval lastprint;
389 	struct inpcbinfo *pcbinfo;
390 	struct inpcblbgrouphead *hdr;
391 	struct inpcblbgroup *grp;
392 	uint32_t idx;
393 	int fib;
394 
395 	pcbinfo = inp->inp_pcbinfo;
396 
397 	INP_WLOCK_ASSERT(inp);
398 	INP_HASH_WLOCK_ASSERT(pcbinfo);
399 
400 	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
401 	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;
402 
403 #ifdef INET6
404 	/*
405 	 * Don't allow IPv4 mapped INET6 wild socket.
406 	 */
407 	if ((inp->inp_vflag & INP_IPV4) &&
408 	    inp->inp_laddr.s_addr == INADDR_ANY &&
409 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
410 		return (0);
411 	}
412 #endif
413 
414 	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask);
415 	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
416 	CK_LIST_FOREACH(grp, hdr, il_list) {
417 		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
418 		    grp->il_vflag == inp->inp_vflag &&
419 		    grp->il_lport == inp->inp_lport &&
420 		    grp->il_numa_domain == numa_domain &&
421 		    grp->il_fibnum == fib &&
422 		    memcmp(&grp->il_dependladdr,
423 		    &inp->inp_inc.inc_ie.ie_dependladdr,
424 		    sizeof(grp->il_dependladdr)) == 0) {
425 			break;
426 		}
427 	}
428 	if (grp == NULL) {
429 		/* Create new load balance group. */
430 		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
431 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
432 		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
433 		if (grp == NULL)
434 			return (ENOMEM);
435 		in_pcblbgroup_insert(grp, inp);
436 		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
437 	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
438 		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
439 			if (ratecheck(&lastprint, &interval))
440 				printf("lb group port %d, limit reached\n",
441 				    ntohs(grp->il_lport));
442 			return (0);
443 		}
444 
445 		/* Expand this local group. */
446 		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
447 		if (grp == NULL)
448 			return (ENOMEM);
449 		in_pcblbgroup_insert(grp, inp);
450 	} else {
451 		in_pcblbgroup_insert(grp, inp);
452 	}
453 	return (0);
454 }
455 
456 /*
457  * Remove PCB from load balance group.
458  */
459 static void
460 in_pcbremlbgrouphash(struct inpcb *inp)
461 {
462 	struct inpcbinfo *pcbinfo;
463 	struct inpcblbgrouphead *hdr;
464 	struct inpcblbgroup *grp;
465 	struct inpcb *inp1;
466 	int i;
467 
468 	pcbinfo = inp->inp_pcbinfo;
469 
470 	INP_WLOCK_ASSERT(inp);
471 	MPASS(inp->inp_flags & INP_INLBGROUP);
472 	INP_HASH_WLOCK_ASSERT(pcbinfo);
473 
474 	hdr = &pcbinfo->ipi_lbgrouphashbase[
475 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
476 	CK_LIST_FOREACH(grp, hdr, il_list) {
477 		for (i = 0; i < grp->il_inpcnt; ++i) {
478 			if (grp->il_inp[i] != inp)
479 				continue;
480 
481 			if (grp->il_inpcnt == 1 &&
482 			    LIST_EMPTY(&grp->il_pending)) {
483 				/* We are the last, free this local group. */
484 				in_pcblbgroup_free(grp);
485 			} else {
486 				grp->il_inp[i] =
487 				    grp->il_inp[grp->il_inpcnt - 1];
488 
489 				/*
490 				 * Synchronize with in_pcblookup_lbgroup().
491 				 */
492 				atomic_store_rel_int(&grp->il_inpcnt,
493 				    grp->il_inpcnt - 1);
494 			}
495 			inp->inp_flags &= ~INP_INLBGROUP;
496 			return;
497 		}
498 		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
499 			if (inp == inp1) {
500 				LIST_REMOVE(inp, inp_lbgroup_list);
501 				grp->il_pendcnt--;
502 				inp->inp_flags &= ~INP_INLBGROUP;
503 				return;
504 			}
505 		}
506 	}
507 	__assert_unreachable();
508 }
509 
510 int
511 in_pcblbgroup_numa(struct inpcb *inp, int arg)
512 {
513 	struct inpcbinfo *pcbinfo;
514 	int error;
515 	uint8_t numa_domain;
516 
517 	switch (arg) {
518 	case TCP_REUSPORT_LB_NUMA_NODOM:
519 		numa_domain = M_NODOM;
520 		break;
521 	case TCP_REUSPORT_LB_NUMA_CURDOM:
522 		numa_domain = PCPU_GET(domain);
523 		break;
524 	default:
525 		if (arg < 0 || arg >= vm_ndomains)
526 			return (EINVAL);
527 		numa_domain = arg;
528 	}
529 
530 	pcbinfo = inp->inp_pcbinfo;
531 	INP_WLOCK_ASSERT(inp);
532 	INP_HASH_WLOCK(pcbinfo);
533 	if (in_pcblbgroup_find(inp) != NULL) {
534 		/* Remove it from the old group. */
535 		in_pcbremlbgrouphash(inp);
536 		/* Add it to the new group based on numa domain. */
537 		in_pcbinslbgrouphash(inp, numa_domain);
538 		error = 0;
539 	} else {
540 		error = ENOENT;
541 	}
542 	INP_HASH_WUNLOCK(pcbinfo);
543 	return (error);
544 }
545 
/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 *
 * Sets up the hash lock (named after pcbstor->ips_hashlock_name), the
 * list of unconnected PCBs, the exact-match and wildcard connection hash
 * tables, the port and load-balance-group hash tables, and records the
 * UMA zone and its SMR state taken from "pcbstor".
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{
	/* Common parameters for all four tables; only the size varies. */
	struct hashalloc_args ha = {
		.mtype = M_PCB,
		.mflags = M_WAITOK,
		.head = HASH_HEAD_CK_LIST,
	};

	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
	CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
	pcbinfo->ipi_count = 0;

	/*
	 * The mask is derived from ha.size as it is after the calls,
	 * presumably because hashalloc() may adjust the requested size
	 * (e.g. to a power of two) -- confirm against hashalloc().
	 */
	ha.size = hash_nelements;
	pcbinfo->ipi_hash_exact = hashalloc(&ha);
	pcbinfo->ipi_hash_wild = hashalloc(&ha);
	pcbinfo->ipi_hashmask = ha.size - 1;

	/*
	 * The port hash and the lbgroup hash share one size and one mask;
	 * the request is capped at IPPORT_MAX + 1 buckets.
	 */
	ha.size = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashalloc(&ha);
	pcbinfo->ipi_lbgrouphashbase = hashalloc(&ha);
	pcbinfo->ipi_porthashmask = ha.size - 1;

	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
577 
/*
 * Destroy an inpcbinfo.
 *
 * Releases the four hash tables and the hash lock set up by
 * in_pcbinfo_init().  No PCBs may remain (ipi_count must be 0).
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{
	struct hashalloc_args ha = {
		.mtype = M_PCB,
		.head = HASH_HEAD_CK_LIST,
	};

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Table sizes are recovered from the stored masks (mask + 1). */
	ha.size = pcbinfo->ipi_hashmask + 1;
	hashfree(pcbinfo->ipi_hash_exact, &ha);
	hashfree(pcbinfo->ipi_hash_wild, &ha);
	ha.size = pcbinfo->ipi_porthashmask + 1;
	hashfree(pcbinfo->ipi_porthashbase, &ha);
	hashfree(pcbinfo->ipi_lbgrouphashbase, &ha);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
}
600 
601 /*
602  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
603  */
604 static void inpcb_fini(void *, int);
605 void
606 in_pcbstorage_init(void *arg)
607 {
608 	struct inpcbstorage *pcbstor = arg;
609 
610 	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
611 	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
612 	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
613 }
614 
615 /*
616  * Destroy a pcbstorage - used by unloadable protocols.
617  */
618 void
619 in_pcbstorage_destroy(void *arg)
620 {
621 	struct inpcbstorage *pcbstor = arg;
622 
623 	uma_zdestroy(pcbstor->ips_zone);
624 }
625 
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * The new PCB takes a reference on the socket's credential, inherits the
 * socket's FIB number, is marked INP_UNCONNECTED and is linked onto the
 * pcbinfo's list of unconnected PCBs.  so->so_pcb is set to it.
 *
 * Returns 0 on success, ENOBUFS if no PCB could be allocated, or the
 * error from mac_inpcb_init() / ipsec_init_pcbpolicy() when those are
 * compiled in; on error nothing is left attached to the socket.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	/* Non-sleeping allocation from the pcbinfo's zone. */
	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Clear inp_zero_size bytes starting at inp_start_zero. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
		/* Undo the MAC setup done above before bailing out. */
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
	/*
	 * Address family flags.  An AF_INET6 socket is always INP_IPV6 and
	 * additionally INP_IPV4 unless V_ip6_v6only is set (in which case
	 * it is marked IN6P_IPV6_V6ONLY instead); any other socket is
	 * INP_IPV4.  Note that the "else" below pairs with the AF_INET6
	 * test across the preprocessor blocks.
	 */
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	inp->inp_flags |= INP_UNCONNECTED;
	/*
	 * The inpcb write lock taken here is not dropped on the success
	 * path: the caller gets the PCB locked.  The hash lock only covers
	 * the accounting and the list insertion.
	 */
	INP_WLOCK(inp);
	INP_HASH_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
	INP_HASH_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Failure: drop the credential reference and return the PCB. */
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
713 
714 #if defined(INET) || defined(INET6)
715 /*
716  * Assign a local port like in_pcb_lport(), but also used with connect()
717  * and a foreign address and port.  If fsa is non-NULL, choose a local port
718  * that is unused with those, otherwise one that is completely unused.
719  * lsa can be NULL for IPv6.
720  */
721 int
722 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
723     u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
724     int lookupflags)
725 {
726 	struct inpcbinfo *pcbinfo;
727 	struct inpcb *tmpinp;
728 	unsigned short *lastport;
729 	int count, error;
730 	u_short aux, first, last, lport;
731 #ifdef INET
732 	struct in_addr laddr, faddr;
733 #endif
734 #ifdef INET6
735 	struct in6_addr *laddr6, *faddr6;
736 #endif
737 
738 	pcbinfo = inp->inp_pcbinfo;
739 
740 	/*
741 	 * Because no actual state changes occur here, a global write lock on
742 	 * the pcbinfo isn't required.
743 	 */
744 	INP_LOCK_ASSERT(inp);
745 	INP_HASH_LOCK_ASSERT(pcbinfo);
746 
747 	if (inp->inp_flags & INP_HIGHPORT) {
748 		first = V_ipport_hifirstauto;	/* sysctl */
749 		last  = V_ipport_hilastauto;
750 		lastport = &pcbinfo->ipi_lasthi;
751 	} else if (inp->inp_flags & INP_LOWPORT) {
752 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
753 		if (error)
754 			return (error);
755 		first = V_ipport_lowfirstauto;	/* 1023 */
756 		last  = V_ipport_lowlastauto;	/* 600 */
757 		lastport = &pcbinfo->ipi_lastlow;
758 	} else {
759 		first = V_ipport_firstauto;	/* sysctl */
760 		last  = V_ipport_lastauto;
761 		lastport = &pcbinfo->ipi_lastport;
762 	}
763 
764 	/*
765 	 * Instead of having two loops further down counting up or down
766 	 * make sure that first is always <= last and go with only one
767 	 * code path implementing all logic.
768 	 */
769 	if (first > last) {
770 		aux = first;
771 		first = last;
772 		last = aux;
773 	}
774 
775 #ifdef INET
776 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
777 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
778 		if (lsa != NULL)
779 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
780 		if (fsa != NULL)
781 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
782 	}
783 #endif
784 #ifdef INET6
785 	laddr6 = NULL;
786 	if ((inp->inp_vflag & INP_IPV6) != 0) {
787 		if (lsa != NULL)
788 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
789 		if (fsa != NULL)
790 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
791 	}
792 #endif
793 
794 	tmpinp = NULL;
795 
796 	if (V_ipport_randomized)
797 		*lastport = first + (arc4random() % (last - first));
798 
799 	count = last - first;
800 
801 	do {
802 		if (count-- < 0)	/* completely used? */
803 			return (EADDRNOTAVAIL);
804 		++*lastport;
805 		if (*lastport < first || *lastport > last)
806 			*lastport = first;
807 		lport = htons(*lastport);
808 
809 		if (fsa != NULL) {
810 #ifdef INET
811 			if (lsa->sa_family == AF_INET) {
812 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
813 				    faddr, fport, laddr, lport, lookupflags,
814 				    M_NODOM, RT_ALL_FIBS);
815 			}
816 #endif
817 #ifdef INET6
818 			if (lsa->sa_family == AF_INET6) {
819 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
820 				    faddr6, fport, laddr6, lport, lookupflags,
821 				    M_NODOM, RT_ALL_FIBS);
822 			}
823 #endif
824 		} else {
825 #ifdef INET6
826 			if ((inp->inp_vflag & INP_IPV6) != 0) {
827 				tmpinp = in6_pcblookup_local(pcbinfo,
828 				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
829 				    lookupflags, cred);
830 #ifdef INET
831 				if (tmpinp == NULL &&
832 				    (inp->inp_vflag & INP_IPV4))
833 					tmpinp = in_pcblookup_local(pcbinfo,
834 					    laddr, lport, RT_ALL_FIBS,
835 					    lookupflags, cred);
836 #endif
837 			}
838 #endif
839 #if defined(INET) && defined(INET6)
840 			else
841 #endif
842 #ifdef INET
843 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
844 				    lport, RT_ALL_FIBS, lookupflags, cred);
845 #endif
846 		}
847 	} while (tmpinp != NULL);
848 
849 	*lportp = lport;
850 
851 	return (0);
852 }
853 
854 /*
855  * Select a local port (number) to use.
856  */
857 int
858 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
859     struct ucred *cred, int lookupflags)
860 {
861 	struct sockaddr_in laddr;
862 
863 	if (laddrp) {
864 		bzero(&laddr, sizeof(laddr));
865 		laddr.sin_family = AF_INET;
866 		laddr.sin_addr = *laddrp;
867 	}
868 	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
869 	    NULL, lportp, NULL, 0, cred, lookupflags));
870 }
871 #endif /* INET || INET6 */
872 
873 #ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * "laddr" and "lport" are in network byte order; lport 0 means that no
 * port was requested, in which case only the address is validated.
 * "sooptions" is a snapshot of the socket's options and "lookupflags" is
 * passed to the conflict lookup.  Both the inpcb lock and the pcbinfo
 * hash lock must be held.
 *
 * Returns 0 if the bind may proceed, EADDRNOTAVAIL if the address is not
 * a local one (and INP_BINDANY is not set), EACCES for a reserved port
 * without privilege, or EADDRINUSE on a conflict with an existing PCB.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/*
		 * Ports within [reservedlow, reservedhigh] require
		 * PRIV_NETINET_RESERVEDPORT.
		 */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			     in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		/*
		 * General conflict check, honouring the caller's FIB and
		 * lookup flags.  An existing PCB is a conflict only if it
		 * does not share one of our effective SO_REUSEPORT /
		 * SO_REUSEPORT_LB (for multicast also SO_REUSEADDR) options.
		 */
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
			/*
			 * With INET6 compiled in, the conflict is ignored
			 * when both addresses are unspecified and both PCBs
			 * have INP_IPV6PROTO set; without INET6 the return
			 * below is unconditional.
			 */
#ifdef INET6
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
966 
967 /*
968  * Set up a bind operation on a PCB, performing port allocation
969  * as required, but do not actually modify the PCB. Callers can
970  * either complete the bind by setting inp_laddr/inp_lport and
971  * calling in_pcbinshash(), or they can just use the resulting
972  * port and address to authorise the sending of a once-off packet.
973  *
974  * On error, the values of *laddrp and *lportp are not changed.
975  */
976 static int
977 in_pcbbind_setup_locked(struct inpcb *inp, struct sockaddr_in *sin,
978     in_addr_t *laddrp, u_short *lportp, int flags, struct ucred *cred)
979 {
980 	struct socket *so = inp->inp_socket;
981 	struct in_addr laddr;
982 	u_short lport = 0;
983 	int error, fib, lookupflags, sooptions;
984 
985 	/*
986 	 * No state changes, so read locks are sufficient here.
987 	 */
988 	INP_LOCK_ASSERT(inp);
989 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
990 
991 	laddr.s_addr = *laddrp;
992 	if (sin != NULL && laddr.s_addr != INADDR_ANY)
993 		return (EINVAL);
994 
995 	lookupflags = 0;
996 	sooptions = atomic_load_int(&so->so_options);
997 	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
998 		lookupflags = INPLOOKUP_WILDCARD;
999 	if (sin == NULL) {
1000 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
1001 			return (error);
1002 	} else {
1003 		KASSERT(sin->sin_family == AF_INET,
1004 		    ("%s: invalid family for address %p", __func__, sin));
1005 		KASSERT(sin->sin_len == sizeof(*sin),
1006 		    ("%s: invalid length for address %p", __func__, sin));
1007 
1008 		error = prison_local_ip4(cred, &sin->sin_addr);
1009 		if (error)
1010 			return (error);
1011 		if (sin->sin_port != *lportp) {
1012 			/* Don't allow the port to change. */
1013 			if (*lportp != 0)
1014 				return (EINVAL);
1015 			lport = sin->sin_port;
1016 		}
1017 		laddr = sin->sin_addr;
1018 
1019 		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
1020 		    RT_ALL_FIBS;
1021 
1022 		/* See if this address/port combo is available. */
1023 		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
1024 		    lookupflags, cred);
1025 		if (error != 0)
1026 			return (error);
1027 	}
1028 	if (*lportp != 0)
1029 		lport = *lportp;
1030 	if (lport == 0) {
1031 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1032 		if (error != 0)
1033 			return (error);
1034 	}
1035 	*laddrp = laddr.s_addr;
1036 	*lportp = lport;
1037 	if ((flags & INPBIND_FIB) != 0)
1038 		inp->inp_flags |= INP_BOUNDFIB;
1039 	return (0);
1040 }
1041 
1042 int
1043 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
1044     u_short *lportp, int flags, struct ucred *cred)
1045 {
1046 	int error;
1047 
1048 	INP_HASH_WLOCK(inp->inp_pcbinfo);
1049 	error = in_pcbbind_setup_locked(inp, sin, laddrp, lportp, flags, cred);
1050 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1051 
1052 	return (error);
1053 }
1054 
1055 #ifdef INET
1056 int
1057 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
1058     struct ucred *cred)
1059 {
1060 	int error;
1061 	bool anonport;
1062 
1063 	KASSERT(sin == NULL || sin->sin_family == AF_INET,
1064 	    ("%s: invalid address family for %p", __func__, sin));
1065 	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
1066 	    ("%s: invalid address length for %p", __func__, sin));
1067 	INP_WLOCK_ASSERT(inp);
1068 
1069 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
1070 		return (EINVAL);
1071 	anonport = sin == NULL || sin->sin_port == 0;
1072 
1073 	INP_HASH_WLOCK(inp->inp_pcbinfo);
1074 	error = in_pcbbind_setup_locked(inp, sin, &inp->inp_laddr.s_addr,
1075 	    &inp->inp_lport, flags, cred);
1076 	if (error) {
1077 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1078 		return (error);
1079 	}
1080 	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
1081 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1082 		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
1083 		inp->inp_laddr.s_addr = INADDR_ANY;
1084 		inp->inp_lport = 0;
1085 		inp->inp_flags &= ~INP_BOUNDFIB;
1086 		return (error);
1087 	}
1088 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1089 	if (anonport)
1090 		inp->inp_flags |= INP_ANONPORT;
1091 	return (0);
1092 }
1093 #endif
1094 
1095 /*
1096  * Connect from a socket to a specified address.
1097  * Both address and port must be specified in argument sin.
1098  * If don't have a local address for this socket yet,
1099  * then pick one.
1100  */
1101 int
1102 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
1103 {
1104 	struct in_addr laddr, faddr;
1105 	u_short lport;
1106 	int error;
1107 	bool anonport;
1108 
1109 	NET_EPOCH_ASSERT();
1110 	INP_WLOCK_ASSERT(inp);
1111 	KASSERT(in_nullhost(inp->inp_faddr),
1112 	    ("%s: inp is already connected", __func__));
1113 	KASSERT(sin->sin_family == AF_INET,
1114 	    ("%s: invalid address family for %p", __func__, sin));
1115 	KASSERT(sin->sin_len == sizeof(*sin),
1116 	    ("%s: invalid address length for %p", __func__, sin));
1117 
1118 	if (sin->sin_port == 0)
1119 		return (EADDRNOTAVAIL);
1120 
1121 	anonport = (inp->inp_lport == 0);
1122 
1123 	if (__predict_false(in_broadcast(sin->sin_addr))) {
1124 		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
1125 			return (ENETUNREACH);
1126 		/*
1127 		 * If the destination address is INADDR_ANY, use the primary
1128 		 * local address.  If the supplied address is INADDR_BROADCAST,
1129 		 * and the primary interface supports broadcast, choose the
1130 		 * broadcast address for that interface.
1131 		 */
1132 		if (in_nullhost(sin->sin_addr)) {
1133 			faddr =
1134 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1135 			if ((error = prison_get_ip4(cred, &faddr)) != 0)
1136 				return (error);
1137 		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
1138 		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
1139 		    & IFF_BROADCAST) {
1140 			faddr = satosin(&CK_STAILQ_FIRST(
1141 			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1142 		} else
1143 			faddr = sin->sin_addr;
1144 	} else
1145 		faddr = sin->sin_addr;
1146 
1147 	INP_HASH_WLOCK(inp->inp_pcbinfo);
1148 	if (in_nullhost(inp->inp_laddr)) {
1149 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
1150 		if (__predict_false(error)) {
1151 			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1152 			return (error);
1153 		}
1154 	} else
1155 		laddr = inp->inp_laddr;
1156 
1157 	if (anonport) {
1158 		struct sockaddr_in lsin = {
1159 			.sin_family = AF_INET,
1160 			.sin_addr = laddr,
1161 		};
1162 		struct sockaddr_in fsin = {
1163 			.sin_family = AF_INET,
1164 			.sin_addr = faddr,
1165 		};
1166 
1167 		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
1168 		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
1169 		    INPLOOKUP_WILDCARD);
1170 		if (__predict_false(error)) {
1171 			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1172 			return (error);
1173 		}
1174 	} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1175 	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
1176 	    NULL) {
1177 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1178 		return (EADDRINUSE);
1179 	} else
1180 		lport = inp->inp_lport;
1181 
1182 	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
1183 	    (inp->inp_flags & INP_UNCONNECTED));
1184 
1185 	inp->inp_faddr = faddr;
1186 	inp->inp_fport = sin->sin_port;
1187 	inp->inp_laddr = laddr;
1188 	inp->inp_lport = lport;
1189 
1190 	if (inp->inp_flags & INP_UNCONNECTED) {
1191 		error = in_pcbinshash(inp);
1192 		MPASS(error == 0);
1193 	} else
1194 		in_pcbrehash(inp);
1195 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1196 
1197 	if (V_fib_hash_outbound) {
1198 		uint32_t hash_val, hash_type;
1199 
1200 		hash_val = fib4_calc_software_hash(inp->inp_laddr,
1201 		    inp->inp_faddr, 0, sin->sin_port,
1202 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
1203 
1204 		inp->inp_flowid = hash_val;
1205 		inp->inp_flowtype = hash_type;
1206 	}
1207 	if (anonport)
1208 		inp->inp_flags |= INP_ANONPORT;
1209 	return (0);
1210 }
1211 
1212 /*
1213  * Do proper source address selection on an unbound socket in case
1214  * of connect. Take jails into account as well.
1215  */
1216 int
1217 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
1218     struct in_addr *laddr, struct ucred *cred)
1219 {
1220 	struct ifaddr *ifa;
1221 	struct sockaddr *sa;
1222 	struct sockaddr_in *sin, dst;
1223 	struct nhop_object *nh;
1224 	int error;
1225 
1226 	NET_EPOCH_ASSERT();
1227 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1228 
1229 	/*
1230 	 * Bypass source address selection and use the primary jail IP
1231 	 * if requested.
1232 	 */
1233 	if (!prison_saddrsel_ip4(cred, laddr))
1234 		return (0);
1235 
1236 	/*
1237 	 * If the destination address is multicast and an outgoing
1238 	 * interface has been set as a multicast option, prefer the
1239 	 * address of that interface as our source address.
1240 	 */
1241 	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
1242 	    inp->inp_moptions->imo_multicast_ifp != NULL) {
1243 		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
1244 		struct in_ifaddr *ia;
1245 
1246 		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1247 			if (ia->ia_ifp == ifp &&
1248 			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
1249 				break;
1250 		}
1251 		if (ia == NULL)
1252 			return (EADDRNOTAVAIL);
1253 		*laddr = ia->ia_addr.sin_addr;
1254 		return (0);
1255 	}
1256 
1257 	error = 0;
1258 
1259 	nh = NULL;
1260 	bzero(&dst, sizeof(dst));
1261 	sin = &dst;
1262 	sin->sin_family = AF_INET;
1263 	sin->sin_len = sizeof(struct sockaddr_in);
1264 	sin->sin_addr.s_addr = faddr->s_addr;
1265 
1266 	/*
1267 	 * If route is known our src addr is taken from the i/f,
1268 	 * else punt.
1269 	 *
1270 	 * Find out route to destination.
1271 	 */
1272 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1273 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1274 		    0, NHR_NONE, 0);
1275 
1276 	/*
1277 	 * If we found a route, use the address corresponding to
1278 	 * the outgoing interface.
1279 	 *
1280 	 * Otherwise assume faddr is reachable on a directly connected
1281 	 * network and try to find a corresponding interface to take
1282 	 * the source address from.
1283 	 */
1284 	if (nh == NULL || nh->nh_ifp == NULL) {
1285 		struct in_ifaddr *ia;
1286 		struct ifnet *ifp;
1287 
1288 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1289 					inp->inp_socket->so_fibnum));
1290 		if (ia == NULL) {
1291 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1292 						inp->inp_socket->so_fibnum));
1293 		}
1294 		if (ia == NULL) {
1295 			error = ENETUNREACH;
1296 			goto done;
1297 		}
1298 
1299 		if (!prison_flag(cred, PR_IP4)) {
1300 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1301 			goto done;
1302 		}
1303 
1304 		ifp = ia->ia_ifp;
1305 		ia = NULL;
1306 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1307 			sa = ifa->ifa_addr;
1308 			if (sa->sa_family != AF_INET)
1309 				continue;
1310 			sin = (struct sockaddr_in *)sa;
1311 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1312 				ia = (struct in_ifaddr *)ifa;
1313 				break;
1314 			}
1315 		}
1316 		if (ia != NULL) {
1317 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1318 			goto done;
1319 		}
1320 
1321 		/* 3. As a last resort return the 'default' jail address. */
1322 		error = prison_get_ip4(cred, laddr);
1323 		goto done;
1324 	}
1325 
1326 	/*
1327 	 * If the outgoing interface on the route found is not
1328 	 * a loopback interface, use the address from that interface.
1329 	 * In case of jails do those three steps:
1330 	 * 1. check if the interface address belongs to the jail. If so use it.
1331 	 * 2. check if we have any address on the outgoing interface
1332 	 *    belonging to this jail. If so use it.
1333 	 * 3. as a last resort return the 'default' jail address.
1334 	 */
1335 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1336 		struct in_ifaddr *ia;
1337 		struct ifnet *ifp;
1338 
1339 		/* If not jailed, use the default returned. */
1340 		if (!prison_flag(cred, PR_IP4)) {
1341 			ia = (struct in_ifaddr *)nh->nh_ifa;
1342 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1343 			goto done;
1344 		}
1345 
1346 		/* Jailed. */
1347 		/* 1. Check if the iface address belongs to the jail. */
1348 		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1349 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1350 			ia = (struct in_ifaddr *)nh->nh_ifa;
1351 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1352 			goto done;
1353 		}
1354 
1355 		/*
1356 		 * 2. Check if we have any address on the outgoing interface
1357 		 *    belonging to this jail.
1358 		 */
1359 		ia = NULL;
1360 		ifp = nh->nh_ifp;
1361 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1362 			sa = ifa->ifa_addr;
1363 			if (sa->sa_family != AF_INET)
1364 				continue;
1365 			sin = (struct sockaddr_in *)sa;
1366 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1367 				ia = (struct in_ifaddr *)ifa;
1368 				break;
1369 			}
1370 		}
1371 		if (ia != NULL) {
1372 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1373 			goto done;
1374 		}
1375 
1376 		/* 3. As a last resort return the 'default' jail address. */
1377 		error = prison_get_ip4(cred, laddr);
1378 		goto done;
1379 	}
1380 
1381 	/*
1382 	 * The outgoing interface is marked with 'loopback net', so a route
1383 	 * to ourselves is here.
1384 	 * Try to find the interface of the destination address and then
1385 	 * take the address from there. That interface is not necessarily
1386 	 * a loopback interface.
1387 	 * In case of jails, check that it is an address of the jail
1388 	 * and if we cannot find, fall back to the 'default' jail address.
1389 	 */
1390 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1391 		struct in_ifaddr *ia;
1392 
1393 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1394 					inp->inp_socket->so_fibnum));
1395 		if (ia == NULL)
1396 			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1397 						inp->inp_socket->so_fibnum));
1398 		if (ia == NULL)
1399 			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1400 
1401 		if (!prison_flag(cred, PR_IP4)) {
1402 			if (ia == NULL) {
1403 				error = ENETUNREACH;
1404 				goto done;
1405 			}
1406 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1407 			goto done;
1408 		}
1409 
1410 		/* Jailed. */
1411 		if (ia != NULL) {
1412 			struct ifnet *ifp;
1413 
1414 			ifp = ia->ia_ifp;
1415 			ia = NULL;
1416 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1417 				sa = ifa->ifa_addr;
1418 				if (sa->sa_family != AF_INET)
1419 					continue;
1420 				sin = (struct sockaddr_in *)sa;
1421 				if (prison_check_ip4(cred,
1422 				    &sin->sin_addr) == 0) {
1423 					ia = (struct in_ifaddr *)ifa;
1424 					break;
1425 				}
1426 			}
1427 			if (ia != NULL) {
1428 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1429 				goto done;
1430 			}
1431 		}
1432 
1433 		/* 3. As a last resort return the 'default' jail address. */
1434 		error = prison_get_ip4(cred, laddr);
1435 		goto done;
1436 	}
1437 
1438 done:
1439 	if (error == 0 && laddr->s_addr == INADDR_ANY)
1440 		return (EHOSTUNREACH);
1441 	return (error);
1442 }
1443 
1444 void
1445 in_pcbdisconnect(struct inpcb *inp)
1446 {
1447 
1448 	INP_WLOCK_ASSERT(inp);
1449 	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
1450 	    ("%s: inp %p was already disconnected", __func__, inp));
1451 
1452 	if (inp->inp_flags & INP_UNCONNECTED)
1453 		return;
1454 
1455 	INP_HASH_WLOCK(inp->inp_pcbinfo);
1456 	in_pcbremhash(inp);
1457 	CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
1458 	    inp_unconn_list);
1459 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1460 	inp->inp_flags |= INP_UNCONNECTED;
1461 
1462 	if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1463 		/* See the comment in in_pcbinshash(). */
1464 		inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
1465 		inp->inp_faddr.s_addr = INADDR_ANY;
1466 		inp->inp_fport = 0;
1467 	}
1468 }
1469 #endif /* INET */
1470 
1471 void
1472 in_pcblisten(struct inpcb *inp)
1473 {
1474 	struct inpcblbgroup *grp;
1475 
1476 	INP_WLOCK_ASSERT(inp);
1477 
1478 	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
1479 		struct inpcbinfo *pcbinfo;
1480 
1481 		pcbinfo = inp->inp_pcbinfo;
1482 		INP_HASH_WLOCK(pcbinfo);
1483 		grp = in_pcblbgroup_find(inp);
1484 		LIST_REMOVE(inp, inp_lbgroup_list);
1485 		grp->il_pendcnt--;
1486 		in_pcblbgroup_insert(grp, inp);
1487 		INP_HASH_WUNLOCK(pcbinfo);
1488 	}
1489 }
1490 
1491 /*
1492  * inpcb hash lookups are protected by SMR section.
1493  *
1494  * Once desired pcb has been found, switching from SMR section to a pcb
1495  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1496  * here because SMR is a critical section.
1497  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1498  */
1499 void
1500 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1501 {
1502 
1503 	lock == INPLOOKUP_RLOCKPCB ?
1504 	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1505 }
1506 
1507 void
1508 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1509 {
1510 
1511 	lock == INPLOOKUP_RLOCKPCB ?
1512 	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1513 }
1514 
1515 int
1516 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1517 {
1518 
1519 	return (lock == INPLOOKUP_RLOCKPCB ?
1520 	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1521 }
1522 
1523 static inline bool
1524 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
1525 {
1526 
1527 	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
1528 	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
1529 
1530 	if (__predict_true(inp_trylock(inp, lock))) {
1531 		if (__predict_false(inp->inp_flags & ignflags)) {
1532 			smr_exit(inp->inp_pcbinfo->ipi_smr);
1533 			inp_unlock(inp, lock);
1534 			return (false);
1535 		}
1536 		smr_exit(inp->inp_pcbinfo->ipi_smr);
1537 		return (true);
1538 	}
1539 
1540 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1541 		smr_exit(inp->inp_pcbinfo->ipi_smr);
1542 		inp_lock(inp, lock);
1543 		if (__predict_false(in_pcbrele(inp, lock)))
1544 			return (false);
1545 		/*
1546 		 * inp acquired through refcount & lock for sure didn't went
1547 		 * through uma_zfree().  However, it may have already went
1548 		 * through in_pcbfree() and has another reference, that
1549 		 * prevented its release by our in_pcbrele().
1550 		 */
1551 		if (__predict_false(inp->inp_flags & ignflags)) {
1552 			inp_unlock(inp, lock);
1553 			return (false);
1554 		}
1555 		return (true);
1556 	} else {
1557 		smr_exit(inp->inp_pcbinfo->ipi_smr);
1558 		return (false);
1559 	}
1560 }
1561 
1562 bool
1563 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1564 {
1565 
1566 	/*
1567 	 * in_pcblookup() family of functions shall ignore not onlu pcbs that
1568 	 * had been freed that may be found due to lockless access to the hash,
1569 	 * but also pcbs that were removed from the hash, but are still around.
1570 	 */
1571 	return (_inp_smr_lock(inp, lock, INP_FREED | INP_UNCONNECTED));
1572 }
1573 
1574 /*
1575  * inp_next() - inpcb hash/list traversal iterator
1576  *
1577  * Requires initialized struct inpcb_iterator for context.
1578  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1579  *
1580  * - Iterator can have either write-lock or read-lock semantics, that can not
1581  *   be changed later.
 * - Iterator has three modes of operation, defined by value of .mode member
 *   on the first call:
 *   - .mode = INP_ALL_LIST: the iterator will go through the unconnected
 *     list, then all wildcard hash slots and then all exact hash slots.
 *   - .mode = INP_UNCONN_LIST: the iterator will go through the list of
 *     unconnected pcbs only.
 *   - .mode initialized with a non-negative value: iterator will go
 *     through this exact hash slot only.
1590  *   Note: only rip_input() and sysctl_setsockopt() use the latter.
1591  *   The interface may be extended for iteration over single wildcard hash
1592  *   slot, but there is no use case for that today.
1593  * - Iterator may have optional bool matching function.  The matching function
1594  *   will be executed for each inpcb in the SMR context, so it can not acquire
1595  *   locks and can safely access only immutable fields of inpcb.
1596  *
1597  * A fresh initialized iterator has NULL inpcb in its context and that
1598  * means that inp_next() call would return the very first inpcb on the list
1599  * locked with desired semantic.  In all following calls the context pointer
1600  * shall hold the current inpcb pointer.  The KPI user is not supposed to
1601  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
1602  * and write NULL to its context.  After end of traversal an iterator can be
1603  * reused.
1604  *
1605  * List traversals have the following features/constraints:
1606  * - New entries won't be seen, as they are always added to the head of a list.
1607  * - Removed entries won't stop traversal as long as they are not added to
1608  *   a different list. This is violated by in_pcbrehash().
1609  */
1610 static inline struct inpcb *
1611 ii_list_first(const struct inpcb_iterator *ii)
1612 {
1613 	const struct inpcbinfo *ipi = ii->ipi;
1614 	const int hash = ii->hash;
1615 
1616 	if (hash < 0)
1617 		return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
1618 	else if (hash <= ipi->ipi_hashmask)
1619 		return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
1620 	else
1621 		return (CK_LIST_FIRST(
1622 		    &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
1623 }
1624 
1625 static inline struct inpcb *
1626 ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
1627 {
1628 	if (ii->hash < 0)
1629 		return (CK_LIST_NEXT(inp, inp_unconn_list));
1630 	else if (ii->hash <= ii->ipi->ipi_hashmask)
1631 		return (CK_LIST_NEXT(inp, inp_hash_wild));
1632 	else
1633 		return (CK_LIST_NEXT(inp, inp_hash_exact));
1634 }
1635 
1636 struct inpcb *
1637 inp_next(struct inpcb_iterator *ii)
1638 {
1639 	const struct inpcbinfo *ipi = ii->ipi;
1640 	const int hashmax = (ipi->ipi_hashmask + 1) * 2;
1641 	inp_match_t *match = ii->match;
1642 	void *ctx = ii->ctx;
1643 	inp_lookup_t lock = ii->lock;
1644 	struct inpcb *inp;
1645 
1646 	if (ii->inp == NULL) {		/* First call. */
1647 		if ((ii->hash = ii->mode) >= 0) {
1648 			/* Targeted iterators support only the exact hash. */
1649 			MPASS(ii->hash <= ipi->ipi_hashmask);
1650 			ii->hash += ipi->ipi_hashmask + 1;
1651 		}
1652 		smr_enter(ipi->ipi_smr);
1653 next_first:
1654 		/* This is unrolled CK_LIST_FOREACH() over different headers. */
1655 		for (inp = ii_list_first(ii);
1656 		    inp != NULL;
1657 		    inp = ii_list_next(ii, inp)) {
1658 			if (match != NULL && (match)(inp, ctx) == false)
1659 				continue;
1660 			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
1661 				break;
1662 			else {
1663 				smr_enter(ipi->ipi_smr);
1664 				MPASS(inp != ii_list_first(ii));
1665 				inp = ii_list_first(ii);
1666 				if (inp == NULL)
1667 					break;
1668 			}
1669 		}
1670 
1671 		if (inp == NULL) {
1672 			if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
1673 				goto next_first;
1674 			smr_exit(ipi->ipi_smr);
1675 		} else
1676 			ii->inp = inp;
1677 
1678 		return (inp);
1679 	}
1680 
1681 	/* Not a first call. */
1682 	smr_enter(ipi->ipi_smr);
1683 restart:
1684 	inp = ii->inp;
1685 	rw_assert(&inp->inp_lock,
1686 	    lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
1687 next:
1688 	inp = ii_list_next(ii, inp);
1689 	if (inp == NULL) {
1690 		if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
1691 			inp_unlock(ii->inp, lock);
1692 			ii->inp = NULL;
1693 			goto next_first;
1694 		}
1695 		smr_exit(ipi->ipi_smr);
1696 		goto found;
1697 	}
1698 
1699 	if (match != NULL && (match)(inp, ctx) == false)
1700 		goto next;
1701 
1702 	if (__predict_true(inp_trylock(inp, lock))) {
1703 		if (__predict_false(inp->inp_flags & INP_FREED)) {
1704 			/*
1705 			 * Entries are never inserted in middle of a list, thus
1706 			 * as long as we are in SMR, we can continue traversal.
1707 			 * Jump to 'next' should yield in the same result, but
1708 			 * could produce unnecessary looping.  Could this
1709 			 * looping be unbound?
1710 			 */
1711 			inp_unlock(inp, lock);
1712 			goto next;
1713 		} else {
1714 			smr_exit(ipi->ipi_smr);
1715 			goto found;
1716 		}
1717 	}
1718 
1719 	/*
1720 	 * Can't obtain lock immediately, thus going hard.  Once we exit the
1721 	 * SMR section we can no longer jump to 'next', and our only stable
1722 	 * anchoring point is ii->inp, which we keep locked for this case, so
1723 	 * we jump to 'restart'.
1724 	 */
1725 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
1726 		smr_exit(ipi->ipi_smr);
1727 		inp_lock(inp, lock);
1728 		if (__predict_false(in_pcbrele(inp, lock))) {
1729 			smr_enter(ipi->ipi_smr);
1730 			goto restart;
1731 		}
1732 		/*
1733 		 * See comment in inp_smr_lock().
1734 		 */
1735 		if (__predict_false(inp->inp_flags & INP_FREED)) {
1736 			inp_unlock(inp, lock);
1737 			smr_enter(ipi->ipi_smr);
1738 			goto restart;
1739 		}
1740 	} else
1741 		goto next;
1742 
1743 found:
1744 	inp_unlock(ii->inp, lock);
1745 	ii->inp = inp;
1746 
1747 	return (ii->inp);
1748 }
1749 
1750 /*
1751  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1752  * stability of an inpcb pointer despite the inpcb lock being released or
1753  * SMR section exited.
1754  *
1755  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1756  */
1757 void
1758 in_pcbref(struct inpcb *inp)
1759 {
1760 	u_int old __diagused;
1761 
1762 	old = refcount_acquire(&inp->inp_refcount);
1763 	KASSERT(old > 0, ("%s: refcount 0", __func__));
1764 }
1765 
1766 /*
1767  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1768  * freeing the pcb, if the reference was very last.
1769  */
1770 bool
1771 in_pcbrele_rlocked(struct inpcb *inp)
1772 {
1773 
1774 	INP_RLOCK_ASSERT(inp);
1775 
1776 	if (!refcount_release(&inp->inp_refcount))
1777 		return (false);
1778 
1779 	MPASS(inp->inp_flags & INP_FREED);
1780 	MPASS(inp->inp_socket == NULL);
1781 	crfree(inp->inp_cred);
1782 #ifdef INVARIANTS
1783 	inp->inp_cred = NULL;
1784 #endif
1785 	INP_RUNLOCK(inp);
1786 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1787 	return (true);
1788 }
1789 
1790 bool
1791 in_pcbrele_wlocked(struct inpcb *inp)
1792 {
1793 
1794 	INP_WLOCK_ASSERT(inp);
1795 
1796 	if (!refcount_release(&inp->inp_refcount))
1797 		return (false);
1798 
1799 	MPASS(inp->inp_flags & INP_FREED);
1800 	MPASS(inp->inp_socket == NULL);
1801 	crfree(inp->inp_cred);
1802 #ifdef INVARIANTS
1803 	inp->inp_cred = NULL;
1804 #endif
1805 	INP_WUNLOCK(inp);
1806 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1807 	return (true);
1808 }
1809 
1810 bool
1811 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1812 {
1813 
1814 	return (lock == INPLOOKUP_RLOCKPCB ?
1815 	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1816 }
1817 
1818 /*
1819  * Dereference and rlock inp, for which the caller must own the
1820  * reference.  Returns true if inp no longer usable, false otherwise.
1821  */
1822 bool
1823 in_pcbrele_rlock(struct inpcb *inp)
1824 {
1825 	INP_RLOCK(inp);
1826 	if (in_pcbrele_rlocked(inp))
1827 		return (true);
1828 	if ((inp->inp_flags & INP_FREED) != 0) {
1829 		INP_RUNLOCK(inp);
1830 		return (true);
1831 	}
1832 	return (false);
1833 }
1834 
1835 /*
1836  * Unconditionally schedule an inpcb to be freed by decrementing its
1837  * reference count, which should occur only after the inpcb has been detached
1838  * from its socket.  If another thread holds a temporary reference (acquired
1839  * using in_pcbref()) then the free is deferred until that reference is
1840  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1841  *  Almost all work, including removal from global lists, is done in this
1842  * context, where the pcbinfo lock is held.
1843  */
1844 void
1845 in_pcbfree(struct inpcb *inp)
1846 {
1847 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1848 #ifdef INET
1849 	struct ip_moptions *imo;
1850 #endif
1851 #ifdef INET6
1852 	struct ip6_moptions *im6o;
1853 #endif
1854 
1855 	INP_WLOCK_ASSERT(inp);
1856 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1857 	KASSERT((inp->inp_flags & INP_FREED) == 0,
1858 	    ("%s: called twice for pcb %p", __func__, inp));
1859 
1860 	/*
1861 	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
1862 	 * from the hash without acquiring inpcb lock, they rely on the hash
1863 	 * lock, thus in_pcbremhash() should be the first action.
1864 	 */
1865 	INP_HASH_WLOCK(pcbinfo);
1866 	if (inp->inp_flags & INP_UNCONNECTED)
1867 		CK_LIST_REMOVE(inp, inp_unconn_list);
1868 	else
1869 		in_pcbremhash(inp);
1870 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1871 	pcbinfo->ipi_count--;
1872 	INP_HASH_WUNLOCK(pcbinfo);
1873 
1874 #ifdef RATELIMIT
1875 	if (inp->inp_snd_tag != NULL)
1876 		in_pcbdetach_txrtlmt(inp);
1877 #endif
1878 	inp->inp_flags |= INP_FREED;
1879 	inp->inp_socket->so_pcb = NULL;
1880 	inp->inp_socket = NULL;
1881 
1882 	RO_INVALIDATE_CACHE(&inp->inp_route);
1883 #ifdef MAC
1884 	mac_inpcb_destroy(inp);
1885 #endif
1886 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1887 	if (inp->inp_sp != NULL)
1888 		ipsec_delete_pcbpolicy(inp);
1889 #endif
1890 #ifdef INET
1891 	if (inp->inp_options)
1892 		(void)m_free(inp->inp_options);
1893 	DEBUG_POISON_POINTER(inp->inp_options);
1894 	imo = inp->inp_moptions;
1895 	DEBUG_POISON_POINTER(inp->inp_moptions);
1896 #endif
1897 #ifdef INET6
1898 	if (inp->inp_vflag & INP_IPV6PROTO) {
1899 		ip6_freepcbopts(inp->in6p_outputopts);
1900 		DEBUG_POISON_POINTER(inp->in6p_outputopts);
1901 		im6o = inp->in6p_moptions;
1902 		DEBUG_POISON_POINTER(inp->in6p_moptions);
1903 	} else
1904 		im6o = NULL;
1905 #endif
1906 
1907 	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
1908 		INP_WUNLOCK(inp);
1909 	}
1910 #ifdef INET6
1911 	ip6_freemoptions(im6o);
1912 #endif
1913 #ifdef INET
1914 	inp_freemoptions(imo);
1915 #endif
1916 }
1917 
/*
 * Protocols set up their inpcbs in different ways, e.g. they name the
 * lock differently, but disposal is common to all of them: destroy the
 * inpcb lock.  The size argument is not used.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *pcb;

	pcb = mem;
	INP_LOCK_DESTROY(pcb);
}
1929 
1930 #ifdef INET
1931 /*
1932  * Common routines to return the socket addresses associated with inpcbs.
1933  */
1934 int
1935 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1936 {
1937 	struct inpcb *inp;
1938 
1939 	inp = sotoinpcb(so);
1940 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1941 
1942 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1943 		.sin_len = sizeof(struct sockaddr_in),
1944 		.sin_family = AF_INET,
1945 		.sin_port = inp->inp_lport,
1946 		.sin_addr = inp->inp_laddr,
1947 	};
1948 
1949 	return (0);
1950 }
1951 
1952 int
1953 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1954 {
1955 	struct inpcb *inp;
1956 
1957 	inp = sotoinpcb(so);
1958 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1959 
1960 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1961 		.sin_len = sizeof(struct sockaddr_in),
1962 		.sin_family = AF_INET,
1963 		.sin_port = inp->inp_fport,
1964 		.sin_addr = inp->inp_faddr,
1965 	};
1966 
1967 	return (0);
1968 }
1969 
1970 static bool
1971 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1972 {
1973 
1974 	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1975 		return (true);
1976 	else
1977 		return (false);
1978 }
1979 
1980 void
1981 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1982 {
1983 	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
1984 	    inp_v4_multi_match, NULL);
1985 	struct inpcb *inp;
1986 	struct in_multi *inm;
1987 	struct in_mfilter *imf;
1988 	struct ip_moptions *imo;
1989 
1990 	IN_MULTI_LOCK_ASSERT();
1991 
1992 	while ((inp = inp_next(&inpi)) != NULL) {
1993 		INP_WLOCK_ASSERT(inp);
1994 
1995 		imo = inp->inp_moptions;
1996 		/*
1997 		 * Unselect the outgoing interface if it is being
1998 		 * detached.
1999 		 */
2000 		if (imo->imo_multicast_ifp == ifp)
2001 			imo->imo_multicast_ifp = NULL;
2002 
2003 		/*
2004 		 * Drop multicast group membership if we joined
2005 		 * through the interface being detached.
2006 		 *
2007 		 * XXX This can all be deferred to an epoch_call
2008 		 */
2009 restart:
2010 		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
2011 			if ((inm = imf->imf_inm) == NULL)
2012 				continue;
2013 			if (inm->inm_ifp != ifp)
2014 				continue;
2015 			ip_mfilter_remove(&imo->imo_head, imf);
2016 			in_leavegroup_locked(inm, NULL);
2017 			ip_mfilter_free(imf);
2018 			goto restart;
2019 		}
2020 	}
2021 }
2022 
2023 /*
2024  * Lookup a PCB based on the local address and port.  Caller must hold the
2025  * hash lock.  No inpcb locks or references are acquired.
2026  */
2027 #define INP_LOOKUP_MAPPED_PCB_COST	3
2028 struct inpcb *
2029 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2030     u_short lport, int fib, int lookupflags, struct ucred *cred)
2031 {
2032 	struct inpcb *inp;
2033 #ifdef INET6
2034 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
2035 #else
2036 	int matchwild = 3;
2037 #endif
2038 	int wildcard;
2039 
2040 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
2041 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2042 	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
2043 	    ("%s: invalid fib %d", __func__, fib));
2044 
2045 	INP_HASH_LOCK_ASSERT(pcbinfo);
2046 
2047 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
2048 		struct inpcbhead *head;
2049 		/*
2050 		 * Look for an unconnected (wildcard foreign addr) PCB that
2051 		 * matches the local address and port we're looking for.
2052 		 */
2053 		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
2054 		    pcbinfo->ipi_hashmask)];
2055 		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
2056 #ifdef INET6
2057 			/* XXX inp locking */
2058 			if ((inp->inp_vflag & INP_IPV4) == 0)
2059 				continue;
2060 #endif
2061 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
2062 			    inp->inp_laddr.s_addr == laddr.s_addr &&
2063 			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
2064 			    inp->inp_inc.inc_fibnum == fib)) {
2065 				/*
2066 				 * Found?
2067 				 */
2068 				if (prison_equal_ip4(cred->cr_prison,
2069 				    inp->inp_cred->cr_prison))
2070 					return (inp);
2071 			}
2072 		}
2073 		/*
2074 		 * Not found.
2075 		 */
2076 		return (NULL);
2077 	} else {
2078 		struct inpcbhead *porthash;
2079 		struct inpcb *match = NULL;
2080 
2081 		/*
2082 		 * Port is in use by one or more PCBs. Look for best
2083 		 * fit.
2084 		 */
2085 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
2086 		    pcbinfo->ipi_porthashmask)];
2087 		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
2088 			if (inp->inp_lport != lport)
2089 				continue;
2090 			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
2091 			    cred->cr_prison))
2092 				continue;
2093 			if (fib != RT_ALL_FIBS &&
2094 			    inp->inp_inc.inc_fibnum != fib)
2095 				continue;
2096 			wildcard = 0;
2097 #ifdef INET6
2098 			/* XXX inp locking */
2099 			if ((inp->inp_vflag & INP_IPV4) == 0)
2100 				continue;
2101 			/*
2102 			 * We never select the PCB that has INP_IPV6 flag and
2103 			 * is bound to :: if we have another PCB which is bound
2104 			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
2105 			 * set its cost higher than IPv4 only PCBs.
2106 			 *
2107 			 * Note that the case only happens when a socket is
2108 			 * bound to ::, under the condition that the use of the
2109 			 * mapped address is allowed.
2110 			 */
2111 			if ((inp->inp_vflag & INP_IPV6) != 0)
2112 				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
2113 #endif
2114 			if (inp->inp_faddr.s_addr != INADDR_ANY)
2115 				wildcard++;
2116 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
2117 				if (laddr.s_addr == INADDR_ANY)
2118 					wildcard++;
2119 				else if (inp->inp_laddr.s_addr != laddr.s_addr)
2120 					continue;
2121 			} else {
2122 				if (laddr.s_addr != INADDR_ANY)
2123 					wildcard++;
2124 			}
2125 			if (wildcard < matchwild) {
2126 				match = inp;
2127 				matchwild = wildcard;
2128 				if (matchwild == 0)
2129 					break;
2130 			}
2131 		}
2132 		return (match);
2133 	}
2134 }
2135 #undef INP_LOOKUP_MAPPED_PCB_COST
2136 
2137 static bool
2138 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2139 {
2140 	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2141 	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2142 }
2143 
2144 static struct inpcb *
2145 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2146     const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
2147     uint16_t lport, int domain, int fib)
2148 {
2149 	const struct inpcblbgrouphead *hdr;
2150 	struct inpcblbgroup *grp;
2151 	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
2152 	struct inpcb *inp;
2153 	u_int count;
2154 
2155 	INP_HASH_LOCK_ASSERT(pcbinfo);
2156 	NET_EPOCH_ASSERT();
2157 
2158 	hdr = &pcbinfo->ipi_lbgrouphashbase[
2159 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)];
2160 
2161 	/*
2162 	 * Search for an LB group match based on the following criteria:
2163 	 * - prefer jailed groups to non-jailed groups
2164 	 * - prefer exact source address matches to wildcard matches
2165 	 * - prefer groups bound to the specified NUMA domain
2166 	 */
2167 	jail_exact = jail_wild = local_exact = local_wild = NULL;
2168 	CK_LIST_FOREACH(grp, hdr, il_list) {
2169 		bool injail;
2170 
2171 #ifdef INET6
2172 		if (!(grp->il_vflag & INP_IPV4))
2173 			continue;
2174 #endif
2175 		if (grp->il_lport != lport)
2176 			continue;
2177 
2178 		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
2179 		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
2180 		    laddr) != 0)
2181 			continue;
2182 
2183 		if (grp->il_laddr.s_addr == laddr->s_addr) {
2184 			if (injail) {
2185 				jail_exact = grp;
2186 				if (in_pcblookup_lb_match(grp, domain, fib))
2187 					/* This is a perfect match. */
2188 					goto out;
2189 			} else if (local_exact == NULL ||
2190 			    in_pcblookup_lb_match(grp, domain, fib)) {
2191 				local_exact = grp;
2192 			}
2193 		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
2194 			if (injail) {
2195 				if (jail_wild == NULL ||
2196 				    in_pcblookup_lb_match(grp, domain, fib))
2197 					jail_wild = grp;
2198 			} else if (local_wild == NULL ||
2199 			    in_pcblookup_lb_match(grp, domain, fib)) {
2200 				local_wild = grp;
2201 			}
2202 		}
2203 	}
2204 
2205 	if (jail_exact != NULL)
2206 		grp = jail_exact;
2207 	else if (jail_wild != NULL)
2208 		grp = jail_wild;
2209 	else if (local_exact != NULL)
2210 		grp = local_exact;
2211 	else
2212 		grp = local_wild;
2213 	if (grp == NULL)
2214 		return (NULL);
2215 
2216 out:
2217 	/*
2218 	 * Synchronize with in_pcblbgroup_insert().
2219 	 */
2220 	count = atomic_load_acq_int(&grp->il_inpcnt);
2221 	if (count == 0)
2222 		return (NULL);
2223 	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
2224 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
2225 	return (inp);
2226 }
2227 
2228 static bool
2229 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2230     u_short fport, struct in_addr laddr, u_short lport)
2231 {
2232 #ifdef INET6
2233 	/* XXX inp locking */
2234 	if ((inp->inp_vflag & INP_IPV4) == 0)
2235 		return (false);
2236 #endif
2237 	if (inp->inp_faddr.s_addr == faddr.s_addr &&
2238 	    inp->inp_laddr.s_addr == laddr.s_addr &&
2239 	    inp->inp_fport == fport &&
2240 	    inp->inp_lport == lport)
2241 		return (true);
2242 	return (false);
2243 }
2244 
2245 static struct inpcb *
2246 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2247     u_short fport, struct in_addr laddr, u_short lport)
2248 {
2249 	struct inpcbhead *head;
2250 	struct inpcb *inp;
2251 
2252 	INP_HASH_LOCK_ASSERT(pcbinfo);
2253 
2254 	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
2255 	    pcbinfo->ipi_hashmask)];
2256 	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
2257 		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
2258 			return (inp);
2259 	}
2260 	return (NULL);
2261 }
2262 
/*
 * Result of comparing an unconnected inpcb against a local address and port,
 * as returned by in_pcblookup_wild_match().
 */
typedef enum {
	INPLOOKUP_MATCH_NONE	= 0,	/* inpcb does not match */
	INPLOOKUP_MATCH_WILD	= 1,	/* inpcb's local address is INADDR_ANY */
	INPLOOKUP_MATCH_LADDR	= 2,	/* inpcb's local address is equal */
} inp_lookup_match_t;
2268 
2269 static inp_lookup_match_t
2270 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
2271     u_short lport, int fib)
2272 {
2273 #ifdef INET6
2274 	/* XXX inp locking */
2275 	if ((inp->inp_vflag & INP_IPV4) == 0)
2276 		return (INPLOOKUP_MATCH_NONE);
2277 #endif
2278 	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
2279 		return (INPLOOKUP_MATCH_NONE);
2280 	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
2281 		return (INPLOOKUP_MATCH_NONE);
2282 	if (inp->inp_laddr.s_addr == INADDR_ANY)
2283 		return (INPLOOKUP_MATCH_WILD);
2284 	if (inp->inp_laddr.s_addr == laddr.s_addr)
2285 		return (INPLOOKUP_MATCH_LADDR);
2286 	return (INPLOOKUP_MATCH_NONE);
2287 }
2288 
2289 #define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)
2290 
2291 static struct inpcb *
2292 in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2293     u_short lport, int fib, const inp_lookup_t lockflags)
2294 {
2295 	struct inpcbhead *head;
2296 	struct inpcb *inp;
2297 
2298 	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
2299 	    ("%s: not in SMR read section", __func__));
2300 
2301 	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
2302 	    pcbinfo->ipi_hashmask)];
2303 	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
2304 		inp_lookup_match_t match;
2305 
2306 		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
2307 		if (match == INPLOOKUP_MATCH_NONE)
2308 			continue;
2309 
2310 		if (__predict_true(inp_smr_lock(inp, lockflags))) {
2311 			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
2312 			if (match != INPLOOKUP_MATCH_NONE &&
2313 			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
2314 			    &laddr) == 0)
2315 				return (inp);
2316 			inp_unlock(inp, lockflags);
2317 		}
2318 
2319 		/*
2320 		 * The matching socket disappeared out from under us.  Fall back
2321 		 * to a serialized lookup.
2322 		 */
2323 		return (INP_LOOKUP_AGAIN);
2324 	}
2325 	return (NULL);
2326 }
2327 
2328 static struct inpcb *
2329 in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
2330     u_short lport, int fib)
2331 {
2332 	struct inpcbhead *head;
2333 	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
2334 #ifdef INET6
2335 	struct inpcb *local_wild_mapped;
2336 #endif
2337 
2338 	INP_HASH_LOCK_ASSERT(pcbinfo);
2339 
2340 	/*
2341 	 * Order of socket selection - we always prefer jails.
2342 	 *      1. jailed, non-wild.
2343 	 *      2. jailed, wild.
2344 	 *      3. non-jailed, non-wild.
2345 	 *      4. non-jailed, wild.
2346 	 */
2347 	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
2348 	    pcbinfo->ipi_hashmask)];
2349 	local_wild = local_exact = jail_wild = NULL;
2350 #ifdef INET6
2351 	local_wild_mapped = NULL;
2352 #endif
2353 	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
2354 		inp_lookup_match_t match;
2355 		bool injail;
2356 
2357 		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
2358 		if (match == INPLOOKUP_MATCH_NONE)
2359 			continue;
2360 
2361 		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
2362 		if (injail) {
2363 			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
2364 			    &laddr) != 0)
2365 				continue;
2366 		} else {
2367 			if (local_exact != NULL)
2368 				continue;
2369 		}
2370 
2371 		if (match == INPLOOKUP_MATCH_LADDR) {
2372 			if (injail)
2373 				return (inp);
2374 			local_exact = inp;
2375 		} else {
2376 #ifdef INET6
2377 			/* XXX inp locking, NULL check */
2378 			if (inp->inp_vflag & INP_IPV6PROTO)
2379 				local_wild_mapped = inp;
2380 			else
2381 #endif
2382 				if (injail)
2383 					jail_wild = inp;
2384 				else
2385 					local_wild = inp;
2386 		}
2387 	}
2388 	if (jail_wild != NULL)
2389 		return (jail_wild);
2390 	if (local_exact != NULL)
2391 		return (local_exact);
2392 	if (local_wild != NULL)
2393 		return (local_wild);
2394 #ifdef INET6
2395 	if (local_wild_mapped != NULL)
2396 		return (local_wild_mapped);
2397 #endif
2398 	return (NULL);
2399 }
2400 
2401 /*
2402  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2403  * that the caller has either locked the hash list, which usually happens
2404  * for bind(2) operations, or is in SMR section, which happens when sorting
2405  * out incoming packets.
2406  */
2407 static struct inpcb *
2408 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2409     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2410     uint8_t numa_domain, int fib)
2411 {
2412 	struct inpcb *inp;
2413 	const u_short fport = fport_arg, lport = lport_arg;
2414 
2415 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
2416 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2417 	KASSERT(faddr.s_addr != INADDR_ANY,
2418 	    ("%s: invalid foreign address", __func__));
2419 	KASSERT(laddr.s_addr != INADDR_ANY,
2420 	    ("%s: invalid local address", __func__));
2421 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2422 
2423 	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
2424 	if (inp != NULL)
2425 		return (inp);
2426 
2427 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2428 		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
2429 		    &laddr, lport, numa_domain, fib);
2430 		if (inp == NULL) {
2431 			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
2432 			    lport, fib);
2433 		}
2434 	}
2435 
2436 	return (inp);
2437 }
2438 
2439 static struct inpcb *
2440 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2441     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2442     uint8_t numa_domain, int fib)
2443 {
2444 	struct inpcb *inp;
2445 	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
2446 
2447 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2448 	    ("%s: LOCKPCB not set", __func__));
2449 
2450 	INP_HASH_WLOCK(pcbinfo);
2451 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
2452 	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
2453 	if (inp != NULL && !inp_trylock(inp, lockflags)) {
2454 		in_pcbref(inp);
2455 		INP_HASH_WUNLOCK(pcbinfo);
2456 		inp_lock(inp, lockflags);
2457 		if (in_pcbrele(inp, lockflags))
2458 			/* XXX-MJ or retry until we get a negative match? */
2459 			inp = NULL;
2460 	} else {
2461 		INP_HASH_WUNLOCK(pcbinfo);
2462 	}
2463 	return (inp);
2464 }
2465 
2466 static struct inpcb *
2467 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2468     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2469     uint8_t numa_domain, int fib)
2470 {
2471 	struct inpcb *inp;
2472 	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
2473 	const u_short fport = fport_arg, lport = lport_arg;
2474 
2475 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2476 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2477 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2478 	    ("%s: LOCKPCB not set", __func__));
2479 
2480 	smr_enter(pcbinfo->ipi_smr);
2481 	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
2482 	if (inp != NULL) {
2483 		if (__predict_true(inp_smr_lock(inp, lockflags))) {
2484 			/*
2485 			 * Revalidate the 4-tuple, the socket could have been
2486 			 * disconnected.
2487 			 */
2488 			if (__predict_true(in_pcblookup_exact_match(inp,
2489 			    faddr, fport, laddr, lport)))
2490 				return (inp);
2491 			inp_unlock(inp, lockflags);
2492 		}
2493 
2494 		/*
2495 		 * We failed to lock the inpcb, or its connection state changed
2496 		 * out from under us.  Fall back to a precise search.
2497 		 */
2498 		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2499 		    lookupflags, numa_domain, fib));
2500 	}
2501 
2502 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2503 		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
2504 		    &laddr, lport, numa_domain, fib);
2505 		if (inp != NULL) {
2506 			if (__predict_true(inp_smr_lock(inp, lockflags))) {
2507 				if (__predict_true(in_pcblookup_wild_match(inp,
2508 				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
2509 					return (inp);
2510 				inp_unlock(inp, lockflags);
2511 			}
2512 			inp = INP_LOOKUP_AGAIN;
2513 		} else {
2514 			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
2515 			    fib, lockflags);
2516 		}
2517 		if (inp == INP_LOOKUP_AGAIN) {
2518 			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
2519 			    lport, lookupflags, numa_domain, fib));
2520 		}
2521 	}
2522 
2523 	if (inp == NULL)
2524 		smr_exit(pcbinfo->ipi_smr);
2525 
2526 	return (inp);
2527 }
2528 
2529 /*
2530  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2531  * from which a pre-calculated hash value may be extracted.
2532  */
2533 struct inpcb *
2534 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2535     struct in_addr laddr, u_int lport, int lookupflags,
2536     struct ifnet *ifp)
2537 {
2538 	int fib;
2539 
2540 	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2541 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2542 	    lookupflags, M_NODOM, fib));
2543 }
2544 
2545 struct inpcb *
2546 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2547     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2548     struct ifnet *ifp __unused, struct mbuf *m)
2549 {
2550 	int fib;
2551 
2552 	M_ASSERTPKTHDR(m);
2553 	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2554 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2555 	    lookupflags, m->m_pkthdr.numa_domain, fib));
2556 }
2557 #endif /* INET */
2558 
2559 static bool
2560 in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2561 {
2562 	return (prison_flag(inp->inp_cred, flag) != 0);
2563 }
2564 
2565 /*
2566  * Insert the PCB into a hash chain using ordering rules which ensure that
2567  * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
2568  *
2569  * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2570  * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
2571  * always appear last no matter whether they are jailed.
2572  */
2573 static void
2574 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
2575 {
2576 	struct inpcb *last;
2577 	bool bound, injail;
2578 
2579 	INP_LOCK_ASSERT(inp);
2580 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
2581 
2582 	last = NULL;
2583 	bound = inp->inp_laddr.s_addr != INADDR_ANY;
2584 	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
2585 		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
2586 			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
2587 				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
2588 				return;
2589 			}
2590 		}
2591 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
2592 		return;
2593 	}
2594 
2595 	injail = in_pcbjailed(inp, PR_IP4);
2596 	if (!injail) {
2597 		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
2598 			if (!in_pcbjailed(last, PR_IP4))
2599 				break;
2600 			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
2601 				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
2602 				return;
2603 			}
2604 		}
2605 	} else if (!CK_LIST_EMPTY(pcbhash) &&
2606 	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
2607 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
2608 		return;
2609 	}
2610 	if (!bound) {
2611 		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
2612 			if (last->inp_laddr.s_addr == INADDR_ANY)
2613 				break;
2614 			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
2615 				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
2616 				return;
2617 			}
2618 		}
2619 	}
2620 	if (last == NULL)
2621 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
2622 	else
2623 		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
2624 }
2625 
2626 #ifdef INET6
2627 /*
2628  * See the comment above _in_pcbinshash_wild().
2629  */
2630 static void
2631 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
2632 {
2633 	struct inpcb *last;
2634 	bool bound, injail;
2635 
2636 	INP_LOCK_ASSERT(inp);
2637 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
2638 
2639 	last = NULL;
2640 	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
2641 	injail = in_pcbjailed(inp, PR_IP6);
2642 	if (!injail) {
2643 		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
2644 			if (!in_pcbjailed(last, PR_IP6))
2645 				break;
2646 			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
2647 				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
2648 				return;
2649 			}
2650 		}
2651 	} else if (!CK_LIST_EMPTY(pcbhash) &&
2652 	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
2653 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
2654 		return;
2655 	}
2656 	if (!bound) {
2657 		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
2658 			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
2659 				break;
2660 			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
2661 				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
2662 				return;
2663 			}
2664 		}
2665 	}
2666 	if (last == NULL)
2667 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
2668 	else
2669 		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
2670 }
2671 #endif
2672 
2673 /*
2674  * Insert PCB onto various hash lists.
2675  *
2676  * With normal sockets this function shall not fail, so it could return void.
2677  * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
2678  * that's the only condition when it can fail.
2679  */
2680 int
2681 in_pcbinshash(struct inpcb *inp)
2682 {
2683 	struct inpcbhead *pcbhash, *pcbporthash;
2684 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2685 	uint32_t hash;
2686 	bool connected;
2687 
2688 	INP_WLOCK_ASSERT(inp);
2689 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2690 	MPASS(inp->inp_flags & INP_UNCONNECTED);
2691 
2692 #ifdef INET6
2693 	if (inp->inp_vflag & INP_IPV6) {
2694 		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
2695 		    inp->inp_fport, pcbinfo->ipi_hashmask);
2696 		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
2697 	} else
2698 #endif
2699 	{
2700 		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
2701 		    inp->inp_fport, pcbinfo->ipi_hashmask);
2702 		connected = !in_nullhost(inp->inp_faddr);
2703 	}
2704 
2705 	if (connected)
2706 		pcbhash = &pcbinfo->ipi_hash_exact[hash];
2707 	else
2708 		pcbhash = &pcbinfo->ipi_hash_wild[hash];
2709 
2710 	pcbporthash = &pcbinfo->ipi_porthashbase[
2711 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2712 
2713 	/*
2714 	 * Ignore SO_REUSEPORT_LB if the socket is connected.  Really this case
2715 	 * should be an error, but for UDP sockets it is not, and some
2716 	 * applications erroneously set it on connected UDP sockets, so we can't
2717 	 * change this without breaking compatibility.
2718 	 */
2719 	if (!connected &&
2720 	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
2721 		int error = in_pcbinslbgrouphash(inp, M_NODOM);
2722 		if (error != 0)
2723 			return (error);
2724 	}
2725 
2726 	/*
2727 	 * The PCB may have been disconnected in the past.  Before we can safely
2728 	 * make it visible in the hash table, we must wait for all readers which
2729 	 * may be traversing this PCB to finish.
2730 	 */
2731 	if (inp->inp_smr != SMR_SEQ_INVALID) {
2732 		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
2733 		inp->inp_smr = SMR_SEQ_INVALID;
2734 	}
2735 
2736 	CK_LIST_REMOVE(inp, inp_unconn_list);
2737 
2738 	if (connected)
2739 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
2740 	else {
2741 #ifdef INET6
2742 		if ((inp->inp_vflag & INP_IPV6) != 0)
2743 			_in6_pcbinshash_wild(pcbhash, inp);
2744 		else
2745 #endif
2746 			_in_pcbinshash_wild(pcbhash, inp);
2747 	}
2748 	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
2749 	inp->inp_flags &= ~INP_UNCONNECTED;
2750 
2751 	return (0);
2752 }
2753 
2754 void
2755 in_pcbremhash(struct inpcb *inp)
2756 {
2757 
2758 	INP_WLOCK_ASSERT(inp);
2759 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
2760 	MPASS(!(inp->inp_flags & INP_UNCONNECTED));
2761 
2762 	if ((inp->inp_flags & INP_INLBGROUP) != 0)
2763 		in_pcbremlbgrouphash(inp);
2764 #ifdef INET6
2765 	if (inp->inp_vflag & INP_IPV6) {
2766 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
2767 			CK_LIST_REMOVE(inp, inp_hash_wild);
2768 		else
2769 			CK_LIST_REMOVE(inp, inp_hash_exact);
2770 	} else
2771 #endif
2772 	{
2773 		if (in_nullhost(inp->inp_faddr))
2774 			CK_LIST_REMOVE(inp, inp_hash_wild);
2775 		else
2776 			CK_LIST_REMOVE(inp, inp_hash_exact);
2777 	}
2778 	CK_LIST_REMOVE(inp, inp_portlist);
2779 }
2780 
2781 /*
2782  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2783  * changed. NOTE: This does not handle the case of the lport changing (the
2784  * hashed port list would have to be updated as well), so the lport must
2785  * not change after in_pcbinshash() has been called.
2786  */
2787 void
2788 in_pcbrehash(struct inpcb *inp)
2789 {
2790 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2791 	struct inpcbhead *head;
2792 	uint32_t hash;
2793 	bool connected;
2794 
2795 	INP_WLOCK_ASSERT(inp);
2796 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2797 	MPASS(!(inp->inp_flags & INP_UNCONNECTED));
2798 	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
2799 	    ("%s: inp was disconnected", __func__));
2800 
2801 #ifdef INET6
2802 	if (inp->inp_vflag & INP_IPV6) {
2803 		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
2804 		    inp->inp_fport, pcbinfo->ipi_hashmask);
2805 		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
2806 	} else
2807 #endif
2808 	{
2809 		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
2810 		    inp->inp_fport, pcbinfo->ipi_hashmask);
2811 		connected = !in_nullhost(inp->inp_faddr);
2812 	}
2813 
2814 	/* See the comment in in_pcbinshash(). */
2815 	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
2816 		in_pcbremlbgrouphash(inp);
2817 
2818 	/*
2819 	 * When rehashing, the caller must ensure that either the new or the old
2820 	 * foreign address was unspecified.
2821 	 */
2822 	if (connected) {
2823 		CK_LIST_REMOVE(inp, inp_hash_wild);
2824 		head = &pcbinfo->ipi_hash_exact[hash];
2825 		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
2826 	} else {
2827 		CK_LIST_REMOVE(inp, inp_hash_exact);
2828 		head = &pcbinfo->ipi_hash_wild[hash];
2829 		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
2830 	}
2831 }
2832 
2833 void
2834 ripcb_connect(struct inpcb *inp)
2835 {
2836 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2837 	uint32_t hash;
2838 
2839 	INP_WLOCK_ASSERT(inp);
2840 	MPASS(inp->inp_flags & INP_UNCONNECTED);
2841 
2842 	hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask;
2843 
2844 	INP_HASH_WLOCK(pcbinfo);
2845 	CK_LIST_REMOVE(inp, inp_unconn_list);
2846 	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp,
2847 	    inp_hash_exact);
2848 	INP_HASH_WUNLOCK(pcbinfo);
2849 	inp->inp_flags &= ~INP_UNCONNECTED;
2850 }
2851 
2852 void
2853 ripcb_disconnect(struct inpcb *inp)
2854 {
2855 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2856 
2857 	INP_WLOCK_ASSERT(inp);
2858 
2859 	if (inp->inp_flags & INP_UNCONNECTED)
2860 		return;
2861 
2862 	INP_HASH_WLOCK(pcbinfo);
2863 	CK_LIST_REMOVE(inp, inp_hash_exact);
2864 	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
2865 	INP_HASH_WUNLOCK(pcbinfo);
2866 	inp->inp_flags |= INP_UNCONNECTED;
2867 }
2868 
2869 /*
2870  * Check for alternatives when higher level complains
2871  * about service problems.  For now, invalidate cached
2872  * routing information.  If the route was created dynamically
2873  * (by a redirect), time to try a default gateway again.
2874  */
2875 void
2876 in_losing(struct inpcb *inp)
2877 {
2878 
2879 	RO_INVALIDATE_CACHE(&inp->inp_route);
2880 	return;
2881 }
2882 
/*
 * A set label operation has occurred at the socket layer; propagate the
 * label change into the inpcb of the socket.  The inpcb write lock is taken
 * first, then the socket lock.  Without MAC compiled in this is a no-op.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2903 
/* Function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}
2910 
/* Function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}
2917 
/* Function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}
2924 
/* Function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2931 
2932 #ifdef INVARIANT_SUPPORT
/*
 * Function wrapper around INP_WLOCK_ASSERT(); note that it is the write lock
 * that is asserted.
 */
void
inp_lock_assert(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
}
2939 
/* Function wrapper around the INP_UNLOCK_ASSERT() macro. */
void
inp_unlock_assert(struct inpcb *inp)
{
	INP_UNLOCK_ASSERT(inp);
}
2946 #endif
2947 
2948 void
2949 inp_apply_all(struct inpcbinfo *pcbinfo,
2950     void (*func)(struct inpcb *, void *), void *arg)
2951 {
2952 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2953 	    INPLOOKUP_WLOCKPCB);
2954 	struct inpcb *inp;
2955 
2956 	while ((inp = inp_next(&inpi)) != NULL)
2957 		func(inp, arg);
2958 }
2959 
2960 struct socket *
2961 inp_inpcbtosocket(struct inpcb *inp)
2962 {
2963 
2964 	INP_WLOCK_ASSERT(inp);
2965 	return (inp->inp_socket);
2966 }
2967 
2968 void
2969 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2970     uint32_t *faddr, uint16_t *fp)
2971 {
2972 
2973 	INP_LOCK_ASSERT(inp);
2974 	*laddr = inp->inp_laddr.s_addr;
2975 	*faddr = inp->inp_faddr.s_addr;
2976 	*lp = inp->inp_lport;
2977 	*fp = inp->inp_fport;
2978 }
2979 
2980 /*
2981  * Create an external-format (``xinpcb'') structure using the information in
2982  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2983  * reduce the spew of irrelevant information over this interface, to isolate
2984  * user code from changes in the kernel structure, and potentially to provide
2985  * information-hiding if we decide that some of this information should be
2986  * hidden from users.
2987  */
2988 void
2989 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2990 {
2991 
2992 	bzero(xi, sizeof(*xi));
2993 	xi->xi_len = sizeof(struct xinpcb);
2994 	if (inp->inp_socket)
2995 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
2996 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2997 	xi->inp_gencnt = inp->inp_gencnt;
2998 	xi->inp_flow = inp->inp_flow;
2999 	xi->inp_flowid = inp->inp_flowid;
3000 	xi->inp_flowtype = inp->inp_flowtype;
3001 	xi->inp_flags = inp->inp_flags;
3002 	xi->inp_flags2 = inp->inp_flags2;
3003 	xi->in6p_cksum = inp->in6p_cksum;
3004 	xi->in6p_hops = inp->in6p_hops;
3005 	xi->inp_ip_tos = inp->inp_ip_tos;
3006 	xi->inp_vflag = inp->inp_vflag;
3007 	xi->inp_ip_ttl = inp->inp_ip_ttl;
3008 	xi->inp_ip_p = inp->inp_ip_p;
3009 	xi->inp_ip_minttl = inp->inp_ip_minttl;
3010 }
3011 
3012 int
3013 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
3014     int (*ctloutput_set)(struct inpcb *, struct sockopt *))
3015 {
3016 	struct sockopt sopt;
3017 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
3018 	    INPLOOKUP_WLOCKPCB);
3019 	struct inpcb *inp;
3020 	struct sockopt_parameters *params;
3021 	struct socket *so;
3022 	int error;
3023 	char buf[1024];
3024 
3025 	if (req->oldptr != NULL || req->oldlen != 0)
3026 		return (EINVAL);
3027 	if (req->newptr == NULL)
3028 		return (EPERM);
3029 	if (req->newlen > sizeof(buf))
3030 		return (ENOMEM);
3031 	error = SYSCTL_IN(req, buf, req->newlen);
3032 	if (error != 0)
3033 		return (error);
3034 	if (req->newlen < sizeof(struct sockopt_parameters))
3035 		return (EINVAL);
3036 	params = (struct sockopt_parameters *)buf;
3037 	sopt.sopt_level = params->sop_level;
3038 	sopt.sopt_name = params->sop_optname;
3039 	sopt.sopt_dir = SOPT_SET;
3040 	sopt.sopt_val = params->sop_optval;
3041 	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
3042 	sopt.sopt_td = NULL;
3043 #ifdef INET6
3044 	if (params->sop_inc.inc_flags & INC_ISIPV6) {
3045 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
3046 			params->sop_inc.inc6_laddr.s6_addr16[1] =
3047 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
3048 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
3049 			params->sop_inc.inc6_faddr.s6_addr16[1] =
3050 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
3051 	}
3052 #endif
3053 	if (params->sop_inc.inc_lport != htons(0) &&
3054 	    params->sop_inc.inc_fport != htons(0)) {
3055 #ifdef INET6
3056 		if (params->sop_inc.inc_flags & INC_ISIPV6)
3057 			inpi.hash = INP6_PCBHASH(
3058 			    &params->sop_inc.inc6_faddr,
3059 			    params->sop_inc.inc_lport,
3060 			    params->sop_inc.inc_fport,
3061 			    pcbinfo->ipi_hashmask);
3062 		else
3063 #endif
3064 			inpi.hash = INP_PCBHASH(
3065 			    &params->sop_inc.inc_faddr,
3066 			    params->sop_inc.inc_lport,
3067 			    params->sop_inc.inc_fport,
3068 			    pcbinfo->ipi_hashmask);
3069 	}
3070 	while ((inp = inp_next(&inpi)) != NULL)
3071 		if (inp->inp_gencnt == params->sop_id) {
3072 			/*
3073 			 * XXXGL
3074 			 * 1) the inp_next() that ignores INP_UNCONNECTED needs
3075 			 * to be generally supported.
3076 			 * 2) Why do we ECONNRESET instead of continueing?
3077 			 */
3078 			if (inp->inp_flags & INP_UNCONNECTED) {
3079 				INP_WUNLOCK(inp);
3080 				return (ECONNRESET);
3081 			}
3082 			so = inp->inp_socket;
3083 			KASSERT(so != NULL, ("inp_socket == NULL"));
3084 			soref(so);
3085 			if (params->sop_level == SOL_SOCKET) {
3086 				INP_WUNLOCK(inp);
3087 				error = sosetopt(so, &sopt);
3088 			} else
3089 				error = (*ctloutput_set)(inp, &sopt);
3090 			sorele(so);
3091 			break;
3092 		}
3093 	if (inp == NULL)
3094 		error = ESRCH;
3095 	return (error);
3096 }
3097 
3098 #ifdef DDB
/*
 * Emit "indent" space characters on the debugger console; a zero or
 * negative count prints nothing.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
3107 
3108 static void
3109 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3110 {
3111 	char faddr_str[48], laddr_str[48];
3112 
3113 	db_print_indent(indent);
3114 	db_printf("%s at %p\n", name, inc);
3115 
3116 	indent += 2;
3117 
3118 #ifdef INET6
3119 	if (inc->inc_flags & INC_ISIPV6) {
3120 		/* IPv6. */
3121 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
3122 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
3123 	} else
3124 #endif
3125 	{
3126 		/* IPv4. */
3127 		inet_ntoa_r(inc->inc_laddr, laddr_str);
3128 		inet_ntoa_r(inc->inc_faddr, faddr_str);
3129 	}
3130 	db_print_indent(indent);
3131 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
3132 	    ntohs(inc->inc_lport));
3133 	db_print_indent(indent);
3134 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
3135 	    ntohs(inc->inc_fport));
3136 }
3137 
3138 void
3139 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
3140 {
3141 
3142 	db_print_indent(indent);
3143 	db_printf("%s at %p\n", name, inp);
3144 
3145 	indent += 2;
3146 
3147 	db_print_indent(indent);
3148 	db_printf("inp_flow: 0x%x   inp_label: %p\n", inp->inp_flow,
3149 	    inp->inp_label);
3150 
3151 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
3152 
3153 	db_print_indent(indent);
3154 	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);
3155 
3156 	db_print_indent(indent);
3157 	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);
3158 
3159 	db_print_indent(indent);
3160 	db_printf("inp_sp: %p   inp_vflag: 0x%b\n", inp->inp_sp,
3161 	    inp->inp_vflag, INP_VFLAGS_BITS);
3162 
3163 	db_print_indent(indent);
3164 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
3165 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
3166 
3167 #ifdef INET6
3168 	if (inp->inp_vflag & INP_IPV6) {
3169 		db_print_indent(indent);
3170 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
3171 		    "in6p_moptions: %p\n", inp->in6p_options,
3172 		    inp->in6p_outputopts, inp->in6p_moptions);
3173 		db_print_indent(indent);
3174 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
3175 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
3176 		    inp->in6p_hops);
3177 	} else
3178 #endif
3179 	{
3180 		db_print_indent(indent);
3181 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
3182 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
3183 		    inp->inp_options, inp->inp_moptions);
3184 	}
3185 
3186 	db_print_indent(indent);
3187 	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
3188 }
3189 
3190 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3191 {
3192 	struct inpcb *inp;
3193 
3194 	if (!have_addr) {
3195 		db_printf("usage: show inpcb <addr>\n");
3196 		return;
3197 	}
3198 	inp = (struct inpcb *)addr;
3199 
3200 	db_print_inpcb(inp, "inpcb", 0);
3201 }
3202 #endif /* DDB */
3203 
3204 #ifdef RATELIMIT
3205 /*
3206  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3207  * if any.
3208  */
3209 int
3210 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3211 {
3212 	union if_snd_tag_modify_params params = {
3213 		.rate_limit.max_rate = max_pacing_rate,
3214 		.rate_limit.flags = M_NOWAIT,
3215 	};
3216 	struct m_snd_tag *mst;
3217 	int error;
3218 
3219 	mst = inp->inp_snd_tag;
3220 	if (mst == NULL)
3221 		return (EINVAL);
3222 
3223 	if (mst->sw->snd_tag_modify == NULL) {
3224 		error = EOPNOTSUPP;
3225 	} else {
3226 		error = mst->sw->snd_tag_modify(mst, &params);
3227 	}
3228 	return (error);
3229 }
3230 
3231 /*
3232  * Query existing TX rate limit based on the existing
3233  * "inp->inp_snd_tag", if any.
3234  */
3235 int
3236 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3237 {
3238 	union if_snd_tag_query_params params = { };
3239 	struct m_snd_tag *mst;
3240 	int error;
3241 
3242 	mst = inp->inp_snd_tag;
3243 	if (mst == NULL)
3244 		return (EINVAL);
3245 
3246 	if (mst->sw->snd_tag_query == NULL) {
3247 		error = EOPNOTSUPP;
3248 	} else {
3249 		error = mst->sw->snd_tag_query(mst, &params);
3250 		if (error == 0 && p_max_pacing_rate != NULL)
3251 			*p_max_pacing_rate = params.rate_limit.max_rate;
3252 	}
3253 	return (error);
3254 }
3255 
3256 /*
3257  * Query existing TX queue level based on the existing
3258  * "inp->inp_snd_tag", if any.
3259  */
3260 int
3261 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3262 {
3263 	union if_snd_tag_query_params params = { };
3264 	struct m_snd_tag *mst;
3265 	int error;
3266 
3267 	mst = inp->inp_snd_tag;
3268 	if (mst == NULL)
3269 		return (EINVAL);
3270 
3271 	if (mst->sw->snd_tag_query == NULL)
3272 		return (EOPNOTSUPP);
3273 
3274 	error = mst->sw->snd_tag_query(mst, &params);
3275 	if (error == 0 && p_txqueue_level != NULL)
3276 		*p_txqueue_level = params.rate_limit.queue_level;
3277 	return (error);
3278 }
3279 
3280 /*
3281  * Allocate a new TX rate limit send tag from the network interface
3282  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3283  */
3284 int
3285 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3286     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3287 
3288 {
3289 	union if_snd_tag_alloc_params params = {
3290 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3291 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3292 		.rate_limit.hdr.flowid = flowid,
3293 		.rate_limit.hdr.flowtype = flowtype,
3294 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3295 		.rate_limit.max_rate = max_pacing_rate,
3296 		.rate_limit.flags = M_NOWAIT,
3297 	};
3298 	int error;
3299 
3300 	INP_WLOCK_ASSERT(inp);
3301 
3302 	/*
3303 	 * If there is already a send tag, or the INP is being torn
3304 	 * down, allocating a new send tag is not allowed. Else send
3305 	 * tags may leak.
3306 	 */
3307 	if (*st != NULL || (inp->inp_flags & INP_UNCONNECTED))
3308 		return (EINVAL);
3309 
3310 	error = m_snd_tag_alloc(ifp, &params, st);
3311 #ifdef INET
3312 	if (error == 0) {
3313 		counter_u64_add(rate_limit_set_ok, 1);
3314 		counter_u64_add(rate_limit_active, 1);
3315 	} else if (error != EOPNOTSUPP)
3316 		  counter_u64_add(rate_limit_alloc_fail, 1);
3317 #endif
3318 	return (error);
3319 }
3320 
/*
 * Release one reference on send tag "mst" and, when INET is configured,
 * decrement the count of active rate limits.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	/* Drop the tag reference first, then update the statistics. */
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3330 
3331 /*
3332  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3333  * if any:
3334  */
3335 void
3336 in_pcbdetach_txrtlmt(struct inpcb *inp)
3337 {
3338 	struct m_snd_tag *mst;
3339 
3340 	INP_WLOCK_ASSERT(inp);
3341 
3342 	mst = inp->inp_snd_tag;
3343 	inp->inp_snd_tag = NULL;
3344 
3345 	if (mst == NULL)
3346 		return;
3347 
3348 	m_snd_tag_rele(mst);
3349 #ifdef INET
3350 	counter_u64_add(rate_limit_active, -1);
3351 #endif
3352 }
3353 
3354 int
3355 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3356 {
3357 	int error;
3358 
3359 	/*
3360 	 * If the existing send tag is for the wrong interface due to
3361 	 * a route change, first drop the existing tag.  Set the
3362 	 * CHANGED flag so that we will keep trying to allocate a new
3363 	 * tag if we fail to allocate one this time.
3364 	 */
3365 	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3366 		in_pcbdetach_txrtlmt(inp);
3367 		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3368 	}
3369 
3370 	/*
3371 	 * NOTE: When attaching to a network interface a reference is
3372 	 * made to ensure the network interface doesn't go away until
3373 	 * all ratelimit connections are gone. The network interface
3374 	 * pointers compared below represent valid network interfaces,
3375 	 * except when comparing towards NULL.
3376 	 */
3377 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3378 		error = 0;
3379 	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3380 		if (inp->inp_snd_tag != NULL)
3381 			in_pcbdetach_txrtlmt(inp);
3382 		error = 0;
3383 	} else if (inp->inp_snd_tag == NULL) {
3384 		/*
3385 		 * In order to utilize packet pacing with RSS, we need
3386 		 * to wait until there is a valid RSS hash before we
3387 		 * can proceed:
3388 		 */
3389 		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3390 			error = EAGAIN;
3391 		} else {
3392 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3393 			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3394 		}
3395 	} else {
3396 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3397 	}
3398 	if (error == 0 || error == EOPNOTSUPP)
3399 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3400 
3401 	return (error);
3402 }
3403 
3404 /*
3405  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3406  * is set in the fast path and will attach/detach/modify the TX rate
3407  * limit send tag based on the socket's so_max_pacing_rate value.
3408  */
3409 void
3410 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3411 {
3412 	struct socket *socket;
3413 	uint32_t max_pacing_rate;
3414 	bool did_upgrade;
3415 
3416 	if (inp == NULL)
3417 		return;
3418 
3419 	socket = inp->inp_socket;
3420 	if (socket == NULL)
3421 		return;
3422 
3423 	if (!INP_WLOCKED(inp)) {
3424 		/*
3425 		 * NOTE: If the write locking fails, we need to bail
3426 		 * out and use the non-ratelimited ring for the
3427 		 * transmit until there is a new chance to get the
3428 		 * write lock.
3429 		 */
3430 		if (!INP_TRY_UPGRADE(inp))
3431 			return;
3432 		did_upgrade = 1;
3433 	} else {
3434 		did_upgrade = 0;
3435 	}
3436 
3437 	/*
3438 	 * NOTE: The so_max_pacing_rate value is read unlocked,
3439 	 * because atomic updates are not required since the variable
3440 	 * is checked at every mbuf we send. It is assumed that the
3441 	 * variable read itself will be atomic.
3442 	 */
3443 	max_pacing_rate = socket->so_max_pacing_rate;
3444 
3445 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3446 
3447 	if (did_upgrade)
3448 		INP_DOWNGRADE(inp);
3449 }
3450 
3451 /*
3452  * Track route changes for TX rate limiting.
3453  */
3454 void
3455 in_pcboutput_eagain(struct inpcb *inp)
3456 {
3457 	bool did_upgrade;
3458 
3459 	if (inp == NULL)
3460 		return;
3461 
3462 	if (inp->inp_snd_tag == NULL)
3463 		return;
3464 
3465 	if (!INP_WLOCKED(inp)) {
3466 		/*
3467 		 * NOTE: If the write locking fails, we need to bail
3468 		 * out and use the non-ratelimited ring for the
3469 		 * transmit until there is a new chance to get the
3470 		 * write lock.
3471 		 */
3472 		if (!INP_TRY_UPGRADE(inp))
3473 			return;
3474 		did_upgrade = 1;
3475 	} else {
3476 		did_upgrade = 0;
3477 	}
3478 
3479 	/* detach rate limiting */
3480 	in_pcbdetach_txrtlmt(inp);
3481 
3482 	/* make sure new mbuf send tag allocation is made */
3483 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3484 
3485 	if (did_upgrade)
3486 		INP_DOWNGRADE(inp);
3487 }
3488 
3489 #ifdef INET
3490 static void
3491 rl_init(void *st)
3492 {
3493 	rate_limit_new = counter_u64_alloc(M_WAITOK);
3494 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
3495 	rate_limit_active = counter_u64_alloc(M_WAITOK);
3496 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3497 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3498 }
3499 
3500 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3501 #endif
3502 #endif /* RATELIMIT */
3503