xref: /freebsd/sys/netinet/in_pcb.c (revision da806e8db685eead02bc67888b16ebac6badb6b6)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #include "opt_ddb.h"
41 #include "opt_ipsec.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ratelimit.h"
45 #include "opt_route.h"
46 #include "opt_rss.h"
47 
48 #include <sys/param.h>
49 #include <sys/hash.h>
50 #include <sys/systm.h>
51 #include <sys/libkern.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/eventhandler.h>
56 #include <sys/domain.h>
57 #include <sys/proc.h>
58 #include <sys/protosw.h>
59 #include <sys/smp.h>
60 #include <sys/smr.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <sys/priv.h>
65 #include <sys/proc.h>
66 #include <sys/refcount.h>
67 #include <sys/jail.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74 
75 #include <vm/uma.h>
76 #include <vm/vm.h>
77 
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/if_private.h>
81 #include <net/if_types.h>
82 #include <net/if_llatbl.h>
83 #include <net/route.h>
84 #include <net/rss_config.h>
85 #include <net/vnet.h>
86 
87 #if defined(INET) || defined(INET6)
88 #include <netinet/in.h>
89 #include <netinet/in_pcb.h>
90 #include <netinet/in_pcb_var.h>
91 #include <netinet/tcp.h>
92 #ifdef INET
93 #include <netinet/in_var.h>
94 #include <netinet/in_fib.h>
95 #endif
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/in6_pcb.h>
100 #include <netinet6/in6_var.h>
101 #include <netinet6/ip6_var.h>
102 #endif /* INET6 */
103 #include <net/route/nhop.h>
104 #endif
105 
106 #include <netipsec/ipsec_support.h>
107 
108 #include <security/mac/mac_framework.h>
109 
#define	INPCBLBGROUP_SIZMIN	8	/* initial slots in an lb group */
#define	INPCBLBGROUP_SIZMAX	256	/* hard cap on lb group slots */

/*
 * Private inp_flags bits used within this file.
 * NOTE(review): these values must stay in sync with the inp_flags
 * layout in netinet/in_pcb.h -- confirm when changing either.
 */
#define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
#define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Enable random ephemeral port allocation by default. */
VNET_DEFINE(int, ipport_randomized) = 1;
137 
138 #ifdef INET
139 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
140 			    struct in_addr faddr, u_int fport_arg,
141 			    struct in_addr laddr, u_int lport_arg,
142 			    int lookupflags, uint8_t numa_domain, int fib);
143 
144 #define RANGECHK(var, min, max) \
145 	if ((var) < (min)) { (var) = (min); } \
146 	else if ((var) > (max)) { (var) = (max); }
147 
148 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
150 {
151 	int error;
152 
153 	error = sysctl_handle_int(oidp, arg1, arg2, req);
154 	if (error == 0) {
155 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
156 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
157 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
158 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
159 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
160 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
161 	}
162 	return (error);
163 }
164 
165 #undef RANGECHK
166 
167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
168     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
169     "IP Ports");
170 
171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
172     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
173     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
174     "");
175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
176     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
177     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
178     "");
179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
180     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
181     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
182     "");
183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
184     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
185     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
186     "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
188     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
189     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
190     "");
191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
192     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
193     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
194     "");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
196 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
197 	&VNET_NAME(ipport_reservedhigh), 0, "");
198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
199 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
201 	CTLFLAG_VNET | CTLFLAG_RW,
202 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
203 
204 #ifdef RATELIMIT
205 counter_u64_t rate_limit_new;
206 counter_u64_t rate_limit_chg;
207 counter_u64_t rate_limit_active;
208 counter_u64_t rate_limit_alloc_fail;
209 counter_u64_t rate_limit_set_ok;
210 
211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
212     "IP Rate Limiting");
213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
214     &rate_limit_active, "Active rate limited connections");
215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
216    &rate_limit_alloc_fail, "Rate limited connection failures");
217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
218    &rate_limit_set_ok, "Rate limited setting succeeded");
219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
220    &rate_limit_new, "Total Rate limit new attempts");
221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
222    &rate_limit_chg, "Total Rate limited change attempts");
223 #endif /* RATELIMIT */
224 
225 #endif /* INET */
226 
VNET_DEFINE(uint32_t, in_pcbhashseed);
/*
 * Seed the per-VNET PCB hashing at VNET start-up so hash chain
 * placement is not predictable to remote parties.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
236 
237 #ifdef INET
238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
239 #define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
241     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
242     "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
243 #endif
244 
245 static void in_pcbremhash(struct inpcb *);
246 
247 /*
248  * in_pcb.c: manage the Protocol Control Blocks.
249  *
250  * NOTE: It is assumed that most of these functions will be called with
251  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
252  * functions often modify hash chains or addresses in pcbs.
253  */
254 
/*
 * Allocate a new load balance group keyed by (cred, vflag, port, local
 * address, numa domain, fib) with room for 'size' PCB slots in the
 * flexible array at the structure's tail.  Returns NULL if the
 * M_NOWAIT allocation fails.  The group takes its own reference on
 * 'cred'.
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
    const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	/* Header plus 'size' inpcb pointer slots. */
	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (grp == NULL)
		return (NULL);
	LIST_INIT(&grp->il_pending);
	grp->il_cred = crhold(cred);	/* dropped in free_deferred() */
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_fibnum = fib;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	return (grp);
}
276 
/*
 * Epoch callback: release a load balance group once all network-epoch
 * readers have drained.  Drops the cred reference taken in
 * in_pcblbgroup_alloc() and frees the group memory.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	crfree(grp->il_cred);
	free(grp, M_PCB);
}
286 
/*
 * Unlink a load balance group from its hash chain and schedule the
 * actual release for after the network epoch drains, so concurrent
 * lock-free lookups never see freed memory.  The group must have no
 * pending (pre-listen) PCBs left on it.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
296 
/*
 * Find the load balance group that 'inp' belongs to, checking both the
 * active slots and the pending list of every group on the PCB's port
 * hash chain.  Returns NULL when the PCB is in no group: the outer
 * CK_LIST_FOREACH terminates with grp == NULL, which falls through to
 * the 'found' label.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		/* Active (lookup-visible) slots first ... */
		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		/* ... then sockets still waiting for listen(). */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	return (grp);
}
326 
/*
 * Add 'inp' to a load balance group that has a free slot.  TCP sockets
 * that have not yet called listen() go on the group's pending list so
 * lb lookups do not select them; everything else is published in the
 * il_inp[] array, with the count stored using release semantics for
 * the benefit of lock-free readers.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
	} else {
		/* Fill the slot before publishing the new count. */
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
354 
/*
 * Replace 'old_grp' with a copy sized for 'size' slots on hash chain
 * 'hdr'.  Active PCBs are copied, the pending list is moved over, and
 * the old group is scheduled for epoch-deferred release.  Returns the
 * new group, or NULL (leaving the old group untouched) if allocation
 * fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	/*
	 * Move (not copy) the pending PCBs: old_grp must be empty for
	 * the KASSERT in in_pcblbgroup_free() below.
	 */
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	in_pcblbgroup_free(old_grp);
	return (grp);
}
381 
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Finds an existing group with identical binding parameters or creates
 * one, growing (doubling) a full group up to INPCBLBGROUP_SIZMAX.
 * Returns 0 on success or ENOBUFS on allocation failure.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* Only pin the group to a FIB if the socket itself is pinned. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	/* Look for an existing group with identical binding parameters. */
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOBUFS);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/*
			 * Hard cap reached: rate-limit the console message
			 * and report success.  NOTE(review): the PCB is
			 * silently left out of the group in this case --
			 * confirm callers expect that.
			 */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
456 
/*
 * Remove PCB from load balance group.
 *
 * The PCB must be in a group (INP_INLBGROUP is asserted).  An active
 * slot is removed by swapping in the last slot and publishing the
 * shrunken count with release semantics; a pending PCB is simply
 * unlinked.  The last PCB out frees the whole group.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		/* First search the active slots of this group. */
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Fill the hole with the last slot. */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		/* Then the pending (pre-listen) list. */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP guarantees we found the PCB above. */
	__assert_unreachable();
}
509 
510 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)511 in_pcblbgroup_numa(struct inpcb *inp, int arg)
512 {
513 	struct inpcbinfo *pcbinfo;
514 	int error;
515 	uint8_t numa_domain;
516 
517 	switch (arg) {
518 	case TCP_REUSPORT_LB_NUMA_NODOM:
519 		numa_domain = M_NODOM;
520 		break;
521 	case TCP_REUSPORT_LB_NUMA_CURDOM:
522 		numa_domain = PCPU_GET(domain);
523 		break;
524 	default:
525 		if (arg < 0 || arg >= vm_ndomains)
526 			return (EINVAL);
527 		numa_domain = arg;
528 	}
529 
530 	pcbinfo = inp->inp_pcbinfo;
531 	INP_WLOCK_ASSERT(inp);
532 	INP_HASH_WLOCK(pcbinfo);
533 	if (in_pcblbgroup_find(inp) != NULL) {
534 		/* Remove it from the old group. */
535 		in_pcbremlbgrouphash(inp);
536 		/* Add it to the new group based on numa domain. */
537 		in_pcbinslbgrouphash(inp, numa_domain);
538 		error = 0;
539 	} else {
540 		error = ENOENT;
541 	}
542 	INP_HASH_WUNLOCK(pcbinfo);
543 	return (error);
544 }
545 
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));

/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/*
	 * The exact and wild tables are both sized from hash_nelements,
	 * so the shared ipi_hashmask (written twice with the same value)
	 * covers both.
	 */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* No point in more port buckets than there are ports. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
578 
/*
 * Destroy an inpcbinfo.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	/* Every PCB must be gone before the info block can go away. */
	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
598 
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* The port zone shares the pcb zone's SMR domain. */
	uma_zone_set_smr(pcbstor->ips_portzone,
	    uma_zone_get_smr(pcbstor->ips_zone));
}
616 
/*
 * Destroy a pcbstorage - used by unloadable protocols.
 * Tears down both UMA zones created by in_pcbstorage_init().
 */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	uma_zdestroy(pcbstor->ips_zone);
	uma_zdestroy(pcbstor->ips_portzone);
}
628 
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 * Returns ENOBUFS on allocation failure, or an error from the MAC or
 * IPsec per-PCB initialization; on those paths the PCB is returned to
 * the zone and the cred reference is dropped.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/*
	 * Zero the inpcb from inp_start_zero onward; the leading fields
	 * are presumably preserved across SMR zone reuse -- see the
	 * inpcb layout in netinet/in_pcb.h.
	 */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Error unwind: drop the cred hold and return the inp to the zone. */
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
715 
#ifdef INET
/*
 * Bind 'inp' to the address/port in 'sin' (anonymous port selection
 * when 'sin' is NULL or carries port 0), then insert it into the hash
 * tables.  On hash insertion failure the binding is rolled back and
 * EAGAIN is returned.  Returns 0 on success or an errno value.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Rebinding an already-bound PCB is not allowed. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error)
		return (error);
	if (in_pcbinshash(inp) != 0) {
		/* Undo the address/port chosen by in_pcbbind_setup(). */
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (EAGAIN);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif
748 
#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 */
int
in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
    struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/* Pick the sysctl-configured range matching the socket options. */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last  = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		/* Low (reserved) ports require privilege. */
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last  = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last  = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;
	lport = *lportp;

	/*
	 * NOTE(review): if a sysctl ever sets first == last, the modulus
	 * below is taken over zero -- confirm sysctl_net_ipport_check()
	 * cannot produce a single-port range.
	 */
	if (V_ipport_randomized)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
			/*
			 * Four-tuple check: the port only needs to be free
			 * with respect to this specific foreign endpoint.
			 * NOTE(review): faddr/faddr6 are initialized above
			 * only when the address family matches inp_vflag;
			 * relies on callers passing a consistent fsa.
			 */
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    M_NODOM, RT_ALL_FIBS);
			}
#endif
		} else {
			/* No foreign endpoint: port must be fully free. */
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
				    lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, RT_ALL_FIBS,
					    lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, RT_ALL_FIBS, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}
888 
889 /*
890  * Select a local port (number) to use.
891  */
892 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)893 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
894     struct ucred *cred, int lookupflags)
895 {
896 	struct sockaddr_in laddr;
897 
898 	if (laddrp) {
899 		bzero(&laddr, sizeof(laddr));
900 		laddr.sin_family = AF_INET;
901 		laddr.sin_addr = *laddrp;
902 	}
903 	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
904 	    NULL, lportp, NULL, 0, cred, lookupflags));
905 }
906 #endif /* INET || INET6 */
907 
#ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 * Returns 0 when the binding is permitted, or EADDRNOTAVAIL, EACCES or
 * EADDRINUSE describing why it is not.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved range requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			     in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		/* Conflict unless the holder also set a SO_REUSE* option. */
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/* A v4 wildcard may coexist with a v6-only binding. */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
1001 
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB.  Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* An explicit address may only be given for an unbound pcb. */
	laddr.s_addr = *laddrp;
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/* Without any SO_REUSE* option, conflict lookups use wildcards. */
	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		/* Wildcard bind: only the jail address check applies. */
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		/* Optionally restrict conflict checks to the pcb's own FIB. */
		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1076 
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If we don't have a local address for this socket yet,
 * then pick one.
 *
 * Returns 0 on success, or an errno value (including EAGAIN if the
 * freshly-bound pcb could not be inserted into the hash).
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	/* Remember whether the local port is being implicitly allocated. */
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport,
	    cred);
	if (error)
		return (error);

	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/* Hash insertion failed: roll back all 4-tuple state. */
			inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
			    INADDR_ANY;
			inp->inp_lport = inp->inp_fport = 0;
			return (EAGAIN);
		}
	} else {
		/* Already (partially) bound: update and (re)hash. */
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if ((inp->inp_flags & INP_INHASHLIST) != 0)
			in_pcbrehash(inp);
		else
			in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1129 
/*
 * Do proper source address selection on an unbound socket in case
 * of connect.  Take jails into account as well.
 *
 * On success, *laddr holds the chosen source address.  Returns 0, or
 * ENETUNREACH/EHOSTUNREACH (or a prison_get_ip4() error) on failure.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	/* Build a destination sockaddr for the interface lookups below. */
	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, use the interface address directly. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed: look for an address on this ifp owned by the jail. */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Look for a jail-owned address on that interface. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Selecting the unspecified address counts as failure. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1340 
/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port.  These are not updated in the error case.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* A zero destination port is never valid for connect. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/* Precompute the multipath flow id for this 4-tuple. */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	} else if (faddr.s_addr == INADDR_ANY) {
		return (ENETUNREACH);
	}
	if (laddr.s_addr == INADDR_ANY) {
		/* No local address yet: run source address selection. */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/* Ensure the resulting 4-tuple is not already in use. */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM, RT_ALL_FIBS) != NULL)
			return (EADDRINUSE);
	} else {
		struct sockaddr_in lsin, fsin;

		/* No local port yet: allocate one for this destination. */
		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}
1471 
/*
 * Disconnect a pcb: remove it from the connection hash and clear its local
 * and foreign addresses and the foreign port.  The local port is retained.
 * The hash removal must come first so that lockless lookups stop finding
 * this pcb; inp_smr is then advanced (see the comment in in_pcbinshash()).
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1489 #endif /* INET */
1490 
1491 void
in_pcblisten(struct inpcb * inp)1492 in_pcblisten(struct inpcb *inp)
1493 {
1494 	struct inpcblbgroup *grp;
1495 
1496 	INP_WLOCK_ASSERT(inp);
1497 
1498 	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
1499 		struct inpcbinfo *pcbinfo;
1500 
1501 		pcbinfo = inp->inp_pcbinfo;
1502 		INP_HASH_WLOCK(pcbinfo);
1503 		grp = in_pcblbgroup_find(inp);
1504 		LIST_REMOVE(inp, inp_lbgroup_list);
1505 		in_pcblbgroup_insert(grp, inp);
1506 		INP_HASH_WUNLOCK(pcbinfo);
1507 	}
1508 }
1509 
1510 /*
1511  * inpcb hash lookups are protected by SMR section.
1512  *
1513  * Once desired pcb has been found, switching from SMR section to a pcb
1514  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1515  * here because SMR is a critical section.
1516  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1517  */
1518 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1519 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1520 {
1521 
1522 	lock == INPLOOKUP_RLOCKPCB ?
1523 	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1524 }
1525 
1526 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1527 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1528 {
1529 
1530 	lock == INPLOOKUP_RLOCKPCB ?
1531 	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1532 }
1533 
1534 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1535 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1536 {
1537 
1538 	return (lock == INPLOOKUP_RLOCKPCB ?
1539 	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1540 }
1541 
/*
 * Transition from an SMR section to holding the pcb lock, skipping pcbs
 * whose inp_flags intersect 'ignflags'.  On return the SMR section has
 * been exited in all cases; returns true with the pcb locked, or false
 * with no lock held (pcb was being freed, or carries an ignored flag).
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: got the lock without leaving the SMR section. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/* Slow path: pin the pcb with a reference, then block on the lock. */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1580 
1581 bool
inp_smr_lock(struct inpcb * inp,const inp_lookup_t lock)1582 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1583 {
1584 
1585 	/*
1586 	 * in_pcblookup() family of functions ignore not only freed entries,
1587 	 * that may be found due to lockless access to the hash, but dropped
1588 	 * entries, too.
1589 	 */
1590 	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
1591 }
1592 
/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited SMR;
				 * re-enter and resume from the list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: unlock the previous pcb and report NULL. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand-over-hand: release the previous pcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1730 
1731 /*
1732  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1733  * stability of an inpcb pointer despite the inpcb lock being released or
1734  * SMR section exited.
1735  *
1736  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1737  */
1738 void
in_pcbref(struct inpcb * inp)1739 in_pcbref(struct inpcb *inp)
1740 {
1741 	u_int old __diagused;
1742 
1743 	old = refcount_acquire(&inp->inp_refcount);
1744 	KASSERT(old > 0, ("%s: refcount 0", __func__));
1745 }
1746 
1747 /*
1748  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1749  * freeing the pcb, if the reference was very last.
1750  */
1751 bool
in_pcbrele_rlocked(struct inpcb * inp)1752 in_pcbrele_rlocked(struct inpcb *inp)
1753 {
1754 
1755 	INP_RLOCK_ASSERT(inp);
1756 
1757 	if (!refcount_release(&inp->inp_refcount))
1758 		return (false);
1759 
1760 	MPASS(inp->inp_flags & INP_FREED);
1761 	MPASS(inp->inp_socket == NULL);
1762 	crfree(inp->inp_cred);
1763 #ifdef INVARIANTS
1764 	inp->inp_cred = NULL;
1765 #endif
1766 	INP_RUNLOCK(inp);
1767 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1768 	return (true);
1769 }
1770 
1771 bool
in_pcbrele_wlocked(struct inpcb * inp)1772 in_pcbrele_wlocked(struct inpcb *inp)
1773 {
1774 
1775 	INP_WLOCK_ASSERT(inp);
1776 
1777 	if (!refcount_release(&inp->inp_refcount))
1778 		return (false);
1779 
1780 	MPASS(inp->inp_flags & INP_FREED);
1781 	MPASS(inp->inp_socket == NULL);
1782 	crfree(inp->inp_cred);
1783 #ifdef INVARIANTS
1784 	inp->inp_cred = NULL;
1785 #endif
1786 	INP_WUNLOCK(inp);
1787 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
1788 	return (true);
1789 }
1790 
1791 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1792 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1793 {
1794 
1795 	return (lock == INPLOOKUP_RLOCKPCB ?
1796 	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1797 }
1798 
/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Remove from the global pcb list and bump the generation count. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark freed and break the pcb <-> socket linkage. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Multicast options are freed only after the pcb lock is dropped. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/* Drop our reference; unlock manually only if the pcb survived. */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1880 
1881 /*
1882  * Different protocols initialize their inpcbs differently - giving
1883  * different name to the lock.  But they all are disposed the same.
1884  */
static void
inpcb_fini(void *mem, int size)
{
	/* UMA zone fini hook: tear down the per-pcb lock. */
	INP_LOCK_DESTROY((struct inpcb *)mem);
}
1892 
1893 /*
1894  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1895  * port reservation, and preventing it from being returned by inpcb lookups.
1896  *
1897  * It is used by TCP to mark an inpcb as unused and avoid future packet
1898  * delivery or event notification when a socket remains open but TCP has
1899  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1900  * or a RST on the wire, and allows the port binding to be reused while still
1901  * maintaining the invariant that so_pcb always points to a valid inpcb until
1902  * in_pcbdetach().
1903  *
1904  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1905  * in_pcbpurgeif0()?
1906  */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/*
	 * Mark the inpcb as dropped; lookup and protocol code test this
	 * flag and will no longer deliver packets or events to it.
	 */
	inp->inp_flags |= INP_DROPPED;
	/* Remove it from the lookup hashes so it can't be found again. */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1917 
1918 #ifdef INET
1919 /*
1920  * Common routines to return the socket addresses associated with inpcbs.
1921  */
1922 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1923 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1924 {
1925 	struct inpcb *inp;
1926 
1927 	inp = sotoinpcb(so);
1928 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1929 
1930 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1931 		.sin_len = sizeof(struct sockaddr_in),
1932 		.sin_family = AF_INET,
1933 		.sin_port = inp->inp_lport,
1934 		.sin_addr = inp->inp_laddr,
1935 	};
1936 
1937 	return (0);
1938 }
1939 
1940 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1941 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1942 {
1943 	struct inpcb *inp;
1944 
1945 	inp = sotoinpcb(so);
1946 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1947 
1948 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1949 		.sin_len = sizeof(struct sockaddr_in),
1950 		.sin_family = AF_INET,
1951 		.sin_port = inp->inp_fport,
1952 		.sin_addr = inp->inp_faddr,
1953 	};
1954 
1955 	return (0);
1956 }
1957 
1958 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1959 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1960 {
1961 
1962 	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1963 		return (true);
1964 	else
1965 		return (false);
1966 }
1967 
/*
 * Purge references to a detaching interface from the IPv4 multicast state
 * of every inpcb in 'pcbinfo' that has multicast options.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	/* Visit each IPv4 inpcb with multicast options, write-locked. */
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			/* The list was mutated; restart iteration. */
			goto restart;
		}
	}
}
2010 
2011 /*
2012  * Lookup a PCB based on the local address and port.  Caller must hold the
2013  * hash lock.  No inpcb locks or references are acquired.
2014  */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	/* Worst acceptable score: two wildcard mismatches + mapped cost. */
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.  Lower "wildcard" score means a better match.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
				if (fib != RT_ALL_FIBS &&
				    inp->inp_inc.inc_fibnum != fib)
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					/* A score of 0 cannot be beaten. */
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2137 
2138 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2139 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2140 {
2141 	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2142 	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2143 }
2144 
/*
 * Look up a SO_REUSEPORT_LB group matching the given tuple and select one
 * member inpcb by hashing the foreign endpoint.  Returns NULL if no group
 * matches or the selected group has become empty.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Skip jailed groups whose jail doesn't own 'laddr'. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best candidate in preference order. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	/* Hash the foreign endpoint to spread load across group members. */
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2228 
2229 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2230 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2231     u_short fport, struct in_addr laddr, u_short lport)
2232 {
2233 #ifdef INET6
2234 	/* XXX inp locking */
2235 	if ((inp->inp_vflag & INP_IPV4) == 0)
2236 		return (false);
2237 #endif
2238 	if (inp->inp_faddr.s_addr == faddr.s_addr &&
2239 	    inp->inp_laddr.s_addr == laddr.s_addr &&
2240 	    inp->inp_fport == fport &&
2241 	    inp->inp_lport == lport)
2242 		return (true);
2243 	return (false);
2244 }
2245 
/*
 * Walk the exact-match hash chain for the given 4-tuple and return the
 * first matching connected inpcb, or NULL.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2263 
/* Result of matching a wildcard-listening inpcb against a local endpoint. */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* no match */
	INPLOOKUP_MATCH_WILD = 1,	/* matched via wildcard local address */
	INPLOOKUP_MATCH_LADDR = 2,	/* matched the exact local address */
} inp_lookup_match_t;
2269 
/*
 * Classify how an unconnected inpcb matches the given local address,
 * port and FIB.  Distinguishes exact local-address matches from wildcard
 * (INADDR_ANY) matches so callers can prefer the former.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	/* Only unconnected PCBs on the right port are candidates. */
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}
2289 
2290 #define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)
2291 
/*
 * Lockless (SMR) wildcard lookup.  The wild hash chains are kept sorted by
 * match preference (see _in_pcbinshash_wild()), so the first match found is
 * the best one.  Returns the locked inpcb, NULL, or INP_LOOKUP_AGAIN if the
 * candidate vanished and the caller must retry with the hash lock held.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate under the inpcb lock. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2328 
/*
 * Wildcard lookup with the pcbinfo hash lock held.  Scans the whole chain
 * and applies the jail/exactness preference ordering explicitly.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* Skip jailed PCBs whose jail doesn't own 'laddr'. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed exact match already beats this one. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				/* Jailed + exact: best possible, stop here. */
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	/* Fall back in decreasing order of preference. */
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2401 
2402 /*
2403  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2404  * that the caller has either locked the hash list, which usually happens
2405  * for bind(2) operations, or is in SMR section, which happens when sorting
2406  * out incoming packets.
2407  */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* Connected (exact 4-tuple) sockets take precedence. */
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	/* Then load-balance groups, then plain wildcard listeners. */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
2439 
/*
 * Serialized lookup: performs the lookup under the pcbinfo hash write lock
 * and returns the inpcb locked per 'lookupflags', or NULL.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * The inpcb lock is contended: take a reference so the PCB
		 * can't be freed, drop the hash lock to avoid a lock-order
		 * reversal, then block on the inpcb lock.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2466 
/*
 * Lockless lookup entry point.  Runs inside an SMR read section and falls
 * back to the serialized in_pcblookup_hash() when a candidate cannot be
 * locked or revalidated.  On success the inpcb is returned locked; the SMR
 * section is exited in all cases (inp_smr_lock() exits it on success).
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us.  Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				/* Revalidate the LB group member. */
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	/* Not found: leave the SMR read section ourselves. */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2529 
2530 /*
2531  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2532  * from which a pre-calculated hash value may be extracted.
2533  */
2534 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2535 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2536     struct in_addr laddr, u_int lport, int lookupflags,
2537     struct ifnet *ifp)
2538 {
2539 	int fib;
2540 
2541 	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2542 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2543 	    lookupflags, M_NODOM, fib));
2544 }
2545 
2546 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2547 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2548     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2549     struct ifnet *ifp __unused, struct mbuf *m)
2550 {
2551 	int fib;
2552 
2553 	M_ASSERTPKTHDR(m);
2554 	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2555 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2556 	    lookupflags, m->m_pkthdr.numa_domain, fib));
2557 }
2558 #endif /* INET */
2559 
2560 static bool
in_pcbjailed(const struct inpcb * inp,unsigned int flag)2561 in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2562 {
2563 	return (prison_flag(inp->inp_cred, flag) != 0);
2564 }
2565 
2566 /*
2567  * Insert the PCB into a hash chain using ordering rules which ensure that
2568  * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
2569  *
2570  * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2571  * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
2572  * always appear last no matter whether they are jailed.
2573  */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/*
	 * An unbound v4-mapped v6 PCB ranks below everything else:
	 * append it at the very end of the chain.
	 */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the jailed PCBs, which stay in front. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCB, and the chain head is not jailed: go first. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs go after all exactly-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2626 
2627 #ifdef INET6
2628 /*
2629  * See the comment above _in_pcbinshash_wild().
2630  */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the jailed PCBs, which stay in front. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCB, and the chain head is not jailed: go first. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs go after all exactly-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2672 #endif
2673 
2674 /*
2675  * Insert PCB onto various hash lists.
2676  */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/*
	 * Compute the lookup hash and decide whether the PCB goes on the
	 * exact-match (connected) or wildcard (listening) chain.
	 */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/* Undo the LB group insertion on failure. */
			if ((inp->inp_flags & INP_INLBGROUP) != 0)
				in_pcbremlbgrouphash(inp);
			return (ENOMEM);
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);

	/*
	 * The PCB may have been disconnected in the past.  Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wildcard chains are kept ordered by match preference. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2772 
/*
 * Remove an inpcb from the lookup and port hashes; the caller holds the
 * pcbinfo hash write lock.  Also tears down LB group membership and frees
 * the port head when it becomes empty.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{
	struct inpcbport *phd = inp->inp_phd;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
	/*
	 * Unconnected PCBs live on the wildcard chain, connected ones on
	 * the exact chain; pick the right list link to unhook.
	 */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	/* Last PCB on this port: release the port head. */
	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
		CK_LIST_REMOVE(phd, phd_hash);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
	}
	inp->inp_flags &= ~INP_INHASHLIST;
}
2805 
2806 static void
in_pcbremhash(struct inpcb * inp)2807 in_pcbremhash(struct inpcb *inp)
2808 {
2809 	INP_HASH_WLOCK(inp->inp_pcbinfo);
2810 	in_pcbremhash_locked(inp);
2811 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2812 }
2813 
2814 /*
2815  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2816  * changed. NOTE: This does not handle the case of the lport changing (the
2817  * hashed port list would have to be updated as well), so the lport must
2818  * not change after in_pcbinshash() has been called.
2819  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* Recompute the hash from the (new) foreign address and ports. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.  So if it is connected now, it was
	 * on the wildcard chain before, and vice versa.
	 */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2865 
2866 /*
2867  * Check for alternatives when higher level complains
2868  * about service problems.  For now, invalidate cached
2869  * routing information.  If the route was created dynamically
2870  * (by a redirect), time to try a default gateway again.
2871  */
2872 void
in_losing(struct inpcb * inp)2873 in_losing(struct inpcb *inp)
2874 {
2875 
2876 	RO_INVALIDATE_CACHE(&inp->inp_route);
2877 	return;
2878 }
2879 
2880 /*
2881  * A set label operation has occurred at the socket layer, propagate the
2882  * label change into the in_pcb for the socket.
2883  */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/*
	 * Both the inpcb write lock and the socket lock are held across the
	 * label propagation; the inpcb lock is taken first.
	 */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2900 
/* Exported function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2907 
/* Exported function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2914 
/* Exported function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2921 
/* Exported function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2928 
2929 #ifdef INVARIANT_SUPPORT
/* Exported assertion that the inpcb write lock is held. */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2936 
/* Exported assertion that the inpcb lock is not held by this thread. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2943 #endif
2944 
2945 void
inp_apply_all(struct inpcbinfo * pcbinfo,void (* func)(struct inpcb *,void *),void * arg)2946 inp_apply_all(struct inpcbinfo *pcbinfo,
2947     void (*func)(struct inpcb *, void *), void *arg)
2948 {
2949 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2950 	    INPLOOKUP_WLOCKPCB);
2951 	struct inpcb *inp;
2952 
2953 	while ((inp = inp_next(&inpi)) != NULL)
2954 		func(inp, arg);
2955 }
2956 
/*
 * Return the socket associated with an inpcb; the caller must hold the
 * inpcb write lock.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2964 
2965 void
inp_4tuple_get(struct inpcb * inp,uint32_t * laddr,uint16_t * lp,uint32_t * faddr,uint16_t * fp)2966 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2967     uint32_t *faddr, uint16_t *fp)
2968 {
2969 
2970 	INP_LOCK_ASSERT(inp);
2971 	*laddr = inp->inp_laddr.s_addr;
2972 	*faddr = inp->inp_faddr.s_addr;
2973 	*lp = inp->inp_lport;
2974 	*fp = inp->inp_fport;
2975 }
2976 
2977 /*
2978  * Create an external-format (``xinpcb'') structure using the information in
2979  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2980  * reduce the spew of irrelevant information over this interface, to isolate
2981  * user code from changes in the kernel structure, and potentially to provide
2982  * information-hiding if we decide that some of this information should be
2983  * hidden from users.
2984  */
2985 void
in_pcbtoxinpcb(const struct inpcb * inp,struct xinpcb * xi)2986 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2987 {
2988 
2989 	bzero(xi, sizeof(*xi));
2990 	xi->xi_len = sizeof(struct xinpcb);
2991 	if (inp->inp_socket)
2992 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
2993 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2994 	xi->inp_gencnt = inp->inp_gencnt;
2995 	xi->inp_flow = inp->inp_flow;
2996 	xi->inp_flowid = inp->inp_flowid;
2997 	xi->inp_flowtype = inp->inp_flowtype;
2998 	xi->inp_flags = inp->inp_flags;
2999 	xi->inp_flags2 = inp->inp_flags2;
3000 	xi->in6p_cksum = inp->in6p_cksum;
3001 	xi->in6p_hops = inp->in6p_hops;
3002 	xi->inp_ip_tos = inp->inp_ip_tos;
3003 	xi->inp_vflag = inp->inp_vflag;
3004 	xi->inp_ip_ttl = inp->inp_ip_ttl;
3005 	xi->inp_ip_p = inp->inp_ip_p;
3006 	xi->inp_ip_minttl = inp->inp_ip_minttl;
3007 }
3008 
/*
 * Sysctl handler that applies a socket option to an existing PCB,
 * identified by its generation count (sop_id) inside the user-supplied
 * struct sockopt_parameters.  The handler is write-only: reads (oldptr)
 * are rejected.  Returns ESRCH if no matching PCB is found, ECONNRESET
 * if the PCB has been dropped, or the error from the set operation.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* Write-only interface: no old data may be requested. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	/* The parameters plus option value must fit in the local buffer. */
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	/* At minimum the fixed-size parameter header must be present. */
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	/* The option value is whatever follows the parameter header. */
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/*
		 * Embed the zone ID into link-local addresses to match the
		 * kernel-internal (KAME-style) address representation.
		 */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If both ports are specified, restrict the iteration to the one
	 * hash bucket the connection must be in; otherwise scan all PCBs.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Hold the socket across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3088 
3089 #ifdef DDB
/* DDB helper: emit "indent" spaces of leading whitespace. */
static void
db_print_indent(int indent)
{
	while (indent-- > 0)
		db_printf(" ");
}
3098 
/*
 * DDB helper: dump an in_conninfo, formatting the addresses as IPv6 or
 * IPv4 depending on the INC_ISIPV6 flag.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	/* 48 bytes is enough for a printable IPv6 address. */
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3128 
3129 static void
db_print_inpflags(int inp_flags)3130 db_print_inpflags(int inp_flags)
3131 {
3132 	int comma;
3133 
3134 	comma = 0;
3135 	if (inp_flags & INP_RECVOPTS) {
3136 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3137 		comma = 1;
3138 	}
3139 	if (inp_flags & INP_RECVRETOPTS) {
3140 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3141 		comma = 1;
3142 	}
3143 	if (inp_flags & INP_RECVDSTADDR) {
3144 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3145 		comma = 1;
3146 	}
3147 	if (inp_flags & INP_ORIGDSTADDR) {
3148 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3149 		comma = 1;
3150 	}
3151 	if (inp_flags & INP_HDRINCL) {
3152 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
3153 		comma = 1;
3154 	}
3155 	if (inp_flags & INP_HIGHPORT) {
3156 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3157 		comma = 1;
3158 	}
3159 	if (inp_flags & INP_LOWPORT) {
3160 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
3161 		comma = 1;
3162 	}
3163 	if (inp_flags & INP_ANONPORT) {
3164 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
3165 		comma = 1;
3166 	}
3167 	if (inp_flags & INP_RECVIF) {
3168 		db_printf("%sINP_RECVIF", comma ? ", " : "");
3169 		comma = 1;
3170 	}
3171 	if (inp_flags & INP_MTUDISC) {
3172 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
3173 		comma = 1;
3174 	}
3175 	if (inp_flags & INP_RECVTTL) {
3176 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
3177 		comma = 1;
3178 	}
3179 	if (inp_flags & INP_DONTFRAG) {
3180 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3181 		comma = 1;
3182 	}
3183 	if (inp_flags & INP_RECVTOS) {
3184 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
3185 		comma = 1;
3186 	}
3187 	if (inp_flags & IN6P_IPV6_V6ONLY) {
3188 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3189 		comma = 1;
3190 	}
3191 	if (inp_flags & IN6P_PKTINFO) {
3192 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3193 		comma = 1;
3194 	}
3195 	if (inp_flags & IN6P_HOPLIMIT) {
3196 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3197 		comma = 1;
3198 	}
3199 	if (inp_flags & IN6P_HOPOPTS) {
3200 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3201 		comma = 1;
3202 	}
3203 	if (inp_flags & IN6P_DSTOPTS) {
3204 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3205 		comma = 1;
3206 	}
3207 	if (inp_flags & IN6P_RTHDR) {
3208 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3209 		comma = 1;
3210 	}
3211 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
3212 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3213 		comma = 1;
3214 	}
3215 	if (inp_flags & IN6P_TCLASS) {
3216 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3217 		comma = 1;
3218 	}
3219 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
3220 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3221 		comma = 1;
3222 	}
3223 	if (inp_flags & INP_ONESBCAST) {
3224 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3225 		comma  = 1;
3226 	}
3227 	if (inp_flags & INP_DROPPED) {
3228 		db_printf("%sINP_DROPPED", comma ? ", " : "");
3229 		comma  = 1;
3230 	}
3231 	if (inp_flags & INP_SOCKREF) {
3232 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
3233 		comma  = 1;
3234 	}
3235 	if (inp_flags & IN6P_RFC2292) {
3236 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3237 		comma = 1;
3238 	}
3239 	if (inp_flags & IN6P_MTU) {
3240 		db_printf("IN6P_MTU%s", comma ? ", " : "");
3241 		comma = 1;
3242 	}
3243 }
3244 
3245 static void
db_print_inpvflag(u_char inp_vflag)3246 db_print_inpvflag(u_char inp_vflag)
3247 {
3248 	int comma;
3249 
3250 	comma = 0;
3251 	if (inp_vflag & INP_IPV4) {
3252 		db_printf("%sINP_IPV4", comma ? ", " : "");
3253 		comma  = 1;
3254 	}
3255 	if (inp_vflag & INP_IPV6) {
3256 		db_printf("%sINP_IPV6", comma ? ", " : "");
3257 		comma  = 1;
3258 	}
3259 	if (inp_vflag & INP_IPV6PROTO) {
3260 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3261 		comma  = 1;
3262 	}
3263 }
3264 
/*
 * DDB helper: dump the interesting fields of an inpcb, indenting nested
 * structures by two extra columns.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x\n", inp->inp_flow);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_label: %p   inp_flags: 0x%x (",
	   inp->inp_label, inp->inp_flags);
	db_print_inpflags(inp->inp_flags);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
	    inp->inp_vflag);
	db_print_inpvflag(inp->inp_vflag);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	db_print_indent(indent);
	/* Protocol-dependent fields: IPv6 vs. IPv4 views of the PCB. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_printf("in6p_options: %p   in6p_outputopts: %p   "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
	    (uintmax_t)inp->inp_gencnt);
}
3316 
/* DDB command "show inpcb <addr>": dump the inpcb at the given address. */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
3329 #endif /* DDB */
3330 
3331 #ifdef RATELIMIT
3332 /*
3333  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3334  * if any.
3335  */
3336 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3337 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3338 {
3339 	union if_snd_tag_modify_params params = {
3340 		.rate_limit.max_rate = max_pacing_rate,
3341 		.rate_limit.flags = M_NOWAIT,
3342 	};
3343 	struct m_snd_tag *mst;
3344 	int error;
3345 
3346 	mst = inp->inp_snd_tag;
3347 	if (mst == NULL)
3348 		return (EINVAL);
3349 
3350 	if (mst->sw->snd_tag_modify == NULL) {
3351 		error = EOPNOTSUPP;
3352 	} else {
3353 		error = mst->sw->snd_tag_modify(mst, &params);
3354 	}
3355 	return (error);
3356 }
3357 
3358 /*
3359  * Query existing TX rate limit based on the existing
3360  * "inp->inp_snd_tag", if any.
3361  */
3362 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3363 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3364 {
3365 	union if_snd_tag_query_params params = { };
3366 	struct m_snd_tag *mst;
3367 	int error;
3368 
3369 	mst = inp->inp_snd_tag;
3370 	if (mst == NULL)
3371 		return (EINVAL);
3372 
3373 	if (mst->sw->snd_tag_query == NULL) {
3374 		error = EOPNOTSUPP;
3375 	} else {
3376 		error = mst->sw->snd_tag_query(mst, &params);
3377 		if (error == 0 && p_max_pacing_rate != NULL)
3378 			*p_max_pacing_rate = params.rate_limit.max_rate;
3379 	}
3380 	return (error);
3381 }
3382 
3383 /*
3384  * Query existing TX queue level based on the existing
3385  * "inp->inp_snd_tag", if any.
3386  */
int
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	/* Without an existing tag there is nothing to query. */
	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = mst->sw->snd_tag_query(mst, &params);
	/* The output pointer is optional. */
	if (error == 0 && p_txqueue_level != NULL)
		*p_txqueue_level = params.rate_limit.queue_level;
	return (error);
}
3406 
3407 /*
3408  * Allocate a new TX rate limit send tag from the network interface
3409  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3410  */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)

{
	union if_snd_tag_alloc_params params = {
		/* A rate of -1U requests an unlimited tag type. */
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	/* EOPNOTSUPP is not counted as an allocation failure. */
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		  counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}
3447 
/* Release a send tag reference and decrement the active-tag counter. */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3457 
3458 /*
3459  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3460  * if any:
3461  */
void
in_pcbdetach_txrtlmt(struct inpcb *inp)
{
	struct m_snd_tag *mst;

	INP_WLOCK_ASSERT(inp);

	/* Clear the PCB's tag pointer before dropping the reference. */
	mst = inp->inp_snd_tag;
	inp->inp_snd_tag = NULL;

	if (mst == NULL)
		return;

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3480 
/*
 * Attach, detach or modify the TX rate-limit send tag on "inp" so that
 * it matches "max_pacing_rate" and the egress interface "ifp".  Returns
 * 0 or an errno; EAGAIN means no RSS hash is available on "mb" yet.
 * Callers hold the inpcb write lock (asserted in in_pcbdetach_txrtlmt()
 * and in_pcbattach_txrtlmt()).
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No rate requested and no tag held: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate-limit: drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* Tag exists and interface matches: just adjust the rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3530 
3531 /*
3532  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3533  * is set in the fast path and will attach/detach/modify the TX rate
3534  * limit send tag based on the socket's so_max_pacing_rate value.
3535  */
3536 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3537 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3538 {
3539 	struct socket *socket;
3540 	uint32_t max_pacing_rate;
3541 	bool did_upgrade;
3542 
3543 	if (inp == NULL)
3544 		return;
3545 
3546 	socket = inp->inp_socket;
3547 	if (socket == NULL)
3548 		return;
3549 
3550 	if (!INP_WLOCKED(inp)) {
3551 		/*
3552 		 * NOTE: If the write locking fails, we need to bail
3553 		 * out and use the non-ratelimited ring for the
3554 		 * transmit until there is a new chance to get the
3555 		 * write lock.
3556 		 */
3557 		if (!INP_TRY_UPGRADE(inp))
3558 			return;
3559 		did_upgrade = 1;
3560 	} else {
3561 		did_upgrade = 0;
3562 	}
3563 
3564 	/*
3565 	 * NOTE: The so_max_pacing_rate value is read unlocked,
3566 	 * because atomic updates are not required since the variable
3567 	 * is checked at every mbuf we send. It is assumed that the
3568 	 * variable read itself will be atomic.
3569 	 */
3570 	max_pacing_rate = socket->so_max_pacing_rate;
3571 
3572 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3573 
3574 	if (did_upgrade)
3575 		INP_DOWNGRADE(inp);
3576 }
3577 
3578 /*
3579  * Track route changes for TX rate limiting.
3580  */
3581 void
in_pcboutput_eagain(struct inpcb * inp)3582 in_pcboutput_eagain(struct inpcb *inp)
3583 {
3584 	bool did_upgrade;
3585 
3586 	if (inp == NULL)
3587 		return;
3588 
3589 	if (inp->inp_snd_tag == NULL)
3590 		return;
3591 
3592 	if (!INP_WLOCKED(inp)) {
3593 		/*
3594 		 * NOTE: If the write locking fails, we need to bail
3595 		 * out and use the non-ratelimited ring for the
3596 		 * transmit until there is a new chance to get the
3597 		 * write lock.
3598 		 */
3599 		if (!INP_TRY_UPGRADE(inp))
3600 			return;
3601 		did_upgrade = 1;
3602 	} else {
3603 		did_upgrade = 0;
3604 	}
3605 
3606 	/* detach rate limiting */
3607 	in_pcbdetach_txrtlmt(inp);
3608 
3609 	/* make sure new mbuf send tag allocation is made */
3610 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3611 
3612 	if (did_upgrade)
3613 		INP_DOWNGRADE(inp);
3614 }
3615 
3616 #ifdef INET
/* Allocate the rate-limit statistics counters (M_WAITOK, cannot fail). */
static void
rl_init(void *st)
{
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}
3626 
/* Run rl_init() at SI_SUB_PROTO_DOMAININIT time to set up the counters. */
SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3628 #endif
3629 #endif /* RATELIMIT */
3630