xref: /freebsd/sys/netinet/in_pcb.c (revision 6883b120c53735ff1681ef96d257f376731f56b3)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include "opt_ddb.h"
40 #include "opt_ipsec.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #include "opt_ratelimit.h"
44 #include "opt_rss.h"
45 
46 #include <sys/param.h>
47 #include <sys/hash.h>
48 #include <sys/systm.h>
49 #include <sys/libkern.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/eventhandler.h>
54 #include <sys/domain.h>
55 #include <sys/proc.h>
56 #include <sys/protosw.h>
57 #include <sys/smp.h>
58 #include <sys/smr.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <sys/priv.h>
63 #include <sys/proc.h>
64 #include <sys/refcount.h>
65 #include <sys/jail.h>
66 #include <sys/kernel.h>
67 #include <sys/sysctl.h>
68 
69 #ifdef DDB
70 #include <ddb/ddb.h>
71 #endif
72 
73 #include <vm/uma.h>
74 #include <vm/vm.h>
75 
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/if_private.h>
79 #include <net/if_types.h>
80 #include <net/if_llatbl.h>
81 #include <net/route.h>
82 #include <net/rss_config.h>
83 #include <net/vnet.h>
84 
85 #if defined(INET) || defined(INET6)
86 #include <netinet/in.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/in_pcb_var.h>
89 #include <netinet/tcp.h>
90 #ifdef INET
91 #include <netinet/in_var.h>
92 #include <netinet/in_fib.h>
93 #endif
94 #include <netinet/ip_var.h>
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/in6_pcb.h>
98 #include <netinet6/in6_var.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101 #include <net/route/nhop.h>
102 #endif
103 
104 #include <netipsec/ipsec_support.h>
105 
106 #include <security/mac/mac_framework.h>
107 
108 #define	INPCBLBGROUP_SIZMIN	8
109 #define	INPCBLBGROUP_SIZMAX	256
110 
111 #define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
112 #define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */
113 
114 /*
115  * These configure the range of local port addresses assigned to
116  * "unspecified" outgoing connections/packets/whatever.
117  */
118 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
119 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
120 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
121 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
122 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
123 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
124 
125 /*
126  * Reserved ports accessible only to root. There are significant
127  * security considerations that must be accounted for when changing these,
128  * but the security benefits can be great. Please be careful.
129  */
130 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
131 VNET_DEFINE(int, ipport_reservedlow);
132 
133 /* Enable random ephemeral port allocation by default. */
134 VNET_DEFINE(int, ipport_randomized) = 1;
135 
136 #ifdef INET
137 static struct inpcb	*in_pcblookup_internal(struct inpcbinfo *pcbinfo,
138     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
139     u_int lport_arg, int lookupflags, uint8_t numa_domain, int fib);
140 
141 #define RANGECHK(var, min, max) \
142 	if ((var) < (min)) { (var) = (min); } \
143 	else if ((var) > (max)) { (var) = (max); }
144 
145 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)146 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
147 {
148 	int error;
149 
150 	error = sysctl_handle_int(oidp, arg1, arg2, req);
151 	if (error == 0) {
152 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
153 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
154 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
155 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
156 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
157 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
158 	}
159 	return (error);
160 }
161 
162 #undef RANGECHK
163 
164 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
165     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
166     "IP Ports");
167 
168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
169     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
170     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
171     "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
173     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
174     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
175     "");
176 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
177     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
178     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
179     "");
180 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
181     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
182     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
183     "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
185     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
186     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
187     "");
188 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
189     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
190     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
191     "");
192 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
193 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
194 	&VNET_NAME(ipport_reservedhigh), 0, "");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
196 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
198 	CTLFLAG_VNET | CTLFLAG_RW,
199 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
200 
201 #ifdef RATELIMIT
202 counter_u64_t rate_limit_new;
203 counter_u64_t rate_limit_chg;
204 counter_u64_t rate_limit_active;
205 counter_u64_t rate_limit_alloc_fail;
206 counter_u64_t rate_limit_set_ok;
207 
208 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
209     "IP Rate Limiting");
210 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
211     &rate_limit_active, "Active rate limited connections");
212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
213    &rate_limit_alloc_fail, "Rate limited connection failures");
214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
215    &rate_limit_set_ok, "Rate limited setting succeeded");
216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
217    &rate_limit_new, "Total Rate limit new attempts");
218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
219    &rate_limit_chg, "Total Rate limited change attempts");
220 #endif /* RATELIMIT */
221 
222 #endif /* INET */
223 
/* Per-VNET random seed mixed into the inpcb connection hashes. */
VNET_DEFINE(uint32_t, in_pcbhashseed);
/*
 * Initialize the per-VNET hash seed with a random value so hash bucket
 * placement is unpredictable across boots/VNETs.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
/* Run first within the protocol-domain SYSINIT stage for each VNET. */
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
233 
234 #ifdef INET
235 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
236 #define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
237 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
238     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
239     "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
240 #endif
241 
242 /*
243  * in_pcb.c: manage the Protocol Control Blocks.
244  *
245  * NOTE: It is assumed that most of these functions will be called with
246  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
247  * functions often modify hash chains or addresses in pcbs.
248  */
249 
250 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred * cred,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain,int fib)251 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
252     const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
253 {
254 	struct inpcblbgroup *grp;
255 	size_t bytes;
256 
257 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
258 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
259 	if (grp == NULL)
260 		return (NULL);
261 	LIST_INIT(&grp->il_pending);
262 	grp->il_cred = crhold(cred);
263 	grp->il_vflag = vflag;
264 	grp->il_lport = port;
265 	grp->il_numa_domain = numa_domain;
266 	grp->il_fibnum = fib;
267 	grp->il_dependladdr = *addr;
268 	grp->il_inpsiz = size;
269 	return (grp);
270 }
271 
272 static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)273 in_pcblbgroup_free_deferred(epoch_context_t ctx)
274 {
275 	struct inpcblbgroup *grp;
276 
277 	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
278 	crfree(grp->il_cred);
279 	free(grp, M_PCB);
280 }
281 
/*
 * Unlink a (now empty of pending members) load balancing group from its
 * hash chain and defer freeing it to the end of the current net epoch,
 * so concurrent lockless lookups walking the chain stay safe.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
291 
292 static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb * inp)293 in_pcblbgroup_find(struct inpcb *inp)
294 {
295 	struct inpcbinfo *pcbinfo;
296 	struct inpcblbgroup *grp;
297 	struct inpcblbgrouphead *hdr;
298 
299 	INP_LOCK_ASSERT(inp);
300 
301 	pcbinfo = inp->inp_pcbinfo;
302 	INP_HASH_LOCK_ASSERT(pcbinfo);
303 
304 	hdr = &pcbinfo->ipi_lbgrouphashbase[
305 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
306 	CK_LIST_FOREACH(grp, hdr, il_list) {
307 		struct inpcb *inp1;
308 
309 		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
310 			if (inp == grp->il_inp[i])
311 				goto found;
312 		}
313 		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
314 			if (inp == inp1)
315 				goto found;
316 		}
317 	}
318 found:
319 	return (grp);
320 }
321 
/*
 * Add 'inp' to load balancing group 'grp', which must have a free slot.
 * TCP sockets that have not yet called listen() are parked on the
 * pending list instead of the active lookup array.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		/* Fill the slot before publishing the new count below. */
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
350 
/*
 * Replace 'old_grp' with a copy resized to hold 'size' members.  The new
 * group is linked onto the hash chain before the old one is freed, and
 * the old one is reclaimed via the epoch callback, so lockless readers
 * always see a valid group.  Pending members are moved wholesale.
 * Returns the new group, or NULL (leaving old_grp intact) if allocation
 * fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	/* Copy the active members, then publish the new group. */
	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
379 
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Called with the inpcb write-locked and the pcbinfo hash write-locked.
 * Finds the group matching this socket's jail, vflag, local port, local
 * address, NUMA domain and FIB, creating or growing it as needed.
 * Returns 0 on success (including the group-full case, which is only
 * logged), or ENOMEM if a group could not be allocated.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	/* Rate limit for the "limit reached" diagnostic: one per minute. */
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* A FIB-bound socket only joins a group dedicated to its FIB. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		/* All of these must match for sockets to share a group. */
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* At hard cap: drop the inp on the floor, log once
			 * a minute, but still report success. */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
454 
/*
 * Remove PCB from load balance group.
 *
 * The inpcb must currently be a member (asserted via INP_INLBGROUP); it
 * is searched for among the active slots and then the pending lists of
 * every group on its port hash chain.  The group is freed when its last
 * member leaves.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Plug the hole with the last active slot. */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				grp->il_pendcnt--;
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP guarantees membership: the search cannot fail. */
	__assert_unreachable();
}
508 
509 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)510 in_pcblbgroup_numa(struct inpcb *inp, int arg)
511 {
512 	struct inpcbinfo *pcbinfo;
513 	int error;
514 	uint8_t numa_domain;
515 
516 	switch (arg) {
517 	case TCP_REUSPORT_LB_NUMA_NODOM:
518 		numa_domain = M_NODOM;
519 		break;
520 	case TCP_REUSPORT_LB_NUMA_CURDOM:
521 		numa_domain = PCPU_GET(domain);
522 		break;
523 	default:
524 		if (arg < 0 || arg >= vm_ndomains)
525 			return (EINVAL);
526 		numa_domain = arg;
527 	}
528 
529 	pcbinfo = inp->inp_pcbinfo;
530 	INP_WLOCK_ASSERT(inp);
531 	INP_HASH_WLOCK(pcbinfo);
532 	if (in_pcblbgroup_find(inp) != NULL) {
533 		/* Remove it from the old group. */
534 		in_pcbremlbgrouphash(inp);
535 		/* Add it to the new group based on numa domain. */
536 		in_pcbinslbgrouphash(inp, numa_domain);
537 		error = 0;
538 	} else {
539 		error = ENOENT;
540 	}
541 	INP_HASH_WUNLOCK(pcbinfo);
542 	return (error);
543 }
544 
/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 *
 * Allocates the exact-match and wildcard connection hashes plus, when
 * requested (nelements > 0), the local-port and lb-group hashes, and
 * wires the pcbinfo to the protocol's shared UMA/SMR zone.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements, u_int lbgrouphash_nelements)
{
	struct hashalloc_args ha = {
		.mtype = M_PCB,
		.mflags = M_WAITOK,
		.head = HASH_HEAD_CK_LIST,
	};

	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
	CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
	pcbinfo->ipi_count = 0;

	/*
	 * The mask is derived from ha.size after hashalloc() returns;
	 * NOTE(review): presumably hashalloc() rounds the size to a power
	 * of two — confirm against its definition.
	 */
	ha.size = hash_nelements;
	pcbinfo->ipi_hash_exact = hashalloc(&ha);
	pcbinfo->ipi_hash_wild = hashalloc(&ha);
	pcbinfo->ipi_hashmask = ha.size - 1;

	if (porthash_nelements > 0) {
		/* No point in more buckets than possible port numbers. */
		ha.size = imin(porthash_nelements, IPPORT_MAX + 1);
		pcbinfo->ipi_porthashbase = hashalloc(&ha);
		pcbinfo->ipi_porthashmask = ha.size - 1;
	} else
		pcbinfo->ipi_porthashbase = NULL;
	if (lbgrouphash_nelements > 0) {
		ha.size = imin(lbgrouphash_nelements, IPPORT_MAX + 1);
		pcbinfo->ipi_lbgrouphashbase = hashalloc(&ha);
		pcbinfo->ipi_lbgrouphashmask = ha.size - 1;
	} else
		pcbinfo->ipi_lbgrouphashbase = NULL;

	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
584 
/*
 * Destroy an inpcbinfo.  All member pcbs must already have been freed
 * (ipi_count == 0); releases every hash table and the hash lock.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{
	struct hashalloc_args ha = {
		.mtype = M_PCB,
		.head = HASH_HEAD_CK_LIST,
	};

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Reconstruct each table's size from its mask for hashfree(). */
	ha.size = pcbinfo->ipi_hashmask + 1;
	hashfree(pcbinfo->ipi_hash_exact, &ha);
	hashfree(pcbinfo->ipi_hash_wild, &ha);
	if (pcbinfo->ipi_porthashbase != NULL) {
		ha.size = pcbinfo->ipi_porthashmask + 1;
		hashfree(pcbinfo->ipi_porthashbase, &ha);
	}
	if (pcbinfo->ipi_lbgrouphashbase != NULL) {
		ha.size = pcbinfo->ipi_lbgrouphashmask + 1;
		hashfree(pcbinfo->ipi_lbgrouphashbase, &ha);
	}
	mtx_destroy(&pcbinfo->ipi_hash_lock);
}
612 
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/*
	 * UMA_ZONE_SMR makes the zone safe for lockless (epoch/SMR)
	 * readers; the protocol supplies its own per-inpcb init hook.
	 */
	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
}
626 
627 /*
628  * Destroy a pcbstorage - used by unloadable protocols.
629  */
630 void
in_pcbstorage_destroy(void * arg)631 in_pcbstorage_destroy(void *arg)
632 {
633 	struct inpcbstorage *pcbstor = arg;
634 
635 	uma_zdestroy(pcbstor->ips_zone);
636 }
637 
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns ENOBUFS if the zone allocation fails, or an error from the
 * MAC/IPsec initialization hooks; on those error paths the partially
 * initialized inpcb is released again.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zero only the region from inp_start_zero onward. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);	/* dropped on the error path */
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label initialized above. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	inp->inp_flags |= INP_UNCONNECTED;
	INP_WLOCK(inp);
	INP_HASH_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
	INP_HASH_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
725 
726 #if defined(INET) || defined(INET6)
727 /*
728  * Assign a local port like in_pcb_lport(), but also used with connect()
729  * and a foreign address and port.  If fsa is non-NULL, choose a local port
730  * that is unused with those, otherwise one that is completely unused.
731  * lsa can be NULL for IPv6.
732  */
733 int
in_pcb_lport_dest(const struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)734 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
735     u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
736     int lookupflags)
737 {
738 	struct inpcbinfo *pcbinfo;
739 	struct inpcb *tmpinp;
740 	unsigned short *lastport;
741 	int count, error;
742 	u_short aux, first, last, lport;
743 #ifdef INET
744 	struct in_addr laddr, faddr;
745 #endif
746 #ifdef INET6
747 	struct in6_addr *laddr6, *faddr6;
748 #endif
749 
750 	pcbinfo = inp->inp_pcbinfo;
751 
752 	/*
753 	 * Because no actual state changes occur here, a global write lock on
754 	 * the pcbinfo isn't required.
755 	 */
756 	INP_LOCK_ASSERT(inp);
757 	INP_HASH_LOCK_ASSERT(pcbinfo);
758 
759 	if (inp->inp_flags & INP_HIGHPORT) {
760 		first = V_ipport_hifirstauto;	/* sysctl */
761 		last  = V_ipport_hilastauto;
762 		lastport = &pcbinfo->ipi_lasthi;
763 	} else if (inp->inp_flags & INP_LOWPORT) {
764 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
765 		if (error)
766 			return (error);
767 		first = V_ipport_lowfirstauto;	/* 1023 */
768 		last  = V_ipport_lowlastauto;	/* 600 */
769 		lastport = &pcbinfo->ipi_lastlow;
770 	} else {
771 		first = V_ipport_firstauto;	/* sysctl */
772 		last  = V_ipport_lastauto;
773 		lastport = &pcbinfo->ipi_lastport;
774 	}
775 
776 	/*
777 	 * Instead of having two loops further down counting up or down
778 	 * make sure that first is always <= last and go with only one
779 	 * code path implementing all logic.
780 	 */
781 	if (first > last) {
782 		aux = first;
783 		first = last;
784 		last = aux;
785 	}
786 
787 #ifdef INET
788 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
789 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
790 		if (lsa != NULL)
791 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
792 		if (fsa != NULL)
793 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
794 	}
795 #endif
796 #ifdef INET6
797 	laddr6 = NULL;
798 	if ((inp->inp_vflag & INP_IPV6) != 0) {
799 		if (lsa != NULL)
800 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
801 		if (fsa != NULL)
802 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
803 	}
804 #endif
805 
806 	tmpinp = NULL;
807 
808 	if (V_ipport_randomized)
809 		*lastport = first + (arc4random() % (last - first));
810 
811 	count = last - first;
812 
813 	do {
814 		if (count-- < 0)	/* completely used? */
815 			return (EADDRNOTAVAIL);
816 		++*lastport;
817 		if (*lastport < first || *lastport > last)
818 			*lastport = first;
819 		lport = htons(*lastport);
820 
821 		if (fsa != NULL) {
822 #ifdef INET
823 			if (lsa->sa_family == AF_INET) {
824 				tmpinp = in_pcblookup_internal(pcbinfo,
825 				    faddr, fport, laddr, lport, lookupflags,
826 				    M_NODOM, RT_ALL_FIBS);
827 			}
828 #endif
829 #ifdef INET6
830 			if (lsa->sa_family == AF_INET6) {
831 				tmpinp = in6_pcblookup_internal(pcbinfo,
832 				    faddr6, fport, laddr6, lport, lookupflags,
833 				    M_NODOM, RT_ALL_FIBS);
834 			}
835 #endif
836 		} else {
837 #ifdef INET6
838 			if ((inp->inp_vflag & INP_IPV6) != 0) {
839 				tmpinp = in6_pcblookup_local(pcbinfo,
840 				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
841 				    lookupflags, cred);
842 #ifdef INET
843 				if (tmpinp == NULL &&
844 				    (inp->inp_vflag & INP_IPV4))
845 					tmpinp = in_pcblookup_local(pcbinfo,
846 					    laddr, lport, RT_ALL_FIBS,
847 					    lookupflags, cred);
848 #endif
849 			}
850 #endif
851 #if defined(INET) && defined(INET6)
852 			else
853 #endif
854 #ifdef INET
855 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
856 				    lport, RT_ALL_FIBS, lookupflags, cred);
857 #endif
858 		}
859 	} while (tmpinp != NULL);
860 
861 	*lportp = lport;
862 
863 	return (0);
864 }
865 
866 /*
867  * Select a local port (number) to use.
868  */
869 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)870 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
871     struct ucred *cred, int lookupflags)
872 {
873 	struct sockaddr_in laddr;
874 
875 	if (laddrp) {
876 		bzero(&laddr, sizeof(laddr));
877 		laddr.sin_family = AF_INET;
878 		laddr.sin_addr = *laddrp;
879 	}
880 	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
881 	    NULL, lportp, NULL, 0, cred, lookupflags));
882 }
883 #endif /* INET || INET6 */
884 
885 #ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * Returns 0 if the bind is permissible, EADDRNOTAVAIL if a non-local
 * address was requested without INP_BINDANY, EACCES for an unprivileged
 * bind into the reserved range, or EADDRINUSE on a conflicting binding.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Reserved ports require PRIV_NETINET_RESERVEDPORT. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			     in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		/* Same-user conflicts: allowed only with a SO_REUSE* match. */
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/* A v6-only vs v4 pair of wildcards may coexist. */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
978 
979 /*
980  * Set up a bind operation on a PCB, performing port allocation
981  * as required, but do not actually modify the PCB. Callers can
982  * either complete the bind by setting inp_laddr/inp_lport and
983  * calling in_pcbinshash(), or they can just use the resulting
984  * port and address to authorise the sending of a once-off packet.
985  *
986  * On error, the values of *laddrp and *lportp are not changed.
987  */
static int
in_pcbbind_setup_locked(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/*
	 * The caller either supplies the address/port via 'sin' or via the
	 * in/out parameters *laddrp/*lportp — never both an explicit sin and
	 * a pre-set local address.
	 */
	laddr.s_addr = *laddrp;
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/*
	 * Without any SO_REUSE* option set, a wildcard match against
	 * existing bindings must also be treated as a conflict.
	 */
	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		/* No address requested: just validate against the jail. */
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		/* Restrict the conflict check to one FIB if requested. */
		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	/* A port already stored in *lportp takes precedence. */
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested anywhere: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	/* Success: only now are the in/out parameters updated. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1053 
1054 int
in_pcbbind_setup(struct inpcb * inp,struct sockaddr_in * sin,in_addr_t * laddrp,u_short * lportp,int flags,struct ucred * cred)1055 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
1056     u_short *lportp, int flags, struct ucred *cred)
1057 {
1058 	int error;
1059 
1060 	INP_HASH_WLOCK(inp->inp_pcbinfo);
1061 	error = in_pcbbind_setup_locked(inp, sin, laddrp, lportp, flags, cred);
1062 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1063 
1064 	return (error);
1065 }
1066 
1067 #ifdef INET
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int error;
	bool anonport;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);

	/* Reject a second bind on an already-bound pcb. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* An unspecified port means the kernel picks one (anonymous port). */
	anonport = sin == NULL || sin->sin_port == 0;

	INP_HASH_WLOCK(inp->inp_pcbinfo);
	/* Writes inp_laddr/inp_lport directly on success. */
	error = in_pcbbind_setup_locked(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error) {
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		return (error);
	}
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		/*
		 * Hash insertion failed: roll back the address/port/FIB
		 * state set up above so the pcb looks unbound again.
		 */
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1105 #endif
1106 
1107 /*
1108  * Connect from a socket to a specified address.
1109  * Both address and port must be specified in argument sin.
1110  * If don't have a local address for this socket yet,
1111  * then pick one.
1112  */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/* A foreign port is mandatory for connect. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	/* If no local port is bound yet, one must be allocated below. */
	anonport = (inp->inp_lport == 0);

	/*
	 * INADDR_ANY / INADDR_BROADCAST destinations are rewritten to a
	 * concrete address, if the sysctl allows it and at least one IPv4
	 * interface address exists.
	 */
	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address.  If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
		 */
		if (in_nullhost(sin->sin_addr)) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
		    & IFF_BROADCAST) {
			faddr = satosin(&CK_STAILQ_FIRST(
			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		} else
			faddr = sin->sin_addr;
	} else
		faddr = sin->sin_addr;

	INP_HASH_WLOCK(inp->inp_pcbinfo);
	/* Pick a local address if the socket is not yet bound to one. */
	if (in_nullhost(inp->inp_laddr)) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		if (__predict_false(error)) {
			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
			return (error);
		}
	} else
		laddr = inp->inp_laddr;

	if (anonport) {
		/*
		 * Allocate an ephemeral local port suitable for the chosen
		 * local/foreign address pair.
		 */
		struct sockaddr_in lsin = {
			.sin_family = AF_INET,
			.sin_addr = laddr,
		};
		struct sockaddr_in fsin = {
			.sin_family = AF_INET,
			.sin_addr = faddr,
		};

		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
		    INPLOOKUP_WILDCARD);
		if (__predict_false(error)) {
			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
			return (error);
		}
	} else if (in_pcblookup_internal(inp->inp_pcbinfo, faddr,
	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
	    NULL) {
		/* The resulting 4-tuple would collide with an existing pcb. */
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		return (EADDRINUSE);
	} else
		lport = inp->inp_lport;

	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
	    (inp->inp_flags & INP_UNCONNECTED));

	inp->inp_faddr = faddr;
	inp->inp_fport = sin->sin_port;
	inp->inp_laddr = laddr;
	inp->inp_lport = lport;

	/*
	 * A pcb that was taken off the hash on disconnect is re-inserted;
	 * one that is still hashed is moved to its new slot.
	 */
	if (inp->inp_flags & INP_UNCONNECTED) {
		error = in_pcbinshash(inp);
		MPASS(error == 0);
	} else
		in_pcbrehash(inp);
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);

	/* Optionally precompute a flow id for outbound FIB hashing. */
	if (V_fib_hash_outbound) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(inp->inp_laddr,
		    inp->inp_faddr, 0, sin->sin_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1223 
1224 /*
1225  * Do proper source address selection on an unbound socket in case
1226  * of connect. Take jails into account as well.
1227  */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, prefer the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
	    inp->inp_moptions->imo_multicast_ifp != NULL) {
		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
		struct in_ifaddr *ia;

		/* Find an address on that interface visible to the jail. */
		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifp == ifp &&
			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
				break;
		}
		if (ia == NULL)
			return (EADDRNOTAVAIL);
		*laddr = ia->ia_addr.sin_addr;
		return (0);
	}

	error = 0;

	/* Build a sockaddr_in for the destination, used by ifa lookups. */
	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* Try point-to-point destinations first, then on-link nets. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: use the interface address as-is. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: scan the interface for any AF_INET address
		 * belonging to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		/* Three progressively weaker ways to find the owning ifa. */
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Scan that interface for a jail-visible address. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* A selected address of INADDR_ANY is never acceptable. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1455 
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	/* Nothing to do if already taken off the connection hash. */
	if (inp->inp_flags & INP_UNCONNECTED)
		return;

	/*
	 * Move the pcb from the hash to the unconnected list while holding
	 * the hash lock, then mark it unconnected.
	 */
	INP_HASH_WLOCK(inp->inp_pcbinfo);
	in_pcbremhash(inp);
	CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
	    inp_unconn_list);
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
	inp->inp_flags |= INP_UNCONNECTED;

	if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		/* See the comment in in_pcbinshash(). */
		inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
		inp->inp_faddr.s_addr = INADDR_ANY;
		inp->inp_fport = 0;
	}
}
1481 #endif /* INET */
1482 
1483 void
in_pcblisten(struct inpcb * inp)1484 in_pcblisten(struct inpcb *inp)
1485 {
1486 	struct inpcblbgroup *grp;
1487 
1488 	INP_WLOCK_ASSERT(inp);
1489 
1490 	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
1491 		struct inpcbinfo *pcbinfo;
1492 
1493 		pcbinfo = inp->inp_pcbinfo;
1494 		INP_HASH_WLOCK(pcbinfo);
1495 		grp = in_pcblbgroup_find(inp);
1496 		LIST_REMOVE(inp, inp_lbgroup_list);
1497 		grp->il_pendcnt--;
1498 		in_pcblbgroup_insert(grp, inp);
1499 		INP_HASH_WUNLOCK(pcbinfo);
1500 	}
1501 }
1502 
1503 /*
1504  * inpcb hash lookups are protected by SMR section.
1505  *
1506  * Once desired pcb has been found, switching from SMR section to a pcb
1507  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1508  * here because SMR is a critical section.
1509  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1510  */
1511 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1512 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1513 {
1514 
1515 	lock == INPLOOKUP_RLOCKPCB ?
1516 	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1517 }
1518 
1519 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1520 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1521 {
1522 
1523 	lock == INPLOOKUP_RLOCKPCB ?
1524 	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1525 }
1526 
1527 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1528 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1529 {
1530 
1531 	return (lock == INPLOOKUP_RLOCKPCB ?
1532 	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1533 }
1534 
/*
 * Transition from the SMR section to holding the pcb lock.  On success the
 * SMR section has been exited and the pcb is locked; on failure the SMR
 * section has been exited and the pcb is unlocked.  Pcbs whose inp_flags
 * intersect 'ignflags' are treated as gone and rejected.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: lock acquired without blocking while still in SMR. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference so it survives leaving
	 * the SMR section, then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Refcount already zero: the pcb is being torn down. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1573 
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions shall ignore not only pcbs that
	 * had been freed that may be found due to lockless access to the hash,
	 * but also pcbs that were removed from the hash, but are still around.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_UNCONNECTED));
}
1585 
1586 /*
1587  * inp_next() - inpcb hash/list traversal iterator
1588  *
1589  * Requires initialized struct inpcb_iterator for context.
1590  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1591  *
1592  * - Iterator can have either write-lock or read-lock semantics, that can not
1593  *   be changed later.
1594  * - Iterator has three modes of operation, defined by value of .hash member
1595  *   on the first call:
1596  *   - .hash = INP_ALL_LIST: the iterator will go through the unconnected
1597  *     list, then all wildcard hash slots and then all exact hash slots.
1598  *   - .hash = INP_UNCONN_LIST: the iterator will go through the list of
1599  *     unconnected pcbs only.
1600  *   - .hash initialized with an arbitrary positive value: iterator will go
1601  *     through this exact hash slot only.
1602  *   Note: only rip_input() and sysctl_setsockopt() use the latter.
1603  *   The interface may be extended for iteration over single wildcard hash
1604  *   slot, but there is no use case for that today.
1605  * - Iterator may have optional bool matching function.  The matching function
1606  *   will be executed for each inpcb in the SMR context, so it can not acquire
1607  *   locks and can safely access only immutable fields of inpcb.
1608  *
1609  * A fresh initialized iterator has NULL inpcb in its context and that
1610  * means that inp_next() call would return the very first inpcb on the list
1611  * locked with desired semantic.  In all following calls the context pointer
1612  * shall hold the current inpcb pointer.  The KPI user is not supposed to
1613  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
1614  * and write NULL to its context.  After end of traversal an iterator can be
1615  * reused.
1616  *
1617  * List traversals have the following features/constraints:
1618  * - New entries won't be seen, as they are always added to the head of a list.
1619  * - Removed entries won't stop traversal as long as they are not added to
1620  *   a different list. This is violated by in_pcbrehash().
1621  */
1622 static inline struct inpcb *
ii_list_first(const struct inpcb_iterator * ii)1623 ii_list_first(const struct inpcb_iterator *ii)
1624 {
1625 	const struct inpcbinfo *ipi = ii->ipi;
1626 	const int hash = ii->hash;
1627 
1628 	if (hash < 0)
1629 		return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
1630 	else if (hash <= ipi->ipi_hashmask)
1631 		return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
1632 	else
1633 		return (CK_LIST_FIRST(
1634 		    &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
1635 }
1636 
1637 static inline struct inpcb *
ii_list_next(const struct inpcb_iterator * ii,struct inpcb * inp)1638 ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
1639 {
1640 	if (ii->hash < 0)
1641 		return (CK_LIST_NEXT(inp, inp_unconn_list));
1642 	else if (ii->hash <= ii->ipi->ipi_hashmask)
1643 		return (CK_LIST_NEXT(inp, inp_hash_wild));
1644 	else
1645 		return (CK_LIST_NEXT(inp, inp_hash_exact));
1646 }
1647 
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	/* Total slot count: wildcard table followed by exact table. */
	const int hashmax = (ipi->ipi_hashmask + 1) * 2;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		if ((ii->hash = ii->mode) >= 0) {
			/* Targeted iterators support only the exact hash. */
			MPASS(ii->hash <= ipi->ipi_hashmask);
			ii->hash += ipi->ipi_hashmask + 1;
		}
		smr_enter(ipi->ipi_smr);
next_first:
		/* This is unrolled CK_LIST_FOREACH() over different headers. */
		for (inp = ii_list_first(ii);
		    inp != NULL;
		    inp = ii_list_next(ii, inp)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * Locking failed and SMR was exited; re-enter
				 * and restart this list from its head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != ii_list_first(ii));
				inp = ii_list_first(ii);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL) {
			/* Exhausted this list; move to the next hash slot. */
			if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
				goto next_first;
			smr_exit(ipi->ipi_smr);
		} else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	rw_assert(&inp->inp_lock,
	    lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
next:
	inp = ii_list_next(ii, inp);
	if (inp == NULL) {
		if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
			/* Advance to the next list; drop the previous pcb. */
			inp_unlock(ii->inp, lock);
			ii->inp = NULL;
			goto next_first;
		}
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'next' should yield in the same result, but
			 * could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand back the new pcb locked; release the previous one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1761 
1762 /*
1763  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1764  * stability of an inpcb pointer despite the inpcb lock being released or
1765  * SMR section exited.
1766  *
1767  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1768  */
1769 void
in_pcbref(struct inpcb * inp)1770 in_pcbref(struct inpcb *inp)
1771 {
1772 	u_int old __diagused;
1773 
1774 	old = refcount_acquire(&inp->inp_refcount);
1775 	KASSERT(old > 0, ("%s: refcount 0", __func__));
1776 }
1777 
1778 /*
1779  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1780  * freeing the pcb, if the reference was very last.
1781  */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	/* Not the last reference: the pcb stays locked and alive. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before returning the memory to the SMR-aware zone. */
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1801 
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Not the last reference: the pcb stays locked and alive. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before returning the memory to the SMR-aware zone. */
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1821 
1822 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1823 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1824 {
1825 
1826 	return (lock == INPLOOKUP_RLOCKPCB ?
1827 	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1828 }
1829 
1830 /*
1831  * Dereference and rlock inp, for which the caller must own the
1832  * reference.  Returns true if inp no longer usable, false otherwise.
1833  */
bool
in_pcbrele_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
	/* Dropped the last reference: pcb freed and unlocked already. */
	if (in_pcbrele_rlocked(inp))
		return (true);
	/* Still referenced elsewhere, but already freed: unusable. */
	if ((inp->inp_flags & INP_FREED) != 0) {
		INP_RUNLOCK(inp);
		return (true);
	}
	return (false);
}
1846 
1847 /*
1848  * Unconditionally schedule an inpcb to be freed by decrementing its
1849  * reference count, which should occur only after the inpcb has been detached
1850  * from its socket.  If another thread holds a temporary reference (acquired
1851  * using in_pcbref()) then the free is deferred until that reference is
1852  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1853  *  Almost all work, including removal from global lists, is done in this
1854  * context, where the pcbinfo lock is held.
1855  */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	INP_HASH_WLOCK(pcbinfo);
	if (inp->inp_flags & INP_UNCONNECTED)
		CK_LIST_REMOVE(inp, inp_unconn_list);
	else
		in_pcbremhash(inp);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	INP_HASH_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Detach from the socket and mark the pcb dead. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Multicast options are released after the pcb lock is dropped. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference.  If another holder remains, in_pcbrele_wlocked()
	 * returns false without unlocking, so unlock here; otherwise the pcb
	 * has already been unlocked and freed.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1929 
1930 /*
1931  * Different protocols initialize their inpcbs differently - giving
1932  * different name to the lock.  But they all are disposed the same.
1933  */
static void
inpcb_fini(void *mem, int size)
{
	/* Dispose of an inpcb: all that remains is destroying its lock. */
	INP_LOCK_DESTROY((struct inpcb *)mem);
}
1941 
1942 #ifdef INET
1943 /*
1944  * Common routines to return the socket addresses associated with inpcbs.
1945  */
1946 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1947 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1948 {
1949 	struct inpcb *inp;
1950 
1951 	inp = sotoinpcb(so);
1952 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1953 
1954 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1955 		.sin_len = sizeof(struct sockaddr_in),
1956 		.sin_family = AF_INET,
1957 		.sin_port = inp->inp_lport,
1958 		.sin_addr = inp->inp_laddr,
1959 	};
1960 
1961 	return (0);
1962 }
1963 
1964 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1965 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1966 {
1967 	struct inpcb *inp;
1968 
1969 	inp = sotoinpcb(so);
1970 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1971 
1972 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1973 		.sin_len = sizeof(struct sockaddr_in),
1974 		.sin_family = AF_INET,
1975 		.sin_port = inp->inp_fport,
1976 		.sin_addr = inp->inp_faddr,
1977 	};
1978 
1979 	return (0);
1980 }
1981 
1982 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1983 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1984 {
1985 
1986 	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1987 		return (true);
1988 	else
1989 		return (false);
1990 }
1991 
/*
 * Purge interface 'ifp' from the IPv4 multicast state of every PCB in
 * 'pcbinfo': clear it as the selected outgoing interface and drop group
 * memberships that were joined through it.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		/* Non-NULL: guaranteed by the inp_v4_multi_match() filter. */
		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
		/*
		 * Removing an entry modifies the filter list under the
		 * iterator, so rescan from the top after each removal.
		 */
restart:
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
2034 
2035 /*
2036  * Lookup a PCB based on the local address and port.  Caller must hold the
2037  * hash lock.  No inpcb locks or references are acquired.
2038  */
/*
 * Extra wildcard cost charged to a dual (INP_IPV6) PCB so that an
 * IPv4-only PCB bound to 0.0.0.0 beats a dual PCB bound to ::.
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;	/* wildcard score of current candidate; lower wins */

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			/* Keep the lowest-scoring candidate seen so far. */
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2147 
2148 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2149 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2150 {
2151 	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2152 	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2153 }
2154 
/*
 * Find the best-matching SO_REUSEPORT_LB group for the local address and
 * port, then pick a member PCB by hashing the packet's addresses and ports
 * over the group's PCB array.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Apply the preference order described above. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2238 
2239 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2240 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2241     u_short fport, struct in_addr laddr, u_short lport)
2242 {
2243 #ifdef INET6
2244 	/* XXX inp locking */
2245 	if ((inp->inp_vflag & INP_IPV4) == 0)
2246 		return (false);
2247 #endif
2248 	if (inp->inp_faddr.s_addr == faddr.s_addr &&
2249 	    inp->inp_laddr.s_addr == laddr.s_addr &&
2250 	    inp->inp_fport == fport &&
2251 	    inp->inp_lport == lport)
2252 		return (true);
2253 	return (false);
2254 }
2255 
/*
 * Search the exact-match hash for a PCB whose 4-tuple equals
 * { faddr:fport, laddr:lport }.  Returns NULL when there is none.
 */
static struct inpcb *
in_pcblookup_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2273 
/*
 * How a PCB matched a wildcard (unconnected) lookup.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* does not match the tuple */
	INPLOOKUP_MATCH_WILD = 1,	/* matched via INADDR_ANY local addr */
	INPLOOKUP_MATCH_LADDR = 2,	/* matched the exact local address */
} inp_lookup_match_t;
2279 
2280 static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb * inp,struct in_addr laddr,u_short lport,int fib)2281 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
2282     u_short lport, int fib)
2283 {
2284 #ifdef INET6
2285 	/* XXX inp locking */
2286 	if ((inp->inp_vflag & INP_IPV4) == 0)
2287 		return (INPLOOKUP_MATCH_NONE);
2288 #endif
2289 	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
2290 		return (INPLOOKUP_MATCH_NONE);
2291 	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
2292 		return (INPLOOKUP_MATCH_NONE);
2293 	if (inp->inp_laddr.s_addr == INADDR_ANY)
2294 		return (INPLOOKUP_MATCH_WILD);
2295 	if (inp->inp_laddr.s_addr == laddr.s_addr)
2296 		return (INPLOOKUP_MATCH_LADDR);
2297 	return (INPLOOKUP_MATCH_NONE);
2298 }
2299 
2300 #define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)
2301 
/*
 * Wildcard lookup performed from within an SMR read section, without the
 * hash lock.  Only the first matching candidate on the chain is considered;
 * if it cannot be locked, or stops matching once locked, INP_LOOKUP_AGAIN
 * is returned so that the caller retries with the hash lock held.
 */
static struct inpcb *
in_pcblookup_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check the match now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2338 
/*
 * Wildcard lookup with the hash lock held: scan the whole chain and return
 * the highest-ranking match per the selection order below.
 */
static struct inpcb *
in_pcblookup_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* Skip jails that may not use this local address. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed PCB can never beat local_exact. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				/* Best possible rank: jailed, non-wild. */
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	/* A v4-mapped v6 wildcard is the last resort. */
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2411 
2412 /*
2413  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2414  * that the caller has either locked the hash list, which usually happens
2415  * for bind(2) operations, or is in SMR section, which happens when sorting
2416  * out incoming packets.
2417  */
static struct inpcb *
in_pcblookup_internal(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* A connected (exact 4-tuple) match always wins. */
	inp = in_pcblookup_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* Prefer a load-balancing group over a plain wildcard PCB. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
2449 
2450 /*
2451  * Lookup inpcb using locks. Used by in_pcblookup_smr() in case inp_smr_lock()
2452  * failed.
2453  */
static struct inpcb *
in_pcblookup_with_lock(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_internal(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * The inpcb lock is contended: take a reference to keep the
		 * PCB from being freed, drop the hash lock, then wait for
		 * the inpcb lock.  If in_pcbrele() reports the PCB was
		 * released in the meantime, fail the lookup.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2480 
/*
 * Lock-free lookup: enter an SMR read section, try the exact and wildcard
 * paths, and fall back to in_pcblookup_with_lock() whenever a candidate
 * cannot be locked or re-validated.  NOTE(review): smr_exit() is called
 * here only on the NULL return path; the non-NULL paths presumably leave
 * the SMR section inside inp_smr_lock()/in_pcblookup_with_lock() — confirm.
 */
static struct inpcb *
in_pcblookup_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us.  Fall back to a precise search.
		 */
		return (in_pcblookup_with_lock(pcbinfo, faddr, fport, laddr,
		    lport, lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			/* Revalidate the LB group member after locking it. */
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_with_lock(pcbinfo, faddr, fport,
			    laddr, lport, lookupflags, numa_domain, fib));
		}
	}

	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2543 
2544 /*
2545  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2546  * from which a pre-calculated hash value may be extracted.
2547  */
2548 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2549 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2550     struct in_addr laddr, u_int lport, int lookupflags,
2551     struct ifnet *ifp)
2552 {
2553 	int fib;
2554 
2555 	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2556 	return (in_pcblookup_smr(pcbinfo, faddr, fport, laddr, lport,
2557 	    lookupflags, M_NODOM, fib));
2558 }
2559 
2560 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2561 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2562     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2563     struct ifnet *ifp __unused, struct mbuf *m)
2564 {
2565 	int fib;
2566 
2567 	M_ASSERTPKTHDR(m);
2568 	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2569 	return (in_pcblookup_smr(pcbinfo, faddr, fport, laddr, lport,
2570 	    lookupflags, m->m_pkthdr.numa_domain, fib));
2571 }
2572 #endif /* INET */
2573 
2574 static bool
in_pcbjailed(const struct inpcb * inp,unsigned int flag)2575 in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2576 {
2577 	return (prison_flag(inp->inp_cred, flag) != 0);
2578 }
2579 
2580 /*
2581  * Insert the PCB into a hash chain using ordering rules which ensure that
2582  * in_pcblookup_wild_*() always encounter the highest-ranking PCB first.
2583  *
2584  * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2585  * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
2586  * always appear last no matter whether they are jailed.
2587  */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/*
	 * Unbound v4-mapped v6 PCBs rank lowest: append to the tail of
	 * the chain regardless of jail status.
	 */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past all jailed PCBs; they stay in front. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* A jailed PCB goes ahead of a non-jailed head. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Continue from 'last' to skip bound PCBs of equal rank. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2640 
2641 #ifdef INET6
2642 /*
2643  * See the comment above _in_pcbinshash_wild().
2644  */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past all jailed PCBs; they stay in front. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* A jailed PCB goes ahead of a non-jailed head. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Continue from 'last' to skip bound PCBs of equal rank. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2686 #endif
2687 
2688 /*
2689  * Insert PCB onto various hash lists.
2690  *
2691  * With normal sockets this function shall not fail, so it could return void.
2692  * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
2693  * that's the only condition when it can fail.
2694  */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	MPASS(inp->inp_flags & INP_UNCONNECTED);

	/* Compute the 4-tuple hash and whether the PCB is connected. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Ignore SO_REUSEPORT_LB if the socket is connected.  Really this case
	 * should be an error, but for UDP sockets it is not, and some
	 * applications erroneously set it on connected UDP sockets, so we can't
	 * change this without breaking compatibility.
	 */
	if (!connected &&
	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		/* May fail (allocation); no lists have been touched yet. */
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past.  Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	/* Move the PCB off the unconnected list onto its hash chains. */
	CK_LIST_REMOVE(inp, inp_unconn_list);

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags &= ~INP_UNCONNECTED;

	return (0);
}
2768 
/*
 * Remove a hashed PCB from all of its hash chains: the LB group (if any),
 * the wild or exact 4-tuple chain, and the port list.
 */
void
in_pcbremhash(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(!(inp->inp_flags & INP_UNCONNECTED));

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
}
2795 
2796 /*
2797  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2798  * changed. NOTE: This does not handle the case of the lport changing (the
2799  * hashed port list would have to be updated as well), so the lport must
2800  * not change after in_pcbinshash() has been called.
2801  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	MPASS(!(inp->inp_flags & INP_UNCONNECTED));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* Compute the new hash and connectedness from the new 4-tuple. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* See the comment in in_pcbinshash(). */
	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	if (connected) {
		/* Was wild, becomes exact. */
		CK_LIST_REMOVE(inp, inp_hash_wild);
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		/* Was exact, becomes wild. */
		CK_LIST_REMOVE(inp, inp_hash_exact);
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2847 
/*
 * Move a rip PCB from the unconnected list to the exact-match hash bucket
 * computed by RIPCB_HASH(), clearing INP_UNCONNECTED.
 */
void
ripcb_connect(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_UNCONNECTED);

	hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask;

	INP_HASH_WLOCK(pcbinfo);
	CK_LIST_REMOVE(inp, inp_unconn_list);
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp,
	    inp_hash_exact);
	INP_HASH_WUNLOCK(pcbinfo);
	inp->inp_flags &= ~INP_UNCONNECTED;
}
2866 
/*
 * Reverse of ripcb_connect(): move a connected PCB from its exact-match
 * hash bucket back onto the unconnected list.  A no-op if the PCB is
 * already marked INP_UNCONNECTED.
 */
void
ripcb_disconnect(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);

	if (inp->inp_flags & INP_UNCONNECTED)
		return;

	INP_HASH_WLOCK(pcbinfo);
	CK_LIST_REMOVE(inp, inp_hash_exact);
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
	INP_HASH_WUNLOCK(pcbinfo);
	/* Set the flag only after the list manipulation is visible. */
	inp->inp_flags |= INP_UNCONNECTED;
}
2883 
2884 /*
2885  * Check for alternatives when higher level complains
2886  * about service problems.  For now, invalidate cached
2887  * routing information.  If the route was created dynamically
2888  * (by a redirect), time to try a default gateway again.
2889  */
2890 void
in_losing(struct inpcb * inp)2891 in_losing(struct inpcb *inp)
2892 {
2893 
2894 	RO_INVALIDATE_CACHE(&inp->inp_route);
2895 	return;
2896 }
2897 
2898 /*
2899  * A set label operation has occurred at the socket layer, propagate the
2900  * label change into the in_pcb for the socket.
2901  */
2902 void
in_pcbsosetlabel(struct socket * so)2903 in_pcbsosetlabel(struct socket *so)
2904 {
2905 #ifdef MAC
2906 	struct inpcb *inp;
2907 
2908 	inp = sotoinpcb(so);
2909 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
2910 
2911 	INP_WLOCK(inp);
2912 	SOCK_LOCK(so);
2913 	mac_inpcb_sosetlabel(so, inp);
2914 	SOCK_UNLOCK(so);
2915 	INP_WUNLOCK(inp);
2916 #endif
2917 }
2918 
/* Exported function wrapper around the INP_WLOCK() macro. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2925 
/* Exported function wrapper around the INP_WUNLOCK() macro. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2932 
/* Exported function wrapper around the INP_RLOCK() macro. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2939 
/* Exported function wrapper around the INP_RUNLOCK() macro. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2946 
2947 #ifdef INVARIANT_SUPPORT
/*
 * NOTE: despite the generic name, this asserts that the write lock
 * specifically is held (wraps INP_WLOCK_ASSERT()).
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2954 
/* Assert that the PCB lock is not held by the current thread. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2961 #endif
2962 
2963 void
inp_apply_all(struct inpcbinfo * pcbinfo,void (* func)(struct inpcb *,void *),void * arg)2964 inp_apply_all(struct inpcbinfo *pcbinfo,
2965     void (*func)(struct inpcb *, void *), void *arg)
2966 {
2967 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2968 	    INPLOOKUP_WLOCKPCB);
2969 	struct inpcb *inp;
2970 
2971 	while ((inp = inp_next(&inpi)) != NULL)
2972 		func(inp, arg);
2973 }
2974 
/*
 * Return the socket attached to an inpcb; the caller must hold the
 * PCB write lock.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2982 
/*
 * Copy out the 4-tuple (local/foreign address and port) of a locked
 * PCB, exactly as stored in the PCB (ports in network byte order; see
 * the ntohs() conversions where they are displayed elsewhere).
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
2994 
2995 /*
2996  * Create an external-format (``xinpcb'') structure using the information in
2997  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2998  * reduce the spew of irrelevant information over this interface, to isolate
2999  * user code from changes in the kernel structure, and potentially to provide
3000  * information-hiding if we decide that some of this information should be
3001  * hidden from users.
3002  */
3003 void
in_pcbtoxinpcb(const struct inpcb * inp,struct xinpcb * xi)3004 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
3005 {
3006 
3007 	bzero(xi, sizeof(*xi));
3008 	xi->xi_len = sizeof(struct xinpcb);
3009 	if (inp->inp_socket)
3010 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
3011 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
3012 	xi->inp_gencnt = inp->inp_gencnt;
3013 	xi->inp_flow = inp->inp_flow;
3014 	xi->inp_flowid = inp->inp_flowid;
3015 	xi->inp_flowtype = inp->inp_flowtype;
3016 	xi->inp_flags = inp->inp_flags;
3017 	xi->inp_flags2 = inp->inp_flags2;
3018 	xi->in6p_cksum = inp->in6p_cksum;
3019 	xi->in6p_hops = inp->in6p_hops;
3020 	xi->inp_ip_tos = inp->inp_ip_tos;
3021 	xi->inp_vflag = inp->inp_vflag;
3022 	xi->inp_ip_ttl = inp->inp_ip_ttl;
3023 	xi->inp_ip_p = inp->inp_ip_p;
3024 	xi->inp_ip_minttl = inp->inp_ip_minttl;
3025 }
3026 
/*
 * Sysctl helper that applies a socket option to a single PCB.  The new
 * value is a struct sockopt_parameters header followed by the raw
 * option value; the target PCB in "pcbinfo" is identified by its
 * generation count (sop_id).  Returns ESRCH if no PCB matches.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* Set-only handler: reject reads and require a new value. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	/* The request must fit into the on-stack staging buffer. */
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	/* At minimum, the fixed-size parameter header must be present. */
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	/* The option value is whatever follows the parameter header. */
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/*
		 * Embed the zone ID into the second 16-bit word of
		 * link-local addresses before hashing/comparison.
		 */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If both ports were supplied, restrict the iterator to the
	 * single hash bucket the connection would occupy.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			/*
			 * XXXGL
			 * 1) the inp_next() that ignores INP_UNCONNECTED needs
			 * to be generally supported.
			 * 2) Why do we ECONNRESET instead of continueing?
			 */
			if (inp->inp_flags & INP_UNCONNECTED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Hold the socket across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				/* sosetopt() takes its own locks. */
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3112 
3113 #ifdef DDB
/* Emit "indent" spaces to the debugger console. */
static void
db_print_indent(int indent)
{
	while (indent-- > 0)
		db_printf(" ");
}
3122 
/*
 * Pretty-print a struct in_conninfo for DDB: its address, then the
 * local and foreign endpoints, formatted per address family.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	/* 48 bytes is large enough for a printable IPv6 address. */
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3152 
/*
 * Pretty-print an inpcb for DDB: flags, connection info, IP options,
 * and the family-specific (v4 or v6) option pointers.
 */
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x   inp_label: %p\n", inp->inp_flow,
	    inp->inp_label);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	/* %b expands flag bits symbolically using the *_BITS strings. */
	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);

	db_print_indent(indent);
	db_printf("inp_sp: %p   inp_vflag: 0x%b\n", inp->inp_sp,
	    inp->inp_vflag, INP_VFLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_print_indent(indent);
		db_printf("in6p_options: %p   in6p_outputopts: %p   "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_print_indent(indent);
		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_print_indent(indent);
		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
3204 
/* DDB "show inpcb <addr>" command: dump the inpcb at a kernel address. */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	/* No validation of "addr" is possible here; trust the operator. */
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
3217 #endif /* DDB */
3218 
3219 #ifdef RATELIMIT
3220 /*
3221  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3222  * if any.
3223  */
3224 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3225 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3226 {
3227 	union if_snd_tag_modify_params params = {
3228 		.rate_limit.max_rate = max_pacing_rate,
3229 		.rate_limit.flags = M_NOWAIT,
3230 	};
3231 	struct m_snd_tag *mst;
3232 	int error;
3233 
3234 	mst = inp->inp_snd_tag;
3235 	if (mst == NULL)
3236 		return (EINVAL);
3237 
3238 	if (mst->sw->snd_tag_modify == NULL) {
3239 		error = EOPNOTSUPP;
3240 	} else {
3241 		error = mst->sw->snd_tag_modify(mst, &params);
3242 	}
3243 	return (error);
3244 }
3245 
3246 /*
3247  * Query existing TX rate limit based on the existing
3248  * "inp->inp_snd_tag", if any.
3249  */
3250 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3251 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3252 {
3253 	union if_snd_tag_query_params params = { };
3254 	struct m_snd_tag *mst;
3255 	int error;
3256 
3257 	mst = inp->inp_snd_tag;
3258 	if (mst == NULL)
3259 		return (EINVAL);
3260 
3261 	if (mst->sw->snd_tag_query == NULL) {
3262 		error = EOPNOTSUPP;
3263 	} else {
3264 		error = mst->sw->snd_tag_query(mst, &params);
3265 		if (error == 0 && p_max_pacing_rate != NULL)
3266 			*p_max_pacing_rate = params.rate_limit.max_rate;
3267 	}
3268 	return (error);
3269 }
3270 
3271 /*
3272  * Query existing TX queue level based on the existing
3273  * "inp->inp_snd_tag", if any.
3274  */
3275 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3276 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3277 {
3278 	union if_snd_tag_query_params params = { };
3279 	struct m_snd_tag *mst;
3280 	int error;
3281 
3282 	mst = inp->inp_snd_tag;
3283 	if (mst == NULL)
3284 		return (EINVAL);
3285 
3286 	if (mst->sw->snd_tag_query == NULL)
3287 		return (EOPNOTSUPP);
3288 
3289 	error = mst->sw->snd_tag_query(mst, &params);
3290 	if (error == 0 && p_txqueue_level != NULL)
3291 		*p_txqueue_level = params.rate_limit.queue_level;
3292 	return (error);
3293 }
3294 
3295 /*
3296  * Allocate a new TX rate limit send tag from the network interface
3297  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3298  */
3299 int
in_pcbattach_txrtlmt(struct inpcb * inp,struct ifnet * ifp,uint32_t flowtype,uint32_t flowid,uint32_t max_pacing_rate,struct m_snd_tag ** st)3300 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3301     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3302 
3303 {
3304 	union if_snd_tag_alloc_params params = {
3305 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3306 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3307 		.rate_limit.hdr.flowid = flowid,
3308 		.rate_limit.hdr.flowtype = flowtype,
3309 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3310 		.rate_limit.max_rate = max_pacing_rate,
3311 		.rate_limit.flags = M_NOWAIT,
3312 	};
3313 	int error;
3314 
3315 	INP_WLOCK_ASSERT(inp);
3316 
3317 	/*
3318 	 * If there is already a send tag, or the INP is being torn
3319 	 * down, allocating a new send tag is not allowed. Else send
3320 	 * tags may leak.
3321 	 */
3322 	if (*st != NULL || (inp->inp_flags & INP_UNCONNECTED))
3323 		return (EINVAL);
3324 
3325 	error = m_snd_tag_alloc(ifp, &params, st);
3326 #ifdef INET
3327 	if (error == 0) {
3328 		counter_u64_add(rate_limit_set_ok, 1);
3329 		counter_u64_add(rate_limit_active, 1);
3330 	} else if (error != EOPNOTSUPP)
3331 		  counter_u64_add(rate_limit_alloc_fail, 1);
3332 #endif
3333 	return (error);
3334 }
3335 
/*
 * Release a reference on a send tag and decrement the active rate
 * limit counter.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3345 
3346 /*
3347  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3348  * if any:
3349  */
3350 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3351 in_pcbdetach_txrtlmt(struct inpcb *inp)
3352 {
3353 	struct m_snd_tag *mst;
3354 
3355 	INP_WLOCK_ASSERT(inp);
3356 
3357 	mst = inp->inp_snd_tag;
3358 	inp->inp_snd_tag = NULL;
3359 
3360 	if (mst == NULL)
3361 		return;
3362 
3363 	m_snd_tag_rele(mst);
3364 #ifdef INET
3365 	counter_u64_add(rate_limit_active, -1);
3366 #endif
3367 }
3368 
/*
 * Attach, modify, or detach the PCB's TX rate limit send tag so that
 * it matches "ifp" and "max_pacing_rate".  On success, or when the
 * driver does not support rate limiting (EOPNOTSUPP), the
 * INP_RATE_LIMIT_CHANGED flag is cleared.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No pacing requested and no tag held: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate limit; drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* A matching tag already exists; just update its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3418 
3419 /*
3420  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3421  * is set in the fast path and will attach/detach/modify the TX rate
3422  * limit send tag based on the socket's so_max_pacing_rate value.
3423  */
3424 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3425 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3426 {
3427 	struct socket *socket;
3428 	uint32_t max_pacing_rate;
3429 	bool did_upgrade;
3430 
3431 	if (inp == NULL)
3432 		return;
3433 
3434 	socket = inp->inp_socket;
3435 	if (socket == NULL)
3436 		return;
3437 
3438 	if (!INP_WLOCKED(inp)) {
3439 		/*
3440 		 * NOTE: If the write locking fails, we need to bail
3441 		 * out and use the non-ratelimited ring for the
3442 		 * transmit until there is a new chance to get the
3443 		 * write lock.
3444 		 */
3445 		if (!INP_TRY_UPGRADE(inp))
3446 			return;
3447 		did_upgrade = 1;
3448 	} else {
3449 		did_upgrade = 0;
3450 	}
3451 
3452 	/*
3453 	 * NOTE: The so_max_pacing_rate value is read unlocked,
3454 	 * because atomic updates are not required since the variable
3455 	 * is checked at every mbuf we send. It is assumed that the
3456 	 * variable read itself will be atomic.
3457 	 */
3458 	max_pacing_rate = socket->so_max_pacing_rate;
3459 
3460 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3461 
3462 	if (did_upgrade)
3463 		INP_DOWNGRADE(inp);
3464 }
3465 
3466 /*
3467  * Track route changes for TX rate limiting.
3468  */
3469 void
in_pcboutput_eagain(struct inpcb * inp)3470 in_pcboutput_eagain(struct inpcb *inp)
3471 {
3472 	bool did_upgrade;
3473 
3474 	if (inp == NULL)
3475 		return;
3476 
3477 	if (inp->inp_snd_tag == NULL)
3478 		return;
3479 
3480 	if (!INP_WLOCKED(inp)) {
3481 		/*
3482 		 * NOTE: If the write locking fails, we need to bail
3483 		 * out and use the non-ratelimited ring for the
3484 		 * transmit until there is a new chance to get the
3485 		 * write lock.
3486 		 */
3487 		if (!INP_TRY_UPGRADE(inp))
3488 			return;
3489 		did_upgrade = 1;
3490 	} else {
3491 		did_upgrade = 0;
3492 	}
3493 
3494 	/* detach rate limiting */
3495 	in_pcbdetach_txrtlmt(inp);
3496 
3497 	/* make sure new mbuf send tag allocation is made */
3498 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3499 
3500 	if (did_upgrade)
3501 		INP_DOWNGRADE(inp);
3502 }
3503 
3504 #ifdef INET
/*
 * Allocate the rate limit statistics counters at boot time.  M_WAITOK
 * allocations do not fail, so no error handling is needed.  The "st"
 * argument is the unused SYSINIT cookie (NULL below).
 */
static void
rl_init(void *st)
{
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}

SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3516 #endif
3517 #endif /* RATELIMIT */
3518