xref: /freebsd/sys/netinet/in_pcb.c (revision 7b71f57f4e514a2ab7308ce4147e14d90e099ad0)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
5  *	The Regents of the University of California.
6  * Copyright (c) 2007-2009 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  * All rights reserved.
10  *
11  * Portions of this software were developed by Robert N. M. Watson under
12  * contract to Juniper Networks, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include "opt_ddb.h"
40 #include "opt_ipsec.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #include "opt_ratelimit.h"
44 #include "opt_route.h"
45 #include "opt_rss.h"
46 
47 #include <sys/param.h>
48 #include <sys/hash.h>
49 #include <sys/systm.h>
50 #include <sys/libkern.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/eventhandler.h>
55 #include <sys/domain.h>
56 #include <sys/proc.h>
57 #include <sys/protosw.h>
58 #include <sys/smp.h>
59 #include <sys/smr.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/priv.h>
64 #include <sys/proc.h>
65 #include <sys/refcount.h>
66 #include <sys/jail.h>
67 #include <sys/kernel.h>
68 #include <sys/sysctl.h>
69 
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif
73 
74 #include <vm/uma.h>
75 #include <vm/vm.h>
76 
77 #include <net/if.h>
78 #include <net/if_var.h>
79 #include <net/if_private.h>
80 #include <net/if_types.h>
81 #include <net/if_llatbl.h>
82 #include <net/route.h>
83 #include <net/rss_config.h>
84 #include <net/vnet.h>
85 
86 #if defined(INET) || defined(INET6)
87 #include <netinet/in.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/in_pcb_var.h>
90 #include <netinet/tcp.h>
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #include <netinet/in_fib.h>
94 #endif
95 #include <netinet/ip_var.h>
96 #ifdef INET6
97 #include <netinet/ip6.h>
98 #include <netinet6/in6_pcb.h>
99 #include <netinet6/in6_var.h>
100 #include <netinet6/ip6_var.h>
101 #endif /* INET6 */
102 #include <net/route/nhop.h>
103 #endif
104 
105 #include <netipsec/ipsec_support.h>
106 
107 #include <security/mac/mac_framework.h>
108 
/* Bounds on a SO_REUSEPORT_LB group's inline member array. */
#define	INPCBLBGROUP_SIZMIN	8
#define	INPCBLBGROUP_SIZMAX	256

#define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
#define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Enable random ephemeral port allocation by default. */
VNET_DEFINE(int, ipport_randomized) = 1;
136 
#ifdef INET
/* Forward declaration: locked exact/wildcard 4-tuple lookup (defined below). */
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
			    int lookupflags, uint8_t numa_domain, int fib);

/* Clamp a sysctl-set port range variable into [min, max]. */
#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }
146 
147 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
149 {
150 	int error;
151 
152 	error = sysctl_handle_int(oidp, arg1, arg2, req);
153 	if (error == 0) {
154 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
155 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
156 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
157 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
158 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
159 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
160 	}
161 	return (error);
162 }
163 
164 #undef RANGECHK
165 
/* sysctl subtree: net.inet.ip.portrange.* */
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

/*
 * Every range knob funnels through sysctl_net_ipport_check() so values
 * are clamped into their legal windows immediately after being set.
 */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");
/* The reserved-window knobs are plain ints; no clamping handler. */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
	&VNET_NAME(ipport_reservedhigh), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
	CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
202 
#ifdef RATELIMIT
/*
 * Global (non-VNET) statistics for hardware rate-limited connections,
 * exported under net.inet.ip.rl.
 */
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
   &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
   &rate_limit_set_ok, "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
   &rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
   &rate_limit_chg, "Total Rate limited change attempts");
#endif /* RATELIMIT */

#endif /* INET */
225 
/* Per-VNET seed mixed into all inpcb hash computations. */
VNET_DEFINE(uint32_t, in_pcbhashseed);

/* Seed the hash with random bits early in VNET bring-up. */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);

#ifdef INET
/* Off by default: connect(2) to INADDR_ANY/INADDR_BROADCAST is rejected. */
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
#define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
    "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif

/* Forward declaration: removal from the connection hash (defined below). */
static void in_pcbremhash(struct inpcb *);
245 
246 /*
247  * in_pcb.c: manage the Protocol Control Blocks.
248  *
249  * NOTE: It is assumed that most of these functions will be called with
250  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
251  * functions often modify hash chains or addresses in pcbs.
252  */
253 
254 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred * cred,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain,int fib)255 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
256     const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
257 {
258 	struct inpcblbgroup *grp;
259 	size_t bytes;
260 
261 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
262 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
263 	if (grp == NULL)
264 		return (NULL);
265 	LIST_INIT(&grp->il_pending);
266 	grp->il_cred = crhold(cred);
267 	grp->il_vflag = vflag;
268 	grp->il_lport = port;
269 	grp->il_numa_domain = numa_domain;
270 	grp->il_fibnum = fib;
271 	grp->il_dependladdr = *addr;
272 	grp->il_inpsiz = size;
273 	return (grp);
274 }
275 
/*
 * Deferred destructor for a load balance group, run from the network
 * epoch once no lockless readers can still reference the group.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	crfree(grp->il_cred);	/* drop the reference taken at alloc time */
	free(grp, M_PCB);
}
285 
/*
 * Unlink an (empty) load balance group from its hash chain and schedule
 * its memory for epoch-deferred reclamation.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
295 
/*
 * Find the load balance group that 'inp' belongs to, searching both the
 * active member array and the pending (pre-listen) list of every group
 * on inp's port hash chain.
 *
 * Returns the group, or NULL when the pcb is in none: the outer loop
 * then terminates with grp == NULL, which is what "found:" returns.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		/* Active members, visible to lbgroup lookups. */
		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		/* Members still waiting for listen(). */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	return (grp);
}
325 
/*
 * Add 'inp' to load balance group 'grp'.  TCP sockets that have not yet
 * called listen() go onto the pending list, invisible to lookups; all
 * others are appended to the active array.  The caller guarantees there
 * is a free slot (see the KASSERT).
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		/* Store the pointer before publishing the new count. */
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
354 
/*
 * Replace 'old_grp' with a new group of capacity 'size': copy the active
 * members, take over the pending list, link the new group in and free
 * the old one.  Returns the new group, or NULL on allocation failure,
 * in which case old_grp is left fully intact.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	/* Transfer the pending (pre-listen) members wholesale. */
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
383 
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Finds an existing group matching the pcb's jail, vflag, port, local
 * address, numa domain and FIB — creating or growing one as needed —
 * and inserts the pcb.  Returns 0 on success (including the "group at
 * hard limit" case, which only logs) or ENOMEM on allocation failure.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };	/* log rate limit */
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* FIB-bound sockets only group with sockets of the same FIB. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* Hard cap reached: log (rate-limited), not an error. */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
458 
/*
 * Remove PCB from load balance group.
 *
 * The pcb must currently be a member (INP_INLBGROUP set); both the
 * active array and the pending list of each group on its chain are
 * searched.  An empty group is freed.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Plug the hole with the last active member. */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				grp->il_pendcnt--;
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP was set, so the pcb must have been found above. */
	__assert_unreachable();
}
512 
/*
 * Re-home a pcb's load balance group membership to the NUMA domain
 * selected by 'arg' (a TCP_REUSPORT_LB_NUMA_* constant or an explicit
 * domain number).
 *
 * Returns 0 on success, EINVAL for an out-of-range domain, or ENOENT
 * when the pcb is not in any load balance group.
 */
int
in_pcblbgroup_numa(struct inpcb *inp, int arg)
{
	struct inpcbinfo *pcbinfo;
	int error;
	uint8_t numa_domain;

	switch (arg) {
	case TCP_REUSPORT_LB_NUMA_NODOM:
		numa_domain = M_NODOM;		/* no domain affinity */
		break;
	case TCP_REUSPORT_LB_NUMA_CURDOM:
		numa_domain = PCPU_GET(domain);	/* current CPU's domain */
		break;
	default:
		if (arg < 0 || arg >= vm_ndomains)
			return (EINVAL);
		numa_domain = arg;
	}

	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(pcbinfo);
	if (in_pcblbgroup_find(inp) != NULL) {
		/* Remove it from the old group. */
		in_pcbremlbgrouphash(inp);
		/* Add it to the new group based on numa domain. */
		in_pcbinslbgrouphash(inp, numa_domain);
		error = 0;
	} else {
		error = ENOENT;
	}
	INP_HASH_WUNLOCK(pcbinfo);
	return (error);
}
548 
/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));

/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/* Exact (connected) and wildcard (listen) tables share one mask. */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* No point in more port hash buckets than there are ports. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	/* The zone's SMR section protects lockless pcb lookups. */
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
580 
/*
 * Destroy an inpcbinfo.  All pcbs must already have been freed
 * (asserted below).
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
600 
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 */
static void inpcb_fini(void *, int);	/* zone fini hook, defined below */
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/* SMR-enabled zone: lookups may inspect pcbs without locks. */
	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
}
614 
615 /*
616  * Destroy a pcbstorage - used by unloadable protocols.
617  */
618 void
in_pcbstorage_destroy(void * arg)619 in_pcbstorage_destroy(void *arg)
620 {
621 	struct inpcbstorage *pcbstor = arg;
622 
623 	uma_zdestroy(pcbstor->ips_zone);
624 }
625 
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns ENOBUFS if the zone allocation fails, or the MAC/IPsec
 * initialization error when those subsystems are compiled in.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Clear everything past the zone-preserved prefix of the pcb. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;	/* dual-stack socket */
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
712 
713 #ifdef INET
/*
 * Bind the pcb to the address/port in 'sin' (an ephemeral port is
 * chosen when sin is NULL or carries port 0), then insert it into the
 * hash tables, rolling the binding back if insertion fails.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Rebinding an already-bound pcb is not allowed. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error)
		return (error);
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		/* Hash insertion can only fail for SO_REUSEPORT_LB sockets. */
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
745 #endif
746 
747 #if defined(INET) || defined(INET6)
748 /*
749  * Assign a local port like in_pcb_lport(), but also used with connect()
750  * and a foreign address and port.  If fsa is non-NULL, choose a local port
751  * that is unused with those, otherwise one that is completely unused.
752  * lsa can be NULL for IPv6.
753  */
754 int
in_pcb_lport_dest(const struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)755 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
756     u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
757     int lookupflags)
758 {
759 	struct inpcbinfo *pcbinfo;
760 	struct inpcb *tmpinp;
761 	unsigned short *lastport;
762 	int count, error;
763 	u_short aux, first, last, lport;
764 #ifdef INET
765 	struct in_addr laddr, faddr;
766 #endif
767 #ifdef INET6
768 	struct in6_addr *laddr6, *faddr6;
769 #endif
770 
771 	pcbinfo = inp->inp_pcbinfo;
772 
773 	/*
774 	 * Because no actual state changes occur here, a global write lock on
775 	 * the pcbinfo isn't required.
776 	 */
777 	INP_LOCK_ASSERT(inp);
778 	INP_HASH_LOCK_ASSERT(pcbinfo);
779 
780 	if (inp->inp_flags & INP_HIGHPORT) {
781 		first = V_ipport_hifirstauto;	/* sysctl */
782 		last  = V_ipport_hilastauto;
783 		lastport = &pcbinfo->ipi_lasthi;
784 	} else if (inp->inp_flags & INP_LOWPORT) {
785 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
786 		if (error)
787 			return (error);
788 		first = V_ipport_lowfirstauto;	/* 1023 */
789 		last  = V_ipport_lowlastauto;	/* 600 */
790 		lastport = &pcbinfo->ipi_lastlow;
791 	} else {
792 		first = V_ipport_firstauto;	/* sysctl */
793 		last  = V_ipport_lastauto;
794 		lastport = &pcbinfo->ipi_lastport;
795 	}
796 
797 	/*
798 	 * Instead of having two loops further down counting up or down
799 	 * make sure that first is always <= last and go with only one
800 	 * code path implementing all logic.
801 	 */
802 	if (first > last) {
803 		aux = first;
804 		first = last;
805 		last = aux;
806 	}
807 
808 #ifdef INET
809 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
810 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
811 		if (lsa != NULL)
812 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
813 		if (fsa != NULL)
814 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
815 	}
816 #endif
817 #ifdef INET6
818 	laddr6 = NULL;
819 	if ((inp->inp_vflag & INP_IPV6) != 0) {
820 		if (lsa != NULL)
821 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
822 		if (fsa != NULL)
823 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
824 	}
825 #endif
826 
827 	tmpinp = NULL;
828 
829 	if (V_ipport_randomized)
830 		*lastport = first + (arc4random() % (last - first));
831 
832 	count = last - first;
833 
834 	do {
835 		if (count-- < 0)	/* completely used? */
836 			return (EADDRNOTAVAIL);
837 		++*lastport;
838 		if (*lastport < first || *lastport > last)
839 			*lastport = first;
840 		lport = htons(*lastport);
841 
842 		if (fsa != NULL) {
843 #ifdef INET
844 			if (lsa->sa_family == AF_INET) {
845 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
846 				    faddr, fport, laddr, lport, lookupflags,
847 				    M_NODOM, RT_ALL_FIBS);
848 			}
849 #endif
850 #ifdef INET6
851 			if (lsa->sa_family == AF_INET6) {
852 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
853 				    faddr6, fport, laddr6, lport, lookupflags,
854 				    M_NODOM, RT_ALL_FIBS);
855 			}
856 #endif
857 		} else {
858 #ifdef INET6
859 			if ((inp->inp_vflag & INP_IPV6) != 0) {
860 				tmpinp = in6_pcblookup_local(pcbinfo,
861 				    &inp->in6p_laddr, lport, RT_ALL_FIBS,
862 				    lookupflags, cred);
863 #ifdef INET
864 				if (tmpinp == NULL &&
865 				    (inp->inp_vflag & INP_IPV4))
866 					tmpinp = in_pcblookup_local(pcbinfo,
867 					    laddr, lport, RT_ALL_FIBS,
868 					    lookupflags, cred);
869 #endif
870 			}
871 #endif
872 #if defined(INET) && defined(INET6)
873 			else
874 #endif
875 #ifdef INET
876 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
877 				    lport, RT_ALL_FIBS, lookupflags, cred);
878 #endif
879 		}
880 	} while (tmpinp != NULL);
881 
882 	*lportp = lport;
883 
884 	return (0);
885 }
886 
887 /*
888  * Select a local port (number) to use.
889  */
890 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)891 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
892     struct ucred *cred, int lookupflags)
893 {
894 	struct sockaddr_in laddr;
895 
896 	if (laddrp) {
897 		bzero(&laddr, sizeof(laddr));
898 		laddr.sin_family = AF_INET;
899 		laddr.sin_addr = *laddrp;
900 	}
901 	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
902 	    NULL, lportp, NULL, 0, cred, lookupflags));
903 }
904 #endif /* INET || INET6 */
905 
906 #ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * Enforces: the multicast SO_REUSEADDR==SO_REUSEPORT equivalence, the
 * "address must be local unless INP_BINDANY" rule, reserved-port
 * privilege, and the restriction on sharing a port with another user's
 * socket.  Returns 0 when the binding is permitted, otherwise
 * EADDRNOTAVAIL, EACCES or EADDRINUSE.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved window requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			     in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/* A v4 and a v6 wildcard may coexist on one port. */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
999 
1000 /*
1001  * Set up a bind operation on a PCB, performing port allocation
1002  * as required, but do not actually modify the PCB. Callers can
1003  * either complete the bind by setting inp_laddr/inp_lport and
1004  * calling in_pcbinshash(), or they can just use the resulting
1005  * port and address to authorise the sending of a once-off packet.
1006  *
1007  * On error, the values of *laddrp and *lportp are not changed.
1008  */
1009 int
in_pcbbind_setup(struct inpcb * inp,struct sockaddr_in * sin,in_addr_t * laddrp,u_short * lportp,int flags,struct ucred * cred)1010 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
1011     u_short *lportp, int flags, struct ucred *cred)
1012 {
1013 	struct socket *so = inp->inp_socket;
1014 	struct in_addr laddr;
1015 	u_short lport = 0;
1016 	int error, fib, lookupflags, sooptions;
1017 
1018 	/*
1019 	 * No state changes, so read locks are sufficient here.
1020 	 */
1021 	INP_LOCK_ASSERT(inp);
1022 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1023 
1024 	laddr.s_addr = *laddrp;
1025 	if (sin != NULL && laddr.s_addr != INADDR_ANY)
1026 		return (EINVAL);
1027 
1028 	lookupflags = 0;
1029 	sooptions = atomic_load_int(&so->so_options);
1030 	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
1031 		lookupflags = INPLOOKUP_WILDCARD;
1032 	if (sin == NULL) {
1033 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
1034 			return (error);
1035 	} else {
1036 		KASSERT(sin->sin_family == AF_INET,
1037 		    ("%s: invalid family for address %p", __func__, sin));
1038 		KASSERT(sin->sin_len == sizeof(*sin),
1039 		    ("%s: invalid length for address %p", __func__, sin));
1040 
1041 		error = prison_local_ip4(cred, &sin->sin_addr);
1042 		if (error)
1043 			return (error);
1044 		if (sin->sin_port != *lportp) {
1045 			/* Don't allow the port to change. */
1046 			if (*lportp != 0)
1047 				return (EINVAL);
1048 			lport = sin->sin_port;
1049 		}
1050 		laddr = sin->sin_addr;
1051 
1052 		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
1053 		    RT_ALL_FIBS;
1054 
1055 		/* See if this address/port combo is available. */
1056 		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
1057 		    lookupflags, cred);
1058 		if (error != 0)
1059 			return (error);
1060 	}
1061 	if (*lportp != 0)
1062 		lport = *lportp;
1063 	if (lport == 0) {
1064 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1065 		if (error != 0)
1066 			return (error);
1067 	}
1068 	*laddrp = laddr.s_addr;
1069 	*lportp = lport;
1070 	if ((flags & INPBIND_FIB) != 0)
1071 		inp->inp_flags |= INP_BOUNDFIB;
1072 	return (0);
1073 }
1074 
1075 /*
1076  * Connect from a socket to a specified address.
1077  * Both address and port must be specified in argument sin.
1078  * If don't have a local address for this socket yet,
1079  * then pick one.
1080  */
1081 int
in_pcbconnect(struct inpcb * inp,struct sockaddr_in * sin,struct ucred * cred)1082 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
1083 {
1084 	struct in_addr laddr, faddr;
1085 	u_short lport;
1086 	int error;
1087 	bool anonport;
1088 
1089 	INP_WLOCK_ASSERT(inp);
1090 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1091 	KASSERT(in_nullhost(inp->inp_faddr),
1092 	    ("%s: inp is already connected", __func__));
1093 	KASSERT(sin->sin_family == AF_INET,
1094 	    ("%s: invalid address family for %p", __func__, sin));
1095 	KASSERT(sin->sin_len == sizeof(*sin),
1096 	    ("%s: invalid address length for %p", __func__, sin));
1097 
1098 	if (sin->sin_port == 0)
1099 		return (EADDRNOTAVAIL);
1100 
1101 	anonport = (inp->inp_lport == 0);
1102 
1103 	if (__predict_false(in_broadcast(sin->sin_addr))) {
1104 		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
1105 			return (ENETUNREACH);
1106 		/*
1107 		 * If the destination address is INADDR_ANY, use the primary
1108 		 * local address.  If the supplied address is INADDR_BROADCAST,
1109 		 * and the primary interface supports broadcast, choose the
1110 		 * broadcast address for that interface.
1111 		 */
1112 		if (in_nullhost(sin->sin_addr)) {
1113 			faddr =
1114 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1115 			if ((error = prison_get_ip4(cred, &faddr)) != 0)
1116 				return (error);
1117 		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
1118 		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
1119 		    & IFF_BROADCAST) {
1120 			faddr = satosin(&CK_STAILQ_FIRST(
1121 			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1122 		} else
1123 			faddr = sin->sin_addr;
1124 	} else
1125 		faddr = sin->sin_addr;
1126 
1127 	if (in_nullhost(inp->inp_laddr)) {
1128 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
1129 		if (error)
1130 			return (error);
1131 	} else
1132 		laddr = inp->inp_laddr;
1133 
1134 	if (anonport) {
1135 		struct sockaddr_in lsin = {
1136 			.sin_family = AF_INET,
1137 			.sin_addr = laddr,
1138 		};
1139 		struct sockaddr_in fsin = {
1140 			.sin_family = AF_INET,
1141 			.sin_addr = faddr,
1142 		};
1143 
1144 		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
1145 		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
1146 		    INPLOOKUP_WILDCARD);
1147 		if (error)
1148 			return (error);
1149 	} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1150 	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
1151 	    NULL)
1152 		return (EADDRINUSE);
1153 	else
1154 		lport = inp->inp_lport;
1155 
1156 	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
1157 	    !(inp->inp_flags & INP_INHASHLIST));
1158 
1159 	inp->inp_faddr = faddr;
1160 	inp->inp_fport = sin->sin_port;
1161 	inp->inp_laddr = laddr;
1162 	inp->inp_lport = lport;
1163 
1164 	if ((inp->inp_flags & INP_INHASHLIST) == 0) {
1165 		error = in_pcbinshash(inp);
1166 		MPASS(error == 0);
1167 	} else
1168 		in_pcbrehash(inp);
1169 #ifdef ROUTE_MPATH
1170 	if (CALC_FLOWID_OUTBOUND) {
1171 		uint32_t hash_val, hash_type;
1172 
1173 		hash_val = fib4_calc_software_hash(inp->inp_laddr,
1174 		    inp->inp_faddr, 0, sin->sin_port,
1175 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
1176 
1177 		inp->inp_flowid = hash_val;
1178 		inp->inp_flowtype = hash_type;
1179 	}
1180 #endif
1181 	if (anonport)
1182 		inp->inp_flags |= INP_ANONPORT;
1183 	return (0);
1184 }
1185 
1186 /*
1187  * Do proper source address selection on an unbound socket in case
1188  * of connect. Take jails into account as well.
1189  */
1190 int
in_pcbladdr(const struct inpcb * inp,struct in_addr * faddr,struct in_addr * laddr,struct ucred * cred)1191 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
1192     struct in_addr *laddr, struct ucred *cred)
1193 {
1194 	struct ifaddr *ifa;
1195 	struct sockaddr *sa;
1196 	struct sockaddr_in *sin, dst;
1197 	struct nhop_object *nh;
1198 	int error;
1199 
1200 	NET_EPOCH_ASSERT();
1201 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1202 
1203 	/*
1204 	 * Bypass source address selection and use the primary jail IP
1205 	 * if requested.
1206 	 */
1207 	if (!prison_saddrsel_ip4(cred, laddr))
1208 		return (0);
1209 
1210 	/*
1211 	 * If the destination address is multicast and an outgoing
1212 	 * interface has been set as a multicast option, prefer the
1213 	 * address of that interface as our source address.
1214 	 */
1215 	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
1216 	    inp->inp_moptions->imo_multicast_ifp != NULL) {
1217 		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
1218 		struct in_ifaddr *ia;
1219 
1220 		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1221 			if (ia->ia_ifp == ifp &&
1222 			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
1223 				break;
1224 		}
1225 		if (ia == NULL)
1226 			return (EADDRNOTAVAIL);
1227 		*laddr = ia->ia_addr.sin_addr;
1228 		return (0);
1229 	}
1230 
1231 	error = 0;
1232 
1233 	nh = NULL;
1234 	bzero(&dst, sizeof(dst));
1235 	sin = &dst;
1236 	sin->sin_family = AF_INET;
1237 	sin->sin_len = sizeof(struct sockaddr_in);
1238 	sin->sin_addr.s_addr = faddr->s_addr;
1239 
1240 	/*
1241 	 * If route is known our src addr is taken from the i/f,
1242 	 * else punt.
1243 	 *
1244 	 * Find out route to destination.
1245 	 */
1246 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1247 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
1248 		    0, NHR_NONE, 0);
1249 
1250 	/*
1251 	 * If we found a route, use the address corresponding to
1252 	 * the outgoing interface.
1253 	 *
1254 	 * Otherwise assume faddr is reachable on a directly connected
1255 	 * network and try to find a corresponding interface to take
1256 	 * the source address from.
1257 	 */
1258 	if (nh == NULL || nh->nh_ifp == NULL) {
1259 		struct in_ifaddr *ia;
1260 		struct ifnet *ifp;
1261 
1262 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1263 					inp->inp_socket->so_fibnum));
1264 		if (ia == NULL) {
1265 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1266 						inp->inp_socket->so_fibnum));
1267 		}
1268 		if (ia == NULL) {
1269 			error = ENETUNREACH;
1270 			goto done;
1271 		}
1272 
1273 		if (!prison_flag(cred, PR_IP4)) {
1274 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1275 			goto done;
1276 		}
1277 
1278 		ifp = ia->ia_ifp;
1279 		ia = NULL;
1280 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1281 			sa = ifa->ifa_addr;
1282 			if (sa->sa_family != AF_INET)
1283 				continue;
1284 			sin = (struct sockaddr_in *)sa;
1285 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1286 				ia = (struct in_ifaddr *)ifa;
1287 				break;
1288 			}
1289 		}
1290 		if (ia != NULL) {
1291 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1292 			goto done;
1293 		}
1294 
1295 		/* 3. As a last resort return the 'default' jail address. */
1296 		error = prison_get_ip4(cred, laddr);
1297 		goto done;
1298 	}
1299 
1300 	/*
1301 	 * If the outgoing interface on the route found is not
1302 	 * a loopback interface, use the address from that interface.
1303 	 * In case of jails do those three steps:
1304 	 * 1. check if the interface address belongs to the jail. If so use it.
1305 	 * 2. check if we have any address on the outgoing interface
1306 	 *    belonging to this jail. If so use it.
1307 	 * 3. as a last resort return the 'default' jail address.
1308 	 */
1309 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
1310 		struct in_ifaddr *ia;
1311 		struct ifnet *ifp;
1312 
1313 		/* If not jailed, use the default returned. */
1314 		if (!prison_flag(cred, PR_IP4)) {
1315 			ia = (struct in_ifaddr *)nh->nh_ifa;
1316 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1317 			goto done;
1318 		}
1319 
1320 		/* Jailed. */
1321 		/* 1. Check if the iface address belongs to the jail. */
1322 		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
1323 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1324 			ia = (struct in_ifaddr *)nh->nh_ifa;
1325 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1326 			goto done;
1327 		}
1328 
1329 		/*
1330 		 * 2. Check if we have any address on the outgoing interface
1331 		 *    belonging to this jail.
1332 		 */
1333 		ia = NULL;
1334 		ifp = nh->nh_ifp;
1335 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1336 			sa = ifa->ifa_addr;
1337 			if (sa->sa_family != AF_INET)
1338 				continue;
1339 			sin = (struct sockaddr_in *)sa;
1340 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1341 				ia = (struct in_ifaddr *)ifa;
1342 				break;
1343 			}
1344 		}
1345 		if (ia != NULL) {
1346 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1347 			goto done;
1348 		}
1349 
1350 		/* 3. As a last resort return the 'default' jail address. */
1351 		error = prison_get_ip4(cred, laddr);
1352 		goto done;
1353 	}
1354 
1355 	/*
1356 	 * The outgoing interface is marked with 'loopback net', so a route
1357 	 * to ourselves is here.
1358 	 * Try to find the interface of the destination address and then
1359 	 * take the address from there. That interface is not necessarily
1360 	 * a loopback interface.
1361 	 * In case of jails, check that it is an address of the jail
1362 	 * and if we cannot find, fall back to the 'default' jail address.
1363 	 */
1364 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
1365 		struct in_ifaddr *ia;
1366 
1367 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
1368 					inp->inp_socket->so_fibnum));
1369 		if (ia == NULL)
1370 			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
1371 						inp->inp_socket->so_fibnum));
1372 		if (ia == NULL)
1373 			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
1374 
1375 		if (!prison_flag(cred, PR_IP4)) {
1376 			if (ia == NULL) {
1377 				error = ENETUNREACH;
1378 				goto done;
1379 			}
1380 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1381 			goto done;
1382 		}
1383 
1384 		/* Jailed. */
1385 		if (ia != NULL) {
1386 			struct ifnet *ifp;
1387 
1388 			ifp = ia->ia_ifp;
1389 			ia = NULL;
1390 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1391 				sa = ifa->ifa_addr;
1392 				if (sa->sa_family != AF_INET)
1393 					continue;
1394 				sin = (struct sockaddr_in *)sa;
1395 				if (prison_check_ip4(cred,
1396 				    &sin->sin_addr) == 0) {
1397 					ia = (struct in_ifaddr *)ifa;
1398 					break;
1399 				}
1400 			}
1401 			if (ia != NULL) {
1402 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1403 				goto done;
1404 			}
1405 		}
1406 
1407 		/* 3. As a last resort return the 'default' jail address. */
1408 		error = prison_get_ip4(cred, laddr);
1409 		goto done;
1410 	}
1411 
1412 done:
1413 	if (error == 0 && laddr->s_addr == INADDR_ANY)
1414 		return (EHOSTUNREACH);
1415 	return (error);
1416 }
1417 
/*
 * Undo a connect: remove the pcb from the connection hash and clear
 * its foreign address/port and local address.  The SMR sequence is
 * advanced so that lockless lookups still in flight can detect that
 * the pcb left the hash (see the comment in in_pcbinshash()).
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1435 #endif /* INET */
1436 
/*
 * Move a pcb that enters the listening state from the load-balancing
 * group's pending list to its active list, by removing and re-inserting
 * it under the hash lock.  No-op unless the pcb is in a lb group.
 */
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcblbgroup *grp;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
		struct inpcbinfo *pcbinfo;

		pcbinfo = inp->inp_pcbinfo;
		INP_HASH_WLOCK(pcbinfo);
		grp = in_pcblbgroup_find(inp);
		/* Re-insert so the group now counts this pcb as active. */
		LIST_REMOVE(inp, inp_lbgroup_list);
		grp->il_pendcnt--;
		in_pcblbgroup_insert(grp, inp);
		INP_HASH_WUNLOCK(pcbinfo);
	}
}
1456 
1457 /*
1458  * inpcb hash lookups are protected by SMR section.
1459  *
1460  * Once desired pcb has been found, switching from SMR section to a pcb
1461  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1462  * here because SMR is a critical section.
1463  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1464  */
1465 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1466 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1467 {
1468 
1469 	lock == INPLOOKUP_RLOCKPCB ?
1470 	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1471 }
1472 
1473 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1474 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1475 {
1476 
1477 	lock == INPLOOKUP_RLOCKPCB ?
1478 	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1479 }
1480 
1481 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1482 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1483 {
1484 
1485 	return (lock == INPLOOKUP_RLOCKPCB ?
1486 	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1487 }
1488 
/*
 * Transition from the SMR section to holding the pcb lock, skipping
 * pcbs whose inp_flags intersect ignflags.  Returns true with the lock
 * held and the SMR section exited, or false with neither held.
 * Called only from within an SMR section.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: lock acquired without sleeping inside SMR. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference so we may leave the
	 * SMR critical section and block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		/* Our release of the reference may have freed the pcb. */
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * inp acquired through refcount & lock for sure didn't went
		 * through uma_zfree().  However, it may have already went
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Refcount already zero: pcb is being freed, give up. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1527 
1528 bool
inp_smr_lock(struct inpcb * inp,const inp_lookup_t lock)1529 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1530 {
1531 
1532 	/*
1533 	 * in_pcblookup() family of functions ignore not only freed entries,
1534 	 * that may be found due to lockless access to the hash, but dropped
1535 	 * entries, too.
1536 	 */
1537 	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
1538 }
1539 
1540 /*
1541  * inp_next() - inpcb hash/list traversal iterator
1542  *
1543  * Requires initialized struct inpcb_iterator for context.
1544  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1545  *
1546  * - Iterator can have either write-lock or read-lock semantics, that can not
1547  *   be changed later.
1548  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1549  *   a single hash slot.  Note: only rip_input() does the latter.
1550  * - Iterator may have optional bool matching function.  The matching function
1551  *   will be executed for each inpcb in the SMR context, so it can not acquire
1552  *   locks and can safely access only immutable fields of inpcb.
1553  *
1554  * A fresh initialized iterator has NULL inpcb in its context and that
1555  * means that inp_next() call would return the very first inpcb on the list
1556  * locked with desired semantic.  In all following calls the context pointer
1557  * shall hold the current inpcb pointer.  The KPI user is not supposed to
1558  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
1559  * and write NULL to its context.  After end of traversal an iterator can be
1560  * reused.
1561  *
1562  * List traversals have the following features/constraints:
1563  * - New entries won't be seen, as they are always added to the head of a list.
1564  * - Removed entries won't stop traversal as long as they are not added to
1565  *   a different list. This is violated by in_pcbrehash().
1566  */
/* First element of either the all-pcbs list or a single hash chain. */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of inp on whichever list the iterator walks. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert inp is held in the mode the iterator was created with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
/*
 * Return the next matching pcb, locked, releasing the previously
 * returned one; NULL at end of traversal.  See the block comment
 * above for the full iterator contract.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited SMR;
				 * re-enter and restart from the list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand-over-hand: release the previous pcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1677 
1678 /*
1679  * in_pcbref() bumps the reference count on an inpcb in order to maintain
1680  * stability of an inpcb pointer despite the inpcb lock being released or
1681  * SMR section exited.
1682  *
1683  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1684  */
1685 void
in_pcbref(struct inpcb * inp)1686 in_pcbref(struct inpcb *inp)
1687 {
1688 	u_int old __diagused;
1689 
1690 	old = refcount_acquire(&inp->inp_refcount);
1691 	KASSERT(old > 0, ("%s: refcount 0", __func__));
1692 }
1693 
1694 /*
1695  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1696  * freeing the pcb, if the reference was very last.
1697  */
/*
 * Release a reference while holding the pcb read lock.  If this was the
 * last reference, unlock and free the pcb and return true; otherwise
 * return false with the read lock still held.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() already detached the socket. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Must unlock before the zone free reclaims the lock's memory. */
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1717 
/*
 * Write-locked counterpart of in_pcbrele_rlocked(): drop a reference,
 * freeing the pcb (and releasing the lock) when it was the last one.
 * Returns true iff the pcb was freed.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() already detached the socket. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Must unlock before the zone free reclaims the lock's memory. */
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1737 
1738 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1739 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1740 {
1741 
1742 	return (lock == INPLOOKUP_RLOCKPCB ?
1743 	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1744 }
1745 
1746 /*
1747  * Dereference and rlock inp, for which the caller must own the
1748  * reference.  Returns true if inp no longer usable, false otherwise.
1749  */
/*
 * Dereference and rlock inp, for which the caller must own the
 * reference.  Returns true if inp no longer usable, false otherwise.
 */
bool
in_pcbrele_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
	/* Dropping our reference may free the pcb entirely. */
	if (in_pcbrele_rlocked(inp))
		return (true);
	/* Still alive but already freed from the protocol's viewpoint. */
	if ((inp->inp_flags & INP_FREED) != 0) {
		INP_RUNLOCK(inp);
		return (true);
	}
	return (false);
}
1762 
1763 /*
1764  * Unconditionally schedule an inpcb to be freed by decrementing its
1765  * reference count, which should occur only after the inpcb has been detached
1766  * from its socket.  If another thread holds a temporary reference (acquired
1767  * using in_pcbref()) then the free is deferred until that reference is
1768  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1769  *  Almost all work, including removal from global lists, is done in this
1770  * context, where the pcbinfo lock is held.
1771  */
1772 void
in_pcbfree(struct inpcb * inp)1773 in_pcbfree(struct inpcb *inp)
1774 {
1775 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1776 #ifdef INET
1777 	struct ip_moptions *imo;
1778 #endif
1779 #ifdef INET6
1780 	struct ip6_moptions *im6o;
1781 #endif
1782 
1783 	INP_WLOCK_ASSERT(inp);
1784 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1785 	KASSERT((inp->inp_flags & INP_FREED) == 0,
1786 	    ("%s: called twice for pcb %p", __func__, inp));
1787 
1788 	/*
1789 	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
1790 	 * from the hash without acquiring inpcb lock, they rely on the hash
1791 	 * lock, thus in_pcbremhash() should be the first action.
1792 	 */
1793 	if (inp->inp_flags & INP_INHASHLIST)
1794 		in_pcbremhash(inp);
1795 	INP_INFO_WLOCK(pcbinfo);
1796 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1797 	pcbinfo->ipi_count--;
1798 	CK_LIST_REMOVE(inp, inp_list);
1799 	INP_INFO_WUNLOCK(pcbinfo);
1800 
1801 #ifdef RATELIMIT
1802 	if (inp->inp_snd_tag != NULL)
1803 		in_pcbdetach_txrtlmt(inp);
1804 #endif
1805 	inp->inp_flags |= INP_FREED;
1806 	inp->inp_socket->so_pcb = NULL;
1807 	inp->inp_socket = NULL;
1808 
1809 	RO_INVALIDATE_CACHE(&inp->inp_route);
1810 #ifdef MAC
1811 	mac_inpcb_destroy(inp);
1812 #endif
1813 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1814 	if (inp->inp_sp != NULL)
1815 		ipsec_delete_pcbpolicy(inp);
1816 #endif
1817 #ifdef INET
1818 	if (inp->inp_options)
1819 		(void)m_free(inp->inp_options);
1820 	DEBUG_POISON_POINTER(inp->inp_options);
1821 	imo = inp->inp_moptions;
1822 	DEBUG_POISON_POINTER(inp->inp_moptions);
1823 #endif
1824 #ifdef INET6
1825 	if (inp->inp_vflag & INP_IPV6PROTO) {
1826 		ip6_freepcbopts(inp->in6p_outputopts);
1827 		DEBUG_POISON_POINTER(inp->in6p_outputopts);
1828 		im6o = inp->in6p_moptions;
1829 		DEBUG_POISON_POINTER(inp->in6p_moptions);
1830 	} else
1831 		im6o = NULL;
1832 #endif
1833 
1834 	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
1835 		INP_WUNLOCK(inp);
1836 	}
1837 #ifdef INET6
1838 	ip6_freemoptions(im6o);
1839 #endif
1840 #ifdef INET
1841 	inp_freemoptions(imo);
1842 #endif
1843 }
1844 
1845 /*
1846  * Different protocols initialize their inpcbs differently - giving
1847  * different name to the lock.  But they all are disposed the same.
1848  */
/*
 * UMA zone fini callback: tear down the per-pcb lock before the memory
 * is returned to the allocator.
 */
static void
inpcb_fini(void *mem, int size)
{
	INP_LOCK_DESTROY((struct inpcb *)mem);
}
1856 
1857 /*
1858  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1859  * port reservation, and preventing it from being returned by inpcb lookups.
1860  *
1861  * It is used by TCP to mark an inpcb as unused and avoid future packet
1862  * delivery or event notification when a socket remains open but TCP has
1863  * closed.  This might occur as a result of a shutdown()-initiated TCP close
1864  * or a RST on the wire, and allows the port binding to be reused while still
1865  * maintaining the invariant that so_pcb always points to a valid inpcb until
1866  * in_pcbdetach().
1867  *
1868  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1869  * in_pcbpurgeif0()?
1870  */
/*
 * Mark the pcb as dropped and remove it from the lookup hash, releasing
 * its address/port reservation.  See the block comment above for the
 * full rationale.  Requires the inpcb write lock.
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1881 
1882 #ifdef INET
1883 /*
1884  * Common routines to return the socket addresses associated with inpcbs.
1885  */
1886 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1887 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1888 {
1889 	struct inpcb *inp;
1890 
1891 	inp = sotoinpcb(so);
1892 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1893 
1894 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1895 		.sin_len = sizeof(struct sockaddr_in),
1896 		.sin_family = AF_INET,
1897 		.sin_port = inp->inp_lport,
1898 		.sin_addr = inp->inp_laddr,
1899 	};
1900 
1901 	return (0);
1902 }
1903 
1904 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1905 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1906 {
1907 	struct inpcb *inp;
1908 
1909 	inp = sotoinpcb(so);
1910 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1911 
1912 	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
1913 		.sin_len = sizeof(struct sockaddr_in),
1914 		.sin_family = AF_INET,
1915 		.sin_port = inp->inp_fport,
1916 		.sin_addr = inp->inp_faddr,
1917 	};
1918 
1919 	return (0);
1920 }
1921 
1922 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1923 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1924 {
1925 
1926 	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1927 		return (true);
1928 	else
1929 		return (false);
1930 }
1931 
/*
 * Purge multicast state referencing interface 'ifp' from every IPv4 PCB
 * in 'pcbinfo' that carries multicast options (see inp_v4_multi_match).
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			/*
			 * The list is modified by the removal below, so
			 * rescan from the head rather than continuing
			 * the (now stale) iteration.
			 */
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
1974 
/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
 */
/* Extra wildcard-match cost charged to PCBs carrying the INP_IPV6 flag. */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found a candidate; accept it only if the
				 * jails match.
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.  A lower wildcard score means a more specific
		 * match; a score of 0 is a perfect match.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			/* Keep the most specific candidate seen so far. */
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2088 
2089 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2090 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2091 {
2092 	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2093 	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2094 }
2095 
/*
 * Find the best load-balancing (lbgroup) match for laddr:lport and pick
 * one member PCB from the chosen group by hashing the packet's 4-tuple
 * over the group's current member count.  Returns NULL if no group
 * matches or the chosen group is empty.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best candidate in preference order. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	/* Spread flows over the group members by 4-tuple hash. */
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2179 
2180 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2181 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2182     u_short fport, struct in_addr laddr, u_short lport)
2183 {
2184 #ifdef INET6
2185 	/* XXX inp locking */
2186 	if ((inp->inp_vflag & INP_IPV4) == 0)
2187 		return (false);
2188 #endif
2189 	if (inp->inp_faddr.s_addr == faddr.s_addr &&
2190 	    inp->inp_laddr.s_addr == laddr.s_addr &&
2191 	    inp->inp_fport == fport &&
2192 	    inp->inp_lport == lport)
2193 		return (true);
2194 	return (false);
2195 }
2196 
2197 static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2198 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2199     u_short fport, struct in_addr laddr, u_short lport)
2200 {
2201 	struct inpcbhead *head;
2202 	struct inpcb *inp;
2203 
2204 	INP_HASH_LOCK_ASSERT(pcbinfo);
2205 
2206 	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
2207 	    pcbinfo->ipi_hashmask)];
2208 	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
2209 		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
2210 			return (inp);
2211 	}
2212 	return (NULL);
2213 }
2214 
/*
 * Result of matching a PCB against a wildcard lookup, in increasing
 * order of specificity: no match, wildcard local address match, exact
 * local address match.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,
	INPLOOKUP_MATCH_WILD = 1,
	INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;
2220 
2221 static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb * inp,struct in_addr laddr,u_short lport,int fib)2222 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
2223     u_short lport, int fib)
2224 {
2225 #ifdef INET6
2226 	/* XXX inp locking */
2227 	if ((inp->inp_vflag & INP_IPV4) == 0)
2228 		return (INPLOOKUP_MATCH_NONE);
2229 #endif
2230 	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
2231 		return (INPLOOKUP_MATCH_NONE);
2232 	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
2233 		return (INPLOOKUP_MATCH_NONE);
2234 	if (inp->inp_laddr.s_addr == INADDR_ANY)
2235 		return (INPLOOKUP_MATCH_WILD);
2236 	if (inp->inp_laddr.s_addr == laddr.s_addr)
2237 		return (INPLOOKUP_MATCH_LADDR);
2238 	return (INPLOOKUP_MATCH_NONE);
2239 }
2240 
/*
 * Sentinel return value: the lookup raced with a PCB state change and
 * must be retried with the hash lock held.
 */
#define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)

/*
 * Lockless (SMR) wildcard lookup.  Returns a locked matching PCB, NULL
 * if nothing matched, or INP_LOOKUP_AGAIN if the first candidate could
 * not be locked and revalidated.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check the match now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2279 
/*
 * Wildcard lookup with the hash lock held.  Scans the wildcard chain
 * once, remembering the best candidate in each preference class, and
 * returns the highest-ranking one.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/*
			 * Once we have a non-jailed exact match, only
			 * jailed PCBs can outrank it.
			 */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			/* Jailed + exact local address: best possible. */
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2352 
2353 /*
2354  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
2355  * that the caller has either locked the hash list, which usually happens
2356  * for bind(2) operations, or is in SMR section, which happens when sorting
2357  * out incoming packets.
2358  */
2359 static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int lookupflags,uint8_t numa_domain,int fib)2360 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2361     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2362     uint8_t numa_domain, int fib)
2363 {
2364 	struct inpcb *inp;
2365 	const u_short fport = fport_arg, lport = lport_arg;
2366 
2367 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
2368 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2369 	KASSERT(faddr.s_addr != INADDR_ANY,
2370 	    ("%s: invalid foreign address", __func__));
2371 	KASSERT(laddr.s_addr != INADDR_ANY,
2372 	    ("%s: invalid local address", __func__));
2373 	INP_HASH_WLOCK_ASSERT(pcbinfo);
2374 
2375 	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
2376 	if (inp != NULL)
2377 		return (inp);
2378 
2379 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2380 		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
2381 		    &laddr, lport, numa_domain, fib);
2382 		if (inp == NULL) {
2383 			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
2384 			    lport, fib);
2385 		}
2386 	}
2387 
2388 	return (inp);
2389 }
2390 
/*
 * Serialized lookup: performs the search under the hash write lock and
 * returns the matching PCB locked per 'lookupflags', or NULL.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * Could not get the PCB lock while holding the hash lock:
		 * take a reference to keep the PCB from being freed, drop
		 * the hash lock, then block on the PCB lock.  If the
		 * release of our reference says the PCB went away in the
		 * meantime, report no match.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2417 
/*
 * Lockless 4-tuple lookup, performed inside an SMR read section.  On
 * success the PCB is returned locked per 'lookupflags'.  Any race (lock
 * failure or failed revalidation) falls back to the serialized
 * in_pcblookup_hash().
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us.  Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			/* Lock the LB group member and revalidate it. */
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	/*
	 * NOTE(review): the SMR section is exited explicitly only on the
	 * no-match path; presumably inp_smr_lock() leaves it on success —
	 * confirm against its definition.
	 */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2480 
2481 /*
2482  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2483  * from which a pre-calculated hash value may be extracted.
2484  */
2485 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2486 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2487     struct in_addr laddr, u_int lport, int lookupflags,
2488     struct ifnet *ifp)
2489 {
2490 	int fib;
2491 
2492 	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2493 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2494 	    lookupflags, M_NODOM, fib));
2495 }
2496 
2497 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2498 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2499     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2500     struct ifnet *ifp __unused, struct mbuf *m)
2501 {
2502 	int fib;
2503 
2504 	M_ASSERTPKTHDR(m);
2505 	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2506 	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2507 	    lookupflags, m->m_pkthdr.numa_domain, fib));
2508 }
2509 #endif /* INET */
2510 
/*
 * Is the PCB's owning credential confined by a prison with the given
 * flag (PR_IP4 or PR_IP6, per the callers below) set?
 */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	return (prison_flag(inp->inp_cred, flag) != 0);
}
2516 
2517 /*
2518  * Insert the PCB into a hash chain using ordering rules which ensure that
2519  * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
2520  *
2521  * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2522  * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
2523  * always appear last no matter whether they are jailed.
2524  */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		/* Unbound v4-mapped PCBs rank lowest: append at the tail. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the jailed PCBs; they sort ahead of us. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* A jailed PCB goes in front of a non-jailed head. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Within a rank, bound PCBs precede wildcard-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* 'last' is the first element we must precede, or NULL for head. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2577 
2578 #ifdef INET6
2579 /*
2580  * See the comment above _in_pcbinshash_wild().
2581  */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the jailed PCBs; they sort ahead of us. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* A jailed PCB goes in front of a non-jailed head. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Within a rank, bound PCBs precede wildcard-bound ones. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* 'last' is the first element we must precede, or NULL for head. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2623 #endif
2624 
2625 /*
2626  * Insert PCB onto various hash lists.
2627  *
2628  * With normal sockets this function shall not fail, so it could return void.
2629  * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
2630  * that's the only condition when it can fail.
2631  */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/*
	 * A connected PCB (non-wildcard foreign address) goes on the
	 * exact-match chains; an unconnected one goes on the wildcard
	 * chains.
	 */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Ignore SO_REUSEPORT_LB if the socket is connected.  Really this case
	 * should be an error, but for UDP sockets it is not, and some
	 * applications erroneously set it on connected UDP sockets, so we can't
	 * change this without breaking compatibility.
	 */
	if (!connected &&
	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		/* This is the only failure mode of this function. */
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past.  Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wildcard chains are kept sorted by lookup preference. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2704 
2705 void
in_pcbremhash_locked(struct inpcb * inp)2706 in_pcbremhash_locked(struct inpcb *inp)
2707 {
2708 
2709 	INP_WLOCK_ASSERT(inp);
2710 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
2711 	MPASS(inp->inp_flags & INP_INHASHLIST);
2712 
2713 	if ((inp->inp_flags & INP_INLBGROUP) != 0)
2714 		in_pcbremlbgrouphash(inp);
2715 #ifdef INET6
2716 	if (inp->inp_vflag & INP_IPV6) {
2717 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
2718 			CK_LIST_REMOVE(inp, inp_hash_wild);
2719 		else
2720 			CK_LIST_REMOVE(inp, inp_hash_exact);
2721 	} else
2722 #endif
2723 	{
2724 		if (in_nullhost(inp->inp_faddr))
2725 			CK_LIST_REMOVE(inp, inp_hash_wild);
2726 		else
2727 			CK_LIST_REMOVE(inp, inp_hash_exact);
2728 	}
2729 	CK_LIST_REMOVE(inp, inp_portlist);
2730 	inp->inp_flags &= ~INP_INHASHLIST;
2731 }
2732 
2733 static void
in_pcbremhash(struct inpcb * inp)2734 in_pcbremhash(struct inpcb *inp)
2735 {
2736 	INP_HASH_WLOCK(inp->inp_pcbinfo);
2737 	in_pcbremhash_locked(inp);
2738 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2739 }
2740 
2741 /*
2742  * Move PCB to the proper hash bucket when { faddr, fport } have  been
2743  * changed. NOTE: This does not handle the case of the lport changing (the
2744  * hashed port list would have to be updated as well), so the lport must
2745  * not change after in_pcbinshash() has been called.
2746  */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* 'connected' reflects the NEW { faddr, fport } state. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* See the comment in in_pcbinshash(). */
	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.  Thus a PCB that is now connected
	 * previously sat on a wildcard chain, and vice versa.
	 */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2796 
2797 /*
2798  * Check for alternatives when higher level complains
2799  * about service problems.  For now, invalidate cached
2800  * routing information.  If the route was created dynamically
2801  * (by a redirect), time to try a default gateway again.
2802  */
2803 void
in_losing(struct inpcb * inp)2804 in_losing(struct inpcb *inp)
2805 {
2806 
2807 	RO_INVALIDATE_CACHE(&inp->inp_route);
2808 	return;
2809 }
2810 
2811 /*
2812  * A set label operation has occurred at the socket layer, propagate the
2813  * label change into the in_pcb for the socket.
2814  */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Propagate the updated label with both locks held. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2831 
/*
 * Exported function wrappers around the inpcb lock macros, for callers
 * that cannot use the macros directly.
 */
void
inp_wlock(struct inpcb *inp)
{
	INP_WLOCK(inp);
}

void
inp_wunlock(struct inpcb *inp)
{
	INP_WUNLOCK(inp);
}

void
inp_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
}

void
inp_runlock(struct inpcb *inp)
{
	INP_RUNLOCK(inp);
}
2859 
2860 #ifdef INVARIANT_SUPPORT
/* Assert that the inpcb is write-locked by the current thread. */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

/* Assert that the inpcb lock is not held by the current thread. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2874 #endif
2875 
2876 void
inp_apply_all(struct inpcbinfo * pcbinfo,void (* func)(struct inpcb *,void *),void * arg)2877 inp_apply_all(struct inpcbinfo *pcbinfo,
2878     void (*func)(struct inpcb *, void *), void *arg)
2879 {
2880 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2881 	    INPLOOKUP_WLOCKPCB);
2882 	struct inpcb *inp;
2883 
2884 	while ((inp = inp_next(&inpi)) != NULL)
2885 		func(inp, arg);
2886 }
2887 
2888 struct socket *
inp_inpcbtosocket(struct inpcb * inp)2889 inp_inpcbtosocket(struct inpcb *inp)
2890 {
2891 
2892 	INP_WLOCK_ASSERT(inp);
2893 	return (inp->inp_socket);
2894 }
2895 
2896 void
inp_4tuple_get(struct inpcb * inp,uint32_t * laddr,uint16_t * lp,uint32_t * faddr,uint16_t * fp)2897 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2898     uint32_t *faddr, uint16_t *fp)
2899 {
2900 
2901 	INP_LOCK_ASSERT(inp);
2902 	*laddr = inp->inp_laddr.s_addr;
2903 	*faddr = inp->inp_faddr.s_addr;
2904 	*lp = inp->inp_lport;
2905 	*fp = inp->inp_fport;
2906 }
2907 
2908 /*
2909  * Create an external-format (``xinpcb'') structure using the information in
2910  * the kernel-format in_pcb structure pointed to by inp.  This is done to
2911  * reduce the spew of irrelevant information over this interface, to isolate
2912  * user code from changes in the kernel structure, and potentially to provide
2913  * information-hiding if we decide that some of this information should be
2914  * hidden from users.
2915  */
2916 void
in_pcbtoxinpcb(const struct inpcb * inp,struct xinpcb * xi)2917 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2918 {
2919 
2920 	bzero(xi, sizeof(*xi));
2921 	xi->xi_len = sizeof(struct xinpcb);
2922 	if (inp->inp_socket)
2923 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
2924 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2925 	xi->inp_gencnt = inp->inp_gencnt;
2926 	xi->inp_flow = inp->inp_flow;
2927 	xi->inp_flowid = inp->inp_flowid;
2928 	xi->inp_flowtype = inp->inp_flowtype;
2929 	xi->inp_flags = inp->inp_flags;
2930 	xi->inp_flags2 = inp->inp_flags2;
2931 	xi->in6p_cksum = inp->in6p_cksum;
2932 	xi->in6p_hops = inp->in6p_hops;
2933 	xi->inp_ip_tos = inp->inp_ip_tos;
2934 	xi->inp_vflag = inp->inp_vflag;
2935 	xi->inp_ip_ttl = inp->inp_ip_ttl;
2936 	xi->inp_ip_p = inp->inp_ip_p;
2937 	xi->inp_ip_minttl = inp->inp_ip_minttl;
2938 }
2939 
/*
 * Sysctl handler that applies a socket option to an existing PCB that is
 * identified by the struct sockopt_parameters supplied in the new-value
 * buffer; the option value itself trails the fixed-size header.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* Write-only handler: reject any read request. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	/* The input must fit the on-stack staging buffer. */
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	/* At minimum the fixed-size parameter header must be present. */
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	/* Build a SOPT_SET request; no thread => kernel-space optval. */
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		/* Embed the zone id into link-local scoped addresses. */
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * When both ports are given, the search can be narrowed to a
	 * single connection hash bucket instead of scanning all PCBs.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	/* Locate the PCB whose generation count matches the request id. */
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Keep the socket alive across the PCB unlock. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3019 
3020 #ifdef DDB
/* Emit `indent' spaces to the debugger console. */
static void
db_print_indent(int indent)
{
	int n;

	for (n = indent; n > 0; n--)
		db_printf(" ");
}
3029 
3030 static void
db_print_inconninfo(struct in_conninfo * inc,const char * name,int indent)3031 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3032 {
3033 	char faddr_str[48], laddr_str[48];
3034 
3035 	db_print_indent(indent);
3036 	db_printf("%s at %p\n", name, inc);
3037 
3038 	indent += 2;
3039 
3040 #ifdef INET6
3041 	if (inc->inc_flags & INC_ISIPV6) {
3042 		/* IPv6. */
3043 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
3044 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
3045 	} else
3046 #endif
3047 	{
3048 		/* IPv4. */
3049 		inet_ntoa_r(inc->inc_laddr, laddr_str);
3050 		inet_ntoa_r(inc->inc_faddr, faddr_str);
3051 	}
3052 	db_print_indent(indent);
3053 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
3054 	    ntohs(inc->inc_lport));
3055 	db_print_indent(indent);
3056 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
3057 	    ntohs(inc->inc_fport));
3058 }
3059 
/*
 * Print the contents of an inpcb to the DDB console, indented by
 * `indent' spaces.
 */
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x   inp_label: %p\n", inp->inp_flow,
	    inp->inp_label);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);

	db_print_indent(indent);
	db_printf("inp_sp: %p   inp_vflag: 0x%b\n", inp->inp_sp,
	    inp->inp_vflag, INP_VFLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	/* Address-family specific option pointers. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_print_indent(indent);
		db_printf("in6p_options: %p   in6p_outputopts: %p   "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_print_indent(indent);
		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_print_indent(indent);
		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
3111 
DB_SHOW_COMMAND(inpcb,db_show_inpcb)3112 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3113 {
3114 	struct inpcb *inp;
3115 
3116 	if (!have_addr) {
3117 		db_printf("usage: show inpcb <addr>\n");
3118 		return;
3119 	}
3120 	inp = (struct inpcb *)addr;
3121 
3122 	db_print_inpcb(inp, "inpcb", 0);
3123 }
3124 #endif /* DDB */
3125 
3126 #ifdef RATELIMIT
3127 /*
3128  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3129  * if any.
3130  */
3131 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3132 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3133 {
3134 	union if_snd_tag_modify_params params = {
3135 		.rate_limit.max_rate = max_pacing_rate,
3136 		.rate_limit.flags = M_NOWAIT,
3137 	};
3138 	struct m_snd_tag *mst;
3139 	int error;
3140 
3141 	mst = inp->inp_snd_tag;
3142 	if (mst == NULL)
3143 		return (EINVAL);
3144 
3145 	if (mst->sw->snd_tag_modify == NULL) {
3146 		error = EOPNOTSUPP;
3147 	} else {
3148 		error = mst->sw->snd_tag_modify(mst, &params);
3149 	}
3150 	return (error);
3151 }
3152 
3153 /*
3154  * Query existing TX rate limit based on the existing
3155  * "inp->inp_snd_tag", if any.
3156  */
3157 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3158 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3159 {
3160 	union if_snd_tag_query_params params = { };
3161 	struct m_snd_tag *mst;
3162 	int error;
3163 
3164 	mst = inp->inp_snd_tag;
3165 	if (mst == NULL)
3166 		return (EINVAL);
3167 
3168 	if (mst->sw->snd_tag_query == NULL) {
3169 		error = EOPNOTSUPP;
3170 	} else {
3171 		error = mst->sw->snd_tag_query(mst, &params);
3172 		if (error == 0 && p_max_pacing_rate != NULL)
3173 			*p_max_pacing_rate = params.rate_limit.max_rate;
3174 	}
3175 	return (error);
3176 }
3177 
3178 /*
3179  * Query existing TX queue level based on the existing
3180  * "inp->inp_snd_tag", if any.
3181  */
3182 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3183 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3184 {
3185 	union if_snd_tag_query_params params = { };
3186 	struct m_snd_tag *mst;
3187 	int error;
3188 
3189 	mst = inp->inp_snd_tag;
3190 	if (mst == NULL)
3191 		return (EINVAL);
3192 
3193 	if (mst->sw->snd_tag_query == NULL)
3194 		return (EOPNOTSUPP);
3195 
3196 	error = mst->sw->snd_tag_query(mst, &params);
3197 	if (error == 0 && p_txqueue_level != NULL)
3198 		*p_txqueue_level = params.rate_limit.queue_level;
3199 	return (error);
3200 }
3201 
3202 /*
3203  * Allocate a new TX rate limit send tag from the network interface
3204  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3205  */
3206 int
in_pcbattach_txrtlmt(struct inpcb * inp,struct ifnet * ifp,uint32_t flowtype,uint32_t flowid,uint32_t max_pacing_rate,struct m_snd_tag ** st)3207 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3208     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3209 
3210 {
3211 	union if_snd_tag_alloc_params params = {
3212 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3213 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3214 		.rate_limit.hdr.flowid = flowid,
3215 		.rate_limit.hdr.flowtype = flowtype,
3216 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3217 		.rate_limit.max_rate = max_pacing_rate,
3218 		.rate_limit.flags = M_NOWAIT,
3219 	};
3220 	int error;
3221 
3222 	INP_WLOCK_ASSERT(inp);
3223 
3224 	/*
3225 	 * If there is already a send tag, or the INP is being torn
3226 	 * down, allocating a new send tag is not allowed. Else send
3227 	 * tags may leak.
3228 	 */
3229 	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
3230 		return (EINVAL);
3231 
3232 	error = m_snd_tag_alloc(ifp, &params, st);
3233 #ifdef INET
3234 	if (error == 0) {
3235 		counter_u64_add(rate_limit_set_ok, 1);
3236 		counter_u64_add(rate_limit_active, 1);
3237 	} else if (error != EOPNOTSUPP)
3238 		  counter_u64_add(rate_limit_alloc_fail, 1);
3239 #endif
3240 	return (error);
3241 }
3242 
/* Release a send tag reference and account for its departure. */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{
	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3252 
3253 /*
3254  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3255  * if any:
3256  */
3257 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3258 in_pcbdetach_txrtlmt(struct inpcb *inp)
3259 {
3260 	struct m_snd_tag *mst;
3261 
3262 	INP_WLOCK_ASSERT(inp);
3263 
3264 	mst = inp->inp_snd_tag;
3265 	inp->inp_snd_tag = NULL;
3266 
3267 	if (mst == NULL)
3268 		return;
3269 
3270 	m_snd_tag_rele(mst);
3271 #ifdef INET
3272 	counter_u64_add(rate_limit_active, -1);
3273 #endif
3274 }
3275 
/*
 * Reconcile the PCB's send tag with the given interface and pacing rate:
 * attach, modify or detach the tag as needed.  Returns 0 or an errno;
 * EAGAIN means no RSS hash is available yet so the caller should retry.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No pacing requested and no tag held: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate limit: shed any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* A tag already exists; just adjust its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3325 
3326 /*
3327  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3328  * is set in the fast path and will attach/detach/modify the TX rate
3329  * limit send tag based on the socket's so_max_pacing_rate value.
3330  */
3331 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3332 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3333 {
3334 	struct socket *socket;
3335 	uint32_t max_pacing_rate;
3336 	bool did_upgrade;
3337 
3338 	if (inp == NULL)
3339 		return;
3340 
3341 	socket = inp->inp_socket;
3342 	if (socket == NULL)
3343 		return;
3344 
3345 	if (!INP_WLOCKED(inp)) {
3346 		/*
3347 		 * NOTE: If the write locking fails, we need to bail
3348 		 * out and use the non-ratelimited ring for the
3349 		 * transmit until there is a new chance to get the
3350 		 * write lock.
3351 		 */
3352 		if (!INP_TRY_UPGRADE(inp))
3353 			return;
3354 		did_upgrade = 1;
3355 	} else {
3356 		did_upgrade = 0;
3357 	}
3358 
3359 	/*
3360 	 * NOTE: The so_max_pacing_rate value is read unlocked,
3361 	 * because atomic updates are not required since the variable
3362 	 * is checked at every mbuf we send. It is assumed that the
3363 	 * variable read itself will be atomic.
3364 	 */
3365 	max_pacing_rate = socket->so_max_pacing_rate;
3366 
3367 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3368 
3369 	if (did_upgrade)
3370 		INP_DOWNGRADE(inp);
3371 }
3372 
3373 /*
3374  * Track route changes for TX rate limiting.
3375  */
3376 void
in_pcboutput_eagain(struct inpcb * inp)3377 in_pcboutput_eagain(struct inpcb *inp)
3378 {
3379 	bool did_upgrade;
3380 
3381 	if (inp == NULL)
3382 		return;
3383 
3384 	if (inp->inp_snd_tag == NULL)
3385 		return;
3386 
3387 	if (!INP_WLOCKED(inp)) {
3388 		/*
3389 		 * NOTE: If the write locking fails, we need to bail
3390 		 * out and use the non-ratelimited ring for the
3391 		 * transmit until there is a new chance to get the
3392 		 * write lock.
3393 		 */
3394 		if (!INP_TRY_UPGRADE(inp))
3395 			return;
3396 		did_upgrade = 1;
3397 	} else {
3398 		did_upgrade = 0;
3399 	}
3400 
3401 	/* detach rate limiting */
3402 	in_pcbdetach_txrtlmt(inp);
3403 
3404 	/* make sure new mbuf send tag allocation is made */
3405 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3406 
3407 	if (did_upgrade)
3408 		INP_DOWNGRADE(inp);
3409 }
3410 
3411 #ifdef INET
3412 static void
rl_init(void * st)3413 rl_init(void *st)
3414 {
3415 	rate_limit_new = counter_u64_alloc(M_WAITOK);
3416 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
3417 	rate_limit_active = counter_u64_alloc(M_WAITOK);
3418 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
3419 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
3420 }
3421 
3422 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3423 #endif
3424 #endif /* RATELIMIT */
3425