1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9 * All rights reserved.
10 *
11 * Portions of this software were developed by Robert N. M. Watson under
12 * contract to Juniper Networks, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include "opt_ddb.h"
40 #include "opt_ipsec.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #include "opt_ratelimit.h"
44 #include "opt_route.h"
45 #include "opt_rss.h"
46
47 #include <sys/param.h>
48 #include <sys/hash.h>
49 #include <sys/systm.h>
50 #include <sys/libkern.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/eventhandler.h>
55 #include <sys/domain.h>
56 #include <sys/proc.h>
57 #include <sys/protosw.h>
58 #include <sys/smp.h>
59 #include <sys/smr.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <sys/priv.h>
64 #include <sys/proc.h>
65 #include <sys/refcount.h>
66 #include <sys/jail.h>
67 #include <sys/kernel.h>
68 #include <sys/sysctl.h>
69
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif
73
74 #include <vm/uma.h>
75 #include <vm/vm.h>
76
77 #include <net/if.h>
78 #include <net/if_var.h>
79 #include <net/if_private.h>
80 #include <net/if_types.h>
81 #include <net/if_llatbl.h>
82 #include <net/route.h>
83 #include <net/rss_config.h>
84 #include <net/vnet.h>
85
86 #if defined(INET) || defined(INET6)
87 #include <netinet/in.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/in_pcb_var.h>
90 #include <netinet/tcp.h>
91 #ifdef INET
92 #include <netinet/in_var.h>
93 #include <netinet/in_fib.h>
94 #endif
95 #include <netinet/ip_var.h>
96 #ifdef INET6
97 #include <netinet/ip6.h>
98 #include <netinet6/in6_pcb.h>
99 #include <netinet6/in6_var.h>
100 #include <netinet6/ip6_var.h>
101 #endif /* INET6 */
102 #include <net/route/nhop.h>
103 #endif
104
105 #include <netipsec/ipsec_support.h>
106
107 #include <security/mac/mac_framework.h>
108
109 #define INPCBLBGROUP_SIZMIN 8
110 #define INPCBLBGROUP_SIZMAX 256
111
112 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */
113 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */
114
115 /*
116 * These configure the range of local port addresses assigned to
117 * "unspecified" outgoing connections/packets/whatever.
118 */
119 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
120 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
121 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
122 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
123 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
124 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
125
126 /*
127 * Reserved ports accessible only to root. There are significant
128 * security considerations that must be accounted for when changing these,
129 * but the security benefits can be great. Please be careful.
130 */
131 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
132 VNET_DEFINE(int, ipport_reservedlow);
133
134 /* Enable random ephemeral port allocation by default. */
135 VNET_DEFINE(int, ipport_randomized) = 1;
136
137 #ifdef INET
138 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
139 struct in_addr faddr, u_int fport_arg,
140 struct in_addr laddr, u_int lport_arg,
141 int lookupflags, uint8_t numa_domain, int fib);
142
143 #define RANGECHK(var, min, max) \
144 if ((var) < (min)) { (var) = (min); } \
145 else if ((var) > (max)) { (var) = (max); }
146
147 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
149 {
150 int error;
151
152 error = sysctl_handle_int(oidp, arg1, arg2, req);
153 if (error == 0) {
154 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
155 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
156 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
157 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
158 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
159 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
160 }
161 return (error);
162 }
163
164 #undef RANGECHK
165
166 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
167 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
168 "IP Ports");
169
170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
171 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
172 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
173 "");
174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
175 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
176 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
177 "");
178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
179 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
180 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
181 "");
182 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
183 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
184 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
185 "");
186 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
187 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
188 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
189 "");
190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
191 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
192 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
193 "");
194 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
195 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
196 &VNET_NAME(ipport_reservedhigh), 0, "");
197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
198 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
200 CTLFLAG_VNET | CTLFLAG_RW,
201 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
202
203 #ifdef RATELIMIT
204 counter_u64_t rate_limit_new;
205 counter_u64_t rate_limit_chg;
206 counter_u64_t rate_limit_active;
207 counter_u64_t rate_limit_alloc_fail;
208 counter_u64_t rate_limit_set_ok;
209
210 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
211 "IP Rate Limiting");
212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
213 &rate_limit_active, "Active rate limited connections");
214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
215 &rate_limit_alloc_fail, "Rate limited connection failures");
216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
217 &rate_limit_set_ok, "Rate limited setting succeeded");
218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
219 &rate_limit_new, "Total Rate limit new attempts");
220 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
221 &rate_limit_chg, "Total Rate limited change attempts");
222 #endif /* RATELIMIT */
223
224 #endif /* INET */
225
226 VNET_DEFINE(uint32_t, in_pcbhashseed);
/*
 * Initialize the per-VNET inpcb hash seed with a random value
 * (arc4random), presumably mixed into the pcb hash computations
 * elsewhere so chains are not predictable.  Registered via the
 * VNET_SYSINIT below to run at SI_SUB_PROTO_DOMAIN for each vnet.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
233 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
234 in_pcbhashseed_init, NULL);
235
236 #ifdef INET
237 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
238 #define V_connect_inaddr_wild VNET(connect_inaddr_wild)
239 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
240 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
241 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
242 #endif
243
244 static void in_pcbremhash(struct inpcb *);
245
246 /*
247 * in_pcb.c: manage the Protocol Control Blocks.
248 *
249 * NOTE: It is assumed that most of these functions will be called with
250 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
251 * functions often modify hash chains or addresses in pcbs.
252 */
253
254 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred * cred,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain,int fib)255 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
256 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
257 {
258 struct inpcblbgroup *grp;
259 size_t bytes;
260
261 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
262 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
263 if (grp == NULL)
264 return (NULL);
265 LIST_INIT(&grp->il_pending);
266 grp->il_cred = crhold(cred);
267 grp->il_vflag = vflag;
268 grp->il_lport = port;
269 grp->il_numa_domain = numa_domain;
270 grp->il_fibnum = fib;
271 grp->il_dependladdr = *addr;
272 grp->il_inpsiz = size;
273 return (grp);
274 }
275
276 static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)277 in_pcblbgroup_free_deferred(epoch_context_t ctx)
278 {
279 struct inpcblbgroup *grp;
280
281 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
282 crfree(grp->il_cred);
283 free(grp, M_PCB);
284 }
285
/*
 * Unlink a load balance group from its hash chain and schedule its
 * memory for reclamation once the current net epoch drains.  The group
 * must have no pending (pre-listen) PCBs left on it.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
295
/*
 * Find the load balance group that "inp" belongs to, searching both the
 * published member array and the pending (pre-listen) list of every
 * group hashed on the PCB's local port.  Returns NULL if the PCB is not
 * a member of any group.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	/* If both loops ran to completion, grp is NULL here. */
	return (grp);
}
325
/*
 * Add "inp" to load balance group "grp".  TCP sockets that have not yet
 * called listen() are parked on the group's pending list; all others
 * are published in the member array scanned by lockless lookups.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.  The slot is
		 * stored before the count is released, so a lockless reader
		 * never sees a count that covers an unset pointer.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
354
/*
 * Replace "old_grp" with a new group of capacity "size", copying the
 * active members and taking over the pending list.  The new group is
 * linked into "hdr" before the old one is scheduled for destruction, so
 * concurrent epoch readers always find one of the two.  Returns NULL
 * (leaving the old group intact) if allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	    grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	/* Move the pending PCBs; the old group must be empty to be freed. */
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
383
384 /*
385 * Add PCB to load balance group for SO_REUSEPORT_LB option.
386 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* A FIB-bound socket only groups with sockets on the same FIB. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	/* Look for an existing group with matching prison, tuple and FIB. */
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/*
			 * Hard cap reached: complain (rate-limited) and
			 * silently skip adding this PCB to the group.
			 */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
458
459 /*
460 * Remove PCB from load balance group.
461 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Swap the last member into our slot. */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup():
				 * shrink the count only after the slot has
				 * been backfilled.
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		/* Not in the member array; it may still be pending. */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				grp->il_pendcnt--;
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP guarantees membership in some group above. */
	__assert_unreachable();
}
512
513 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)514 in_pcblbgroup_numa(struct inpcb *inp, int arg)
515 {
516 struct inpcbinfo *pcbinfo;
517 int error;
518 uint8_t numa_domain;
519
520 switch (arg) {
521 case TCP_REUSPORT_LB_NUMA_NODOM:
522 numa_domain = M_NODOM;
523 break;
524 case TCP_REUSPORT_LB_NUMA_CURDOM:
525 numa_domain = PCPU_GET(domain);
526 break;
527 default:
528 if (arg < 0 || arg >= vm_ndomains)
529 return (EINVAL);
530 numa_domain = arg;
531 }
532
533 pcbinfo = inp->inp_pcbinfo;
534 INP_WLOCK_ASSERT(inp);
535 INP_HASH_WLOCK(pcbinfo);
536 if (in_pcblbgroup_find(inp) != NULL) {
537 /* Remove it from the old group. */
538 in_pcbremlbgrouphash(inp);
539 /* Add it to the new group based on numa domain. */
540 in_pcbinslbgrouphash(inp, numa_domain);
541 error = 0;
542 } else {
543 error = ENOENT;
544 }
545 INP_HASH_WUNLOCK(pcbinfo);
546 return (error);
547 }
548
549 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
550 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
551
552 /*
553 * Initialize an inpcbinfo - a per-VNET instance of connections db.
554 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/*
	 * The exact and wildcard connection hashes are sized identically,
	 * so the single mask (ipi_hashmask) serves both tables.
	 */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* A port hash larger than the whole port space is pointless. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
580
581 /*
582 * Destroy an inpcbinfo.
583 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	/* All PCBs must have been freed before the info may go away. */
	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
600
601 /*
602 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
603 */
604 static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	/*
	 * The zone is created with UMA_ZONE_SMR so freed PCB memory is
	 * reused only after SMR readers drain, enabling lockless lookup;
	 * inpcb_fini is the per-item destructor.
	 */
	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
}
614
615 /*
616 * Destroy a pcbstorage - used by unloadable protocols.
617 */
618 void
in_pcbstorage_destroy(void * arg)619 in_pcbstorage_destroy(void *arg)
620 {
621 struct inpcbstorage *pcbstor = arg;
622
623 uma_zdestroy(pcbstor->ips_zone);
624 }
625
626 /*
627 * Allocate a PCB and associate it with the socket.
628 * On success return with the PCB locked.
629 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zero the region past the zone-preserved prefix of the inpcb. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up above before bailing out. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	/* Success: the PCB is returned write-locked, per the contract. */
	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* MAC/IPSEC setup failed: drop the credential and the PCB itself. */
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
712
713 #ifdef INET
/*
 * Bind "inp" to the address/port in "sin" (or to an ephemeral port and
 * wildcard address when sin is NULL or carries port 0), then insert the
 * PCB into the hash tables.  On failure the binding state is rolled
 * back to unbound.  Returns 0 or an errno value.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	/* Reject a second bind on an already-bound PCB. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error)
		return (error);
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		/*
		 * Per the MPASS, only the SO_REUSEPORT_LB insertion path
		 * can fail here; undo the partial bind.
		 */
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
745 #endif
746
747 #if defined(INET) || defined(INET6)
748 /*
749 * Assign a local port like in_pcb_lport(), but also used with connect()
750 * and a foreign address and port. If fsa is non-NULL, choose a local port
751 * that is unused with those, otherwise one that is completely unused.
752 * lsa can be NULL for IPv6.
753 */
754 int
in_pcb_lport_dest(const struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)755 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
756 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
757 int lookupflags)
758 {
759 struct inpcbinfo *pcbinfo;
760 struct inpcb *tmpinp;
761 unsigned short *lastport;
762 int count, error;
763 u_short aux, first, last, lport;
764 #ifdef INET
765 struct in_addr laddr, faddr;
766 #endif
767 #ifdef INET6
768 struct in6_addr *laddr6, *faddr6;
769 #endif
770
771 pcbinfo = inp->inp_pcbinfo;
772
773 /*
774 * Because no actual state changes occur here, a global write lock on
775 * the pcbinfo isn't required.
776 */
777 INP_LOCK_ASSERT(inp);
778 INP_HASH_LOCK_ASSERT(pcbinfo);
779
780 if (inp->inp_flags & INP_HIGHPORT) {
781 first = V_ipport_hifirstauto; /* sysctl */
782 last = V_ipport_hilastauto;
783 lastport = &pcbinfo->ipi_lasthi;
784 } else if (inp->inp_flags & INP_LOWPORT) {
785 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
786 if (error)
787 return (error);
788 first = V_ipport_lowfirstauto; /* 1023 */
789 last = V_ipport_lowlastauto; /* 600 */
790 lastport = &pcbinfo->ipi_lastlow;
791 } else {
792 first = V_ipport_firstauto; /* sysctl */
793 last = V_ipport_lastauto;
794 lastport = &pcbinfo->ipi_lastport;
795 }
796
797 /*
798 * Instead of having two loops further down counting up or down
799 * make sure that first is always <= last and go with only one
800 * code path implementing all logic.
801 */
802 if (first > last) {
803 aux = first;
804 first = last;
805 last = aux;
806 }
807
808 #ifdef INET
809 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */
810 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
811 if (lsa != NULL)
812 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
813 if (fsa != NULL)
814 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
815 }
816 #endif
817 #ifdef INET6
818 laddr6 = NULL;
819 if ((inp->inp_vflag & INP_IPV6) != 0) {
820 if (lsa != NULL)
821 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
822 if (fsa != NULL)
823 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
824 }
825 #endif
826
827 tmpinp = NULL;
828
829 if (V_ipport_randomized)
830 *lastport = first + (arc4random() % (last - first));
831
832 count = last - first;
833
834 do {
835 if (count-- < 0) /* completely used? */
836 return (EADDRNOTAVAIL);
837 ++*lastport;
838 if (*lastport < first || *lastport > last)
839 *lastport = first;
840 lport = htons(*lastport);
841
842 if (fsa != NULL) {
843 #ifdef INET
844 if (lsa->sa_family == AF_INET) {
845 tmpinp = in_pcblookup_hash_locked(pcbinfo,
846 faddr, fport, laddr, lport, lookupflags,
847 M_NODOM, RT_ALL_FIBS);
848 }
849 #endif
850 #ifdef INET6
851 if (lsa->sa_family == AF_INET6) {
852 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
853 faddr6, fport, laddr6, lport, lookupflags,
854 M_NODOM, RT_ALL_FIBS);
855 }
856 #endif
857 } else {
858 #ifdef INET6
859 if ((inp->inp_vflag & INP_IPV6) != 0) {
860 tmpinp = in6_pcblookup_local(pcbinfo,
861 &inp->in6p_laddr, lport, RT_ALL_FIBS,
862 lookupflags, cred);
863 #ifdef INET
864 if (tmpinp == NULL &&
865 (inp->inp_vflag & INP_IPV4))
866 tmpinp = in_pcblookup_local(pcbinfo,
867 laddr, lport, RT_ALL_FIBS,
868 lookupflags, cred);
869 #endif
870 }
871 #endif
872 #if defined(INET) && defined(INET6)
873 else
874 #endif
875 #ifdef INET
876 tmpinp = in_pcblookup_local(pcbinfo, laddr,
877 lport, RT_ALL_FIBS, lookupflags, cred);
878 #endif
879 }
880 } while (tmpinp != NULL);
881
882 *lportp = lport;
883
884 return (0);
885 }
886
887 /*
888 * Select a local port (number) to use.
889 */
890 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)891 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
892 struct ucred *cred, int lookupflags)
893 {
894 struct sockaddr_in laddr;
895
896 if (laddrp) {
897 bzero(&laddr, sizeof(laddr));
898 laddr.sin_family = AF_INET;
899 laddr.sin_addr = *laddrp;
900 }
901 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
902 NULL, lportp, NULL, 0, cred, lookupflags));
903 }
904 #endif /* INET || INET6 */
905
906 #ifdef INET
907 /*
908 * Determine whether the inpcb can be bound to the specified address/port tuple.
909 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved range requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail. In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			    in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/*
			 * Tolerate the duplicate only when both bindings
			 * are wildcard and both sockets carry
			 * INP_IPV6PROTO; anything else is a conflict.
			 */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
			return (EADDRINUSE);
		}
	}
	return (0);
}
999
1000 /*
1001 * Set up a bind operation on a PCB, performing port allocation
1002 * as required, but do not actually modify the PCB. Callers can
1003 * either complete the bind by setting inp_laddr/inp_lport and
1004 * calling in_pcbinshash(), or they can just use the resulting
1005 * port and address to authorise the sending of a once-off packet.
1006 *
1007 * On error, the values of *laddrp and *lportp are not changed.
1008 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	laddr.s_addr = *laddrp;
	/* The requested address arrives via 'sin' or '*laddrp', not both. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	/*
	 * Without any SO_REUSE* option set, conflicting wildcard bindings
	 * must be found, so the availability lookup matches wildcards.
	 */
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		/* Restrict the availability check to one FIB if requested. */
		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	/* Success: write results back; outputs are untouched on error. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1074
1075 /*
1076 * Connect from a socket to a specified address.
1077 * Both address and port must be specified in argument sin.
1078 * If don't have a local address for this socket yet,
1079 * then pick one.
1080 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	/* An ephemeral port must be allocated if none is bound yet. */
	anonport = (inp->inp_lport == 0);

	/*
	 * NOTE(review): in_broadcast() apparently also matches INADDR_ANY —
	 * the branch below handles both the wildcard and the broadcast
	 * destination rewrites; confirm against in_broadcast()'s definition.
	 */
	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address.  If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
		 */
		if (in_nullhost(sin->sin_addr)) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
		    & IFF_BROADCAST) {
			faddr = satosin(&CK_STAILQ_FIRST(
			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		} else
			faddr = sin->sin_addr;
	} else
		faddr = sin->sin_addr;

	/* Select a source address if the socket is not yet bound to one. */
	if (in_nullhost(inp->inp_laddr)) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		if (error)
			return (error);
	} else
		laddr = inp->inp_laddr;

	if (anonport) {
		struct sockaddr_in lsin = {
			.sin_family = AF_INET,
			.sin_addr = laddr,
		};
		struct sockaddr_in fsin = {
			.sin_family = AF_INET,
			.sin_addr = faddr,
		};

		/* Allocate a local port that yields a unique 4-tuple. */
		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	} else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
	    NULL)
		return (EADDRINUSE);	/* The 4-tuple is already in use. */
	else
		lport = inp->inp_lport;

	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
	    !(inp->inp_flags & INP_INHASHLIST));

	inp->inp_faddr = faddr;
	inp->inp_fport = sin->sin_port;
	inp->inp_laddr = laddr;
	inp->inp_lport = lport;

	/* (Re)insert into the connection hash under the full 4-tuple. */
	if ((inp->inp_flags & INP_INHASHLIST) == 0) {
		error = in_pcbinshash(inp);
		MPASS(error == 0);
	} else
		in_pcbrehash(inp);
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(inp->inp_laddr,
		    inp->inp_faddr, 0, sin->sin_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	/* Remember that the port was auto-allocated, for later release. */
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1185
1186 /*
1187 * Do proper source address selection on an unbound socket in case
1188 * of connect. Take jails into account as well.
1189 */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, prefer the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
	    inp->inp_moptions->imo_multicast_ifp != NULL) {
		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
		struct in_ifaddr *ia;

		/* Pick an address on that interface the jail may use. */
		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifp == ifp &&
			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
				break;
		}
		if (ia == NULL)
			return (EADDRNOTAVAIL);
		*laddr = ia->ia_addr.sin_addr;
		return (0);
	}

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* Try a p2p destination match, then an on-link network. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, the interface address can be used as is. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed: look for a jail-visible address on that ifnet. */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there.  That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		/* p2p destination, on-link network, then exact address. */
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* A wildcard source is never a usable result. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1417
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	/*
	 * Leave the hash first; lockless readers must not observe the
	 * pcb with its addresses already cleared below.
	 */
	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1435 #endif /* INET */
1436
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcblbgroup *grp;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
		struct inpcbinfo *pcbinfo;

		pcbinfo = inp->inp_pcbinfo;
		INP_HASH_WLOCK(pcbinfo);
		/*
		 * Move the pcb from its load-balancing group's pending
		 * list into the group proper now that it is listening.
		 */
		grp = in_pcblbgroup_find(inp);
		LIST_REMOVE(inp, inp_lbgroup_list);
		grp->il_pendcnt--;
		in_pcblbgroup_insert(grp, inp);
		INP_HASH_WUNLOCK(pcbinfo);
	}
}
1456
1457 /*
1458 * inpcb hash lookups are protected by SMR section.
1459 *
1460 * Once desired pcb has been found, switching from SMR section to a pcb
1461 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1462 * here because SMR is a critical section.
1463 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1464 */
1465 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1466 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1467 {
1468
1469 lock == INPLOOKUP_RLOCKPCB ?
1470 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1471 }
1472
1473 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1474 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1475 {
1476
1477 lock == INPLOOKUP_RLOCKPCB ?
1478 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1479 }
1480
1481 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1482 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1483 {
1484
1485 return (lock == INPLOOKUP_RLOCKPCB ?
1486 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1487 }
1488
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: take the pcb lock while still inside the SMR section. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference so it can't be recycled
	 * once we leave the SMR section, then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Refcount already hit zero: the pcb is being destroyed. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1527
1528 bool
inp_smr_lock(struct inpcb * inp,const inp_lookup_t lock)1529 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1530 {
1531
1532 /*
1533 * in_pcblookup() family of functions ignore not only freed entries,
1534 * that may be found due to lockless access to the hash, but dropped
1535 * entries, too.
1536 */
1537 return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
1538 }
1539
1540 /*
1541 * inp_next() - inpcb hash/list traversal iterator
1542 *
1543 * Requires initialized struct inpcb_iterator for context.
1544 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1545 *
1546 * - Iterator can have either write-lock or read-lock semantics, that can not
1547 * be changed later.
1548 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1549 * a single hash slot. Note: only rip_input() does the latter.
1550 * - Iterator may have optional bool matching function. The matching function
1551 * will be executed for each inpcb in the SMR context, so it can not acquire
1552 * locks and can safely access only immutable fields of inpcb.
1553 *
1554 * A fresh initialized iterator has NULL inpcb in its context and that
1555 * means that inp_next() call would return the very first inpcb on the list
1556 * locked with desired semantic. In all following calls the context pointer
1557 * shall hold the current inpcb pointer. The KPI user is not supposed to
1558 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1559 * and write NULL to its context. After end of traversal an iterator can be
1560 * reused.
1561 *
1562 * List traversals have the following features/constraints:
1563 * - New entries won't be seen, as they are always added to the head of a list.
1564 * - Removed entries won't stop traversal as long as they are not added to
1565 * a different list. This is violated by in_pcbrehash().
1566 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited SMR; the
				 * list head is our only stable anchor, so
				 * re-enter and restart from it.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand the lock over: drop the previous pcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1677
1678 /*
1679 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1680 * stability of an inpcb pointer despite the inpcb lock being released or
1681 * SMR section exited.
1682 *
1683 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1684 */
1685 void
in_pcbref(struct inpcb * inp)1686 in_pcbref(struct inpcb *inp)
1687 {
1688 u_int old __diagused;
1689
1690 old = refcount_acquire(&inp->inp_refcount);
1691 KASSERT(old > 0, ("%s: refcount 0", __func__));
1692 }
1693
1694 /*
1695 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1696 * freeing the pcb, if the reference was very last.
1697 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	/* Not the last reference: the pcb stays locked and valid. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() must have run already. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before returning the pcb to the SMR-aware zone. */
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1717
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Not the last reference: the pcb stays locked and valid. */
	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: in_pcbfree() must have run already. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before returning the pcb to the SMR-aware zone. */
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1737
1738 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1739 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1740 {
1741
1742 return (lock == INPLOOKUP_RLOCKPCB ?
1743 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1744 }
1745
1746 /*
1747 * Dereference and rlock inp, for which the caller must own the
1748 * reference. Returns true if inp no longer usable, false otherwise.
1749 */
1750 bool
in_pcbrele_rlock(struct inpcb * inp)1751 in_pcbrele_rlock(struct inpcb *inp)
1752 {
1753 INP_RLOCK(inp);
1754 if (in_pcbrele_rlocked(inp))
1755 return (true);
1756 if ((inp->inp_flags & INP_FREED) != 0) {
1757 INP_RUNLOCK(inp);
1758 return (true);
1759 }
1760 return (false);
1761 }
1762
1763 /*
1764 * Unconditionally schedule an inpcb to be freed by decrementing its
1765 * reference count, which should occur only after the inpcb has been detached
1766 * from its socket. If another thread holds a temporary reference (acquired
1767 * using in_pcbref()) then the free is deferred until that reference is
1768 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1769 * Almost all work, including removal from global lists, is done in this
1770 * context, where the pcbinfo lock is held.
1771 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Remove from the global pcb list under the info lock. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Detach from the socket; later lookups must skip this pcb. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Multicast options are released after the pcb lock is dropped. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference; if another holder remains, the actual free
	 * is deferred to its in_pcbrele_*() and we just unlock here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1844
1845 /*
1846 * Different protocols initialize their inpcbs differently - giving
1847 * different name to the lock. But they all are disposed the same.
1848 */
static void
inpcb_fini(void *mem, int size)
{

	/* All protocols embed the same lock, whatever they named it. */
	INP_LOCK_DESTROY((struct inpcb *)mem);
}
1856
1857 /*
1858 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1859 * port reservation, and preventing it from being returned by inpcb lookups.
1860 *
1861 * It is used by TCP to mark an inpcb as unused and avoid future packet
1862 * delivery or event notification when a socket remains open but TCP has
1863 * closed. This might occur as a result of a shutdown()-initiated TCP close
1864 * or a RST on the wire, and allows the port binding to be reused while still
1865 * maintaining the invariant that so_pcb always points to a valid inpcb until
1866 * in_pcbdetach().
1867 *
1868 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1869 * in_pcbpurgeif0()?
1870 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Mark dropped before leaving the hash, so lookups skip the pcb. */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1881
1882 #ifdef INET
1883 /*
1884 * Common routines to return the socket addresses associated with inpcbs.
1885 */
1886 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1887 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1888 {
1889 struct inpcb *inp;
1890
1891 inp = sotoinpcb(so);
1892 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1893
1894 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1895 .sin_len = sizeof(struct sockaddr_in),
1896 .sin_family = AF_INET,
1897 .sin_port = inp->inp_lport,
1898 .sin_addr = inp->inp_laddr,
1899 };
1900
1901 return (0);
1902 }
1903
1904 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1905 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1906 {
1907 struct inpcb *inp;
1908
1909 inp = sotoinpcb(so);
1910 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1911
1912 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1913 .sin_len = sizeof(struct sockaddr_in),
1914 .sin_family = AF_INET,
1915 .sin_port = inp->inp_fport,
1916 .sin_addr = inp->inp_faddr,
1917 };
1918
1919 return (0);
1920 }
1921
1922 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1923 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1924 {
1925
1926 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1927 return (true);
1928 else
1929 return (false);
1930 }
1931
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	/* Visit every IPv4 pcb with multicast options (see the matcher). */
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Removal invalidates the iteration; restart after each. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
1974
1975 /*
1976 * Lookup a PCB based on the local address and port. Caller must hold the
1977 * hash lock. No inpcb locks or references are acquired.
1978 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.  Lower 'wildcard' cost means a more specific match.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			/* Keep the most specific match seen so far. */
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2088
2089 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2090 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2091 {
2092 return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2093 (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2094 }
2095
/*
 * Look up a SO_REUSEPORT_LB group matching (laddr, lport) and select one of
 * its member PCBs by hashing the packet's addresses and ports.  Jailed
 * groups are preferred over non-jailed ones, and exact local address
 * matches over wildcard matches.  Returns NULL if no group matches or the
 * selected group is momentarily empty.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Skip jailed groups whose prison does not own laddr. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the highest-ranked candidate found during the scan. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().  An acquire load pairs
	 * with the insert path so that il_inp[] contents are visible for
	 * all "count" slots.
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	/* Hash the 4-tuple to spread connections across group members. */
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2179
2180 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2181 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2182 u_short fport, struct in_addr laddr, u_short lport)
2183 {
2184 #ifdef INET6
2185 /* XXX inp locking */
2186 if ((inp->inp_vflag & INP_IPV4) == 0)
2187 return (false);
2188 #endif
2189 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2190 inp->inp_laddr.s_addr == laddr.s_addr &&
2191 inp->inp_fport == fport &&
2192 inp->inp_lport == lport)
2193 return (true);
2194 return (false);
2195 }
2196
2197 static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2198 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2199 u_short fport, struct in_addr laddr, u_short lport)
2200 {
2201 struct inpcbhead *head;
2202 struct inpcb *inp;
2203
2204 INP_HASH_LOCK_ASSERT(pcbinfo);
2205
2206 head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
2207 pcbinfo->ipi_hashmask)];
2208 CK_LIST_FOREACH(inp, head, inp_hash_exact) {
2209 if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
2210 return (inp);
2211 }
2212 return (NULL);
2213 }
2214
/*
 * Result of matching an unconnected PCB against (laddr, lport, fib):
 * no match at all, a wildcard (INADDR_ANY) local address match, or an
 * exact local address match.  Higher values are better matches.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,
	INPLOOKUP_MATCH_WILD = 1,
	INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;
2220
2221 static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb * inp,struct in_addr laddr,u_short lport,int fib)2222 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
2223 u_short lport, int fib)
2224 {
2225 #ifdef INET6
2226 /* XXX inp locking */
2227 if ((inp->inp_vflag & INP_IPV4) == 0)
2228 return (INPLOOKUP_MATCH_NONE);
2229 #endif
2230 if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
2231 return (INPLOOKUP_MATCH_NONE);
2232 if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
2233 return (INPLOOKUP_MATCH_NONE);
2234 if (inp->inp_laddr.s_addr == INADDR_ANY)
2235 return (INPLOOKUP_MATCH_WILD);
2236 if (inp->inp_laddr.s_addr == laddr.s_addr)
2237 return (INPLOOKUP_MATCH_LADDR);
2238 return (INPLOOKUP_MATCH_NONE);
2239 }
2240
2241 #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
2242
/*
 * Lockless (SMR) wildcard PCB lookup.  Only the first candidate in the
 * bucket is considered: it is either locked and returned after its match is
 * revalidated, or INP_LOOKUP_AGAIN is returned so the caller falls back to
 * a serialized lookup.  Because the bucket is ordered by match quality (see
 * _in_pcbinshash_wild()), the first match is the best one.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate now that the inpcb lock is held. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us. Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2279
/*
 * Serialized wildcard PCB lookup: scan the whole wild hash bucket and
 * select the best-ranked candidate.  Unbound v4-mapped IPv6 PCBs rank
 * last regardless of jail status.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* laddr must belong to the candidate's prison. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed PCB cannot beat an exact match. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				/* Jailed + exact laddr: best possible. */
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
			if (injail)
				jail_wild = inp;
			else
				local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2352
2353 /*
2354 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
2355 * that the caller has either locked the hash list, which usually happens
2356 * for bind(2) operations, or is in SMR section, which happens when sorting
2357 * out incoming packets.
2358 */
2359 static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport_arg,struct in_addr laddr,u_int lport_arg,int lookupflags,uint8_t numa_domain,int fib)2360 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2361 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
2362 uint8_t numa_domain, int fib)
2363 {
2364 struct inpcb *inp;
2365 const u_short fport = fport_arg, lport = lport_arg;
2366
2367 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
2368 ("%s: invalid lookup flags %d", __func__, lookupflags));
2369 KASSERT(faddr.s_addr != INADDR_ANY,
2370 ("%s: invalid foreign address", __func__));
2371 KASSERT(laddr.s_addr != INADDR_ANY,
2372 ("%s: invalid local address", __func__));
2373 INP_HASH_WLOCK_ASSERT(pcbinfo);
2374
2375 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
2376 if (inp != NULL)
2377 return (inp);
2378
2379 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2380 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
2381 &laddr, lport, numa_domain, fib);
2382 if (inp == NULL) {
2383 inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
2384 lport, fib);
2385 }
2386 }
2387
2388 return (inp);
2389 }
2390
/*
 * Serialized lookup: perform the search under the pcbinfo hash lock and
 * return the result locked per lockflags.  If the inpcb lock cannot be
 * taken without sleeping while the hash lock is held, take a reference,
 * drop the hash lock, block for the inpcb lock, and then check that the
 * PCB was not freed in the meantime.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/* Pin the PCB so it survives dropping the hash lock. */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2417
/*
 * Lockless datapath lookup.  The search runs inside an SMR read section;
 * a candidate is locked with inp_smr_lock() and its match revalidated, as
 * the connection state may change concurrently.  On any lock failure or
 * revalidation failure, fall back to the serialized in_pcblookup_hash().
 * Note: there is no explicit smr_exit() on the success paths; inp_smr_lock()
 * is relied upon to leave the SMR section when it succeeds.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				/* Revalidate the LB group member. */
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2480
2481 /*
2482 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2483 * from which a pre-calculated hash value may be extracted.
2484 */
2485 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2486 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2487 struct in_addr laddr, u_int lport, int lookupflags,
2488 struct ifnet *ifp)
2489 {
2490 int fib;
2491
2492 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2493 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2494 lookupflags, M_NODOM, fib));
2495 }
2496
2497 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2498 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2499 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2500 struct ifnet *ifp __unused, struct mbuf *m)
2501 {
2502 int fib;
2503
2504 M_ASSERTPKTHDR(m);
2505 fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2506 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2507 lookupflags, m->m_pkthdr.numa_domain, fib));
2508 }
2509 #endif /* INET */
2510
2511 static bool
in_pcbjailed(const struct inpcb * inp,unsigned int flag)2512 in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2513 {
2514 return (prison_flag(inp->inp_cred, flag) != 0);
2515 }
2516
/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		/* Unbound v4-mapped v6 PCB: append at the tail of the list. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* List was empty. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/*
		 * Skip past all jailed PCBs; "last" is left at the first
		 * non-jailed entry (or the insert happens at the tail).
		 */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCBs go ahead of all non-jailed PCBs. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs sort after exact-laddr PCBs of same rank. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2577
#ifdef INET6
/*
 * See the comment above _in_pcbinshash_wild().  Same ordering rules for
 * IPv6, except there is no v4-mapped special case here.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past all jailed PCBs. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCBs go ahead of all non-jailed PCBs. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Wildcard PCBs sort after exact-laddr PCBs of same rank. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif
2624
/*
 * Insert PCB onto various hash lists.
 *
 * With normal sockets this function shall not fail, so it could return void.
 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
 * that's the only condition when it can fail.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/* Compute the hash and decide exact vs. wild table by faddr. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Ignore SO_REUSEPORT_LB if the socket is connected. Really this case
	 * should be an error, but for UDP sockets it is not, and some
	 * applications erroneously set it on connected UDP sockets, so we can't
	 * change this without breaking compatibility.
	 */
	if (!connected &&
	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		/* May fail on allocation; nothing has been inserted yet. */
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wild lists are kept ordered by lookup preference. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2704
2705 void
in_pcbremhash_locked(struct inpcb * inp)2706 in_pcbremhash_locked(struct inpcb *inp)
2707 {
2708
2709 INP_WLOCK_ASSERT(inp);
2710 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
2711 MPASS(inp->inp_flags & INP_INHASHLIST);
2712
2713 if ((inp->inp_flags & INP_INLBGROUP) != 0)
2714 in_pcbremlbgrouphash(inp);
2715 #ifdef INET6
2716 if (inp->inp_vflag & INP_IPV6) {
2717 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
2718 CK_LIST_REMOVE(inp, inp_hash_wild);
2719 else
2720 CK_LIST_REMOVE(inp, inp_hash_exact);
2721 } else
2722 #endif
2723 {
2724 if (in_nullhost(inp->inp_faddr))
2725 CK_LIST_REMOVE(inp, inp_hash_wild);
2726 else
2727 CK_LIST_REMOVE(inp, inp_hash_exact);
2728 }
2729 CK_LIST_REMOVE(inp, inp_portlist);
2730 inp->inp_flags &= ~INP_INHASHLIST;
2731 }
2732
2733 static void
in_pcbremhash(struct inpcb * inp)2734 in_pcbremhash(struct inpcb *inp)
2735 {
2736 INP_HASH_WLOCK(inp->inp_pcbinfo);
2737 in_pcbremhash_locked(inp);
2738 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2739 }
2740
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* "connected" reflects the NEW state of the PCB. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* See the comment in in_pcbinshash(). */
	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.  Hence the removal below is from
	 * the list corresponding to the OLD state: a PCB that is now
	 * connected was previously on the wild list, and vice versa.
	 */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2796
2797 /*
2798 * Check for alternatives when higher level complains
2799 * about service problems. For now, invalidate cached
2800 * routing information. If the route was created dynamically
2801 * (by a redirect), time to try a default gateway again.
2802 */
2803 void
in_losing(struct inpcb * inp)2804 in_losing(struct inpcb *inp)
2805 {
2806
2807 RO_INVALIDATE_CACHE(&inp->inp_route);
2808 return;
2809 }
2810
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Lock order: inpcb before socket. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2831
/*
 * Exported function wrappers around the inpcb locking macros, for callers
 * outside this file that cannot use the macros directly.
 */

/* Acquire the inpcb write lock. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}

/* Release the inpcb write lock. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}

/* Acquire the inpcb read lock. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}

/* Release the inpcb read lock. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2859
#ifdef INVARIANT_SUPPORT
/* Assert that the inpcb write lock is held (INVARIANTS kernels only). */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

/* Assert that the inpcb lock is not held (INVARIANTS kernels only). */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif
2875
2876 void
inp_apply_all(struct inpcbinfo * pcbinfo,void (* func)(struct inpcb *,void *),void * arg)2877 inp_apply_all(struct inpcbinfo *pcbinfo,
2878 void (*func)(struct inpcb *, void *), void *arg)
2879 {
2880 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
2881 INPLOOKUP_WLOCKPCB);
2882 struct inpcb *inp;
2883
2884 while ((inp = inp_next(&inpi)) != NULL)
2885 func(inp, arg);
2886 }
2887
/*
 * Return the socket backing this inpcb; the write lock must be held so
 * the association is stable.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2895
2896 void
inp_4tuple_get(struct inpcb * inp,uint32_t * laddr,uint16_t * lp,uint32_t * faddr,uint16_t * fp)2897 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2898 uint32_t *faddr, uint16_t *fp)
2899 {
2900
2901 INP_LOCK_ASSERT(inp);
2902 *laddr = inp->inp_laddr.s_addr;
2903 *faddr = inp->inp_faddr.s_addr;
2904 *lp = inp->inp_lport;
2905 *fp = inp->inp_fport;
2906 }
2907
2908 /*
2909 * Create an external-format (``xinpcb'') structure using the information in
2910 * the kernel-format in_pcb structure pointed to by inp. This is done to
2911 * reduce the spew of irrelevant information over this interface, to isolate
2912 * user code from changes in the kernel structure, and potentially to provide
2913 * information-hiding if we decide that some of this information should be
2914 * hidden from users.
2915 */
2916 void
in_pcbtoxinpcb(const struct inpcb * inp,struct xinpcb * xi)2917 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
2918 {
2919
2920 bzero(xi, sizeof(*xi));
2921 xi->xi_len = sizeof(struct xinpcb);
2922 if (inp->inp_socket)
2923 sotoxsocket(inp->inp_socket, &xi->xi_socket);
2924 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
2925 xi->inp_gencnt = inp->inp_gencnt;
2926 xi->inp_flow = inp->inp_flow;
2927 xi->inp_flowid = inp->inp_flowid;
2928 xi->inp_flowtype = inp->inp_flowtype;
2929 xi->inp_flags = inp->inp_flags;
2930 xi->inp_flags2 = inp->inp_flags2;
2931 xi->in6p_cksum = inp->in6p_cksum;
2932 xi->in6p_hops = inp->in6p_hops;
2933 xi->inp_ip_tos = inp->inp_ip_tos;
2934 xi->inp_vflag = inp->inp_vflag;
2935 xi->inp_ip_ttl = inp->inp_ip_ttl;
2936 xi->inp_ip_p = inp->inp_ip_p;
2937 xi->inp_ip_minttl = inp->inp_ip_minttl;
2938 }
2939
/*
 * Set a socket option on an arbitrary socket, identified by the inpcb
 * generation count (sop_id) carried in a struct sockopt_parameters passed
 * through the sysctl "new" buffer.  SOL_SOCKET options are dispatched to
 * sosetopt() with the inpcb unlocked; protocol-level options go through
 * the supplied ctloutput_set callback.  Returns ESRCH if no PCB with the
 * given id exists, ECONNRESET if it was dropped.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* This sysctl is write-only. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	/* Option payload is whatever follows the fixed parameters. */
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	/* Embed the zone id into link-local scoped addresses. */
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If both ports are specified, narrow the iteration to the single
	 * hash bucket the connection would live in.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Hold the socket across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3019
3020 #ifdef DDB
/*
 * Emit "indent" spaces to the DDB console.
 */
static void
db_print_indent(int indent)
{
	int remaining = indent;

	while (remaining-- > 0)
		db_printf(" ");
}
3029
3030 static void
db_print_inconninfo(struct in_conninfo * inc,const char * name,int indent)3031 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
3032 {
3033 char faddr_str[48], laddr_str[48];
3034
3035 db_print_indent(indent);
3036 db_printf("%s at %p\n", name, inc);
3037
3038 indent += 2;
3039
3040 #ifdef INET6
3041 if (inc->inc_flags & INC_ISIPV6) {
3042 /* IPv6. */
3043 ip6_sprintf(laddr_str, &inc->inc6_laddr);
3044 ip6_sprintf(faddr_str, &inc->inc6_faddr);
3045 } else
3046 #endif
3047 {
3048 /* IPv4. */
3049 inet_ntoa_r(inc->inc_laddr, laddr_str);
3050 inet_ntoa_r(inc->inc_faddr, faddr_str);
3051 }
3052 db_print_indent(indent);
3053 db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
3054 ntohs(inc->inc_lport));
3055 db_print_indent(indent);
3056 db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
3057 ntohs(inc->inc_fport));
3058 }
3059
/*
 * Dump the interesting fields of an inpcb to the DDB console, indented
 * by "indent" spaces.  Exported so protocol DDB commands (e.g. "show
 * tcpcb") can reuse it for the embedded inpcb.
 */
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	/* Nest all member output one level deeper than the header line. */
	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x inp_label: %p\n", inp->inp_flow,
	    inp->inp_label);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	/* %b decodes the flag words symbolically via the *_BITS strings. */
	db_print_indent(indent);
	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);

	db_print_indent(indent);
	db_printf("inp_sp: %p inp_vflag: 0x%b\n", inp->inp_sp,
	    inp->inp_vflag, INP_VFLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		/* IPv6 PCBs carry v6-specific option state. */
		db_print_indent(indent);
		db_printf("in6p_options: %p in6p_outputopts: %p "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_print_indent(indent);
		db_printf("in6p_icmp6filt: %p in6p_cksum %d "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		/* IPv4 option state. */
		db_print_indent(indent);
		db_printf("inp_ip_tos: %d inp_ip_options: %p "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
3111
DB_SHOW_COMMAND(inpcb,db_show_inpcb)3112 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
3113 {
3114 struct inpcb *inp;
3115
3116 if (!have_addr) {
3117 db_printf("usage: show inpcb <addr>\n");
3118 return;
3119 }
3120 inp = (struct inpcb *)addr;
3121
3122 db_print_inpcb(inp, "inpcb", 0);
3123 }
3124 #endif /* DDB */
3125
3126 #ifdef RATELIMIT
3127 /*
3128 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3129 * if any.
3130 */
3131 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3132 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3133 {
3134 union if_snd_tag_modify_params params = {
3135 .rate_limit.max_rate = max_pacing_rate,
3136 .rate_limit.flags = M_NOWAIT,
3137 };
3138 struct m_snd_tag *mst;
3139 int error;
3140
3141 mst = inp->inp_snd_tag;
3142 if (mst == NULL)
3143 return (EINVAL);
3144
3145 if (mst->sw->snd_tag_modify == NULL) {
3146 error = EOPNOTSUPP;
3147 } else {
3148 error = mst->sw->snd_tag_modify(mst, ¶ms);
3149 }
3150 return (error);
3151 }
3152
3153 /*
3154 * Query existing TX rate limit based on the existing
3155 * "inp->inp_snd_tag", if any.
3156 */
3157 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3158 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3159 {
3160 union if_snd_tag_query_params params = { };
3161 struct m_snd_tag *mst;
3162 int error;
3163
3164 mst = inp->inp_snd_tag;
3165 if (mst == NULL)
3166 return (EINVAL);
3167
3168 if (mst->sw->snd_tag_query == NULL) {
3169 error = EOPNOTSUPP;
3170 } else {
3171 error = mst->sw->snd_tag_query(mst, ¶ms);
3172 if (error == 0 && p_max_pacing_rate != NULL)
3173 *p_max_pacing_rate = params.rate_limit.max_rate;
3174 }
3175 return (error);
3176 }
3177
3178 /*
3179 * Query existing TX queue level based on the existing
3180 * "inp->inp_snd_tag", if any.
3181 */
3182 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3183 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3184 {
3185 union if_snd_tag_query_params params = { };
3186 struct m_snd_tag *mst;
3187 int error;
3188
3189 mst = inp->inp_snd_tag;
3190 if (mst == NULL)
3191 return (EINVAL);
3192
3193 if (mst->sw->snd_tag_query == NULL)
3194 return (EOPNOTSUPP);
3195
3196 error = mst->sw->snd_tag_query(mst, ¶ms);
3197 if (error == 0 && p_txqueue_level != NULL)
3198 *p_txqueue_level = params.rate_limit.queue_level;
3199 return (error);
3200 }
3201
3202 /*
3203 * Allocate a new TX rate limit send tag from the network interface
3204 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3205 */
3206 int
in_pcbattach_txrtlmt(struct inpcb * inp,struct ifnet * ifp,uint32_t flowtype,uint32_t flowid,uint32_t max_pacing_rate,struct m_snd_tag ** st)3207 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3208 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3209
3210 {
3211 union if_snd_tag_alloc_params params = {
3212 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3213 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3214 .rate_limit.hdr.flowid = flowid,
3215 .rate_limit.hdr.flowtype = flowtype,
3216 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3217 .rate_limit.max_rate = max_pacing_rate,
3218 .rate_limit.flags = M_NOWAIT,
3219 };
3220 int error;
3221
3222 INP_WLOCK_ASSERT(inp);
3223
3224 /*
3225 * If there is already a send tag, or the INP is being torn
3226 * down, allocating a new send tag is not allowed. Else send
3227 * tags may leak.
3228 */
3229 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
3230 return (EINVAL);
3231
3232 error = m_snd_tag_alloc(ifp, ¶ms, st);
3233 #ifdef INET
3234 if (error == 0) {
3235 counter_u64_add(rate_limit_set_ok, 1);
3236 counter_u64_add(rate_limit_active, 1);
3237 } else if (error != EOPNOTSUPP)
3238 counter_u64_add(rate_limit_alloc_fail, 1);
3239 #endif
3240 return (error);
3241 }
3242
/*
 * Drop a reference on a send tag that is no longer attached to an
 * inpcb, and account for one fewer active rate-limited connection.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	/* Mirror the increment done when the tag was allocated. */
	counter_u64_add(rate_limit_active, -1);
#endif
}
3252
3253 /*
3254 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3255 * if any:
3256 */
3257 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3258 in_pcbdetach_txrtlmt(struct inpcb *inp)
3259 {
3260 struct m_snd_tag *mst;
3261
3262 INP_WLOCK_ASSERT(inp);
3263
3264 mst = inp->inp_snd_tag;
3265 inp->inp_snd_tag = NULL;
3266
3267 if (mst == NULL)
3268 return;
3269
3270 m_snd_tag_rele(mst);
3271 #ifdef INET
3272 counter_u64_add(rate_limit_active, -1);
3273 #endif
3274 }
3275
3276 int
in_pcboutput_txrtlmt_locked(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb,uint32_t max_pacing_rate)3277 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
3278 {
3279 int error;
3280
3281 /*
3282 * If the existing send tag is for the wrong interface due to
3283 * a route change, first drop the existing tag. Set the
3284 * CHANGED flag so that we will keep trying to allocate a new
3285 * tag if we fail to allocate one this time.
3286 */
3287 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
3288 in_pcbdetach_txrtlmt(inp);
3289 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3290 }
3291
3292 /*
3293 * NOTE: When attaching to a network interface a reference is
3294 * made to ensure the network interface doesn't go away until
3295 * all ratelimit connections are gone. The network interface
3296 * pointers compared below represent valid network interfaces,
3297 * except when comparing towards NULL.
3298 */
3299 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3300 error = 0;
3301 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3302 if (inp->inp_snd_tag != NULL)
3303 in_pcbdetach_txrtlmt(inp);
3304 error = 0;
3305 } else if (inp->inp_snd_tag == NULL) {
3306 /*
3307 * In order to utilize packet pacing with RSS, we need
3308 * to wait until there is a valid RSS hash before we
3309 * can proceed:
3310 */
3311 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3312 error = EAGAIN;
3313 } else {
3314 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3315 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
3316 }
3317 } else {
3318 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3319 }
3320 if (error == 0 || error == EOPNOTSUPP)
3321 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3322
3323 return (error);
3324 }
3325
3326 /*
3327 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3328 * is set in the fast path and will attach/detach/modify the TX rate
3329 * limit send tag based on the socket's so_max_pacing_rate value.
3330 */
3331 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3332 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3333 {
3334 struct socket *socket;
3335 uint32_t max_pacing_rate;
3336 bool did_upgrade;
3337
3338 if (inp == NULL)
3339 return;
3340
3341 socket = inp->inp_socket;
3342 if (socket == NULL)
3343 return;
3344
3345 if (!INP_WLOCKED(inp)) {
3346 /*
3347 * NOTE: If the write locking fails, we need to bail
3348 * out and use the non-ratelimited ring for the
3349 * transmit until there is a new chance to get the
3350 * write lock.
3351 */
3352 if (!INP_TRY_UPGRADE(inp))
3353 return;
3354 did_upgrade = 1;
3355 } else {
3356 did_upgrade = 0;
3357 }
3358
3359 /*
3360 * NOTE: The so_max_pacing_rate value is read unlocked,
3361 * because atomic updates are not required since the variable
3362 * is checked at every mbuf we send. It is assumed that the
3363 * variable read itself will be atomic.
3364 */
3365 max_pacing_rate = socket->so_max_pacing_rate;
3366
3367 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3368
3369 if (did_upgrade)
3370 INP_DOWNGRADE(inp);
3371 }
3372
3373 /*
3374 * Track route changes for TX rate limiting.
3375 */
3376 void
in_pcboutput_eagain(struct inpcb * inp)3377 in_pcboutput_eagain(struct inpcb *inp)
3378 {
3379 bool did_upgrade;
3380
3381 if (inp == NULL)
3382 return;
3383
3384 if (inp->inp_snd_tag == NULL)
3385 return;
3386
3387 if (!INP_WLOCKED(inp)) {
3388 /*
3389 * NOTE: If the write locking fails, we need to bail
3390 * out and use the non-ratelimited ring for the
3391 * transmit until there is a new chance to get the
3392 * write lock.
3393 */
3394 if (!INP_TRY_UPGRADE(inp))
3395 return;
3396 did_upgrade = 1;
3397 } else {
3398 did_upgrade = 0;
3399 }
3400
3401 /* detach rate limiting */
3402 in_pcbdetach_txrtlmt(inp);
3403
3404 /* make sure new mbuf send tag allocation is made */
3405 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3406
3407 if (did_upgrade)
3408 INP_DOWNGRADE(inp);
3409 }
3410
3411 #ifdef INET
/*
 * SYSINIT hook: allocate the per-statistic ratelimit counters.
 * The "st" argument is the unused SYSINIT cookie (registered NULL
 * below).  M_WAITOK means these allocations cannot fail.
 */
static void
rl_init(void *st)
{
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}
3421
3422 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3423 #endif
3424 #endif /* RATELIMIT */
3425