1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9 * All rights reserved.
10 *
11 * Portions of this software were developed by Robert N. M. Watson under
12 * contract to Juniper Networks, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include "opt_ddb.h"
40 #include "opt_ipsec.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #include "opt_ratelimit.h"
44 #include "opt_rss.h"
45
46 #include <sys/param.h>
47 #include <sys/hash.h>
48 #include <sys/systm.h>
49 #include <sys/libkern.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/eventhandler.h>
54 #include <sys/domain.h>
55 #include <sys/proc.h>
56 #include <sys/protosw.h>
57 #include <sys/smp.h>
58 #include <sys/smr.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <sys/priv.h>
63 #include <sys/proc.h>
64 #include <sys/refcount.h>
65 #include <sys/jail.h>
66 #include <sys/kernel.h>
67 #include <sys/sysctl.h>
68
69 #ifdef DDB
70 #include <ddb/ddb.h>
71 #endif
72
73 #include <vm/uma.h>
74 #include <vm/vm.h>
75
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/if_private.h>
79 #include <net/if_types.h>
80 #include <net/if_llatbl.h>
81 #include <net/route.h>
82 #include <net/rss_config.h>
83 #include <net/vnet.h>
84
85 #if defined(INET) || defined(INET6)
86 #include <netinet/in.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/in_pcb_var.h>
89 #include <netinet/tcp.h>
90 #ifdef INET
91 #include <netinet/in_var.h>
92 #include <netinet/in_fib.h>
93 #endif
94 #include <netinet/ip_var.h>
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/in6_pcb.h>
98 #include <netinet6/in6_var.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101 #include <net/route/nhop.h>
102 #endif
103
104 #include <netipsec/ipsec_support.h>
105
106 #include <security/mac/mac_framework.h>
107
/* Bounds on the member-slot capacity of a load balance group. */
#define INPCBLBGROUP_SIZMIN	8
#define INPCBLBGROUP_SIZMAX	256

/* Private copies of inpcb flag bits used within this file. */
#define INP_FREED	0x00000200 /* Went through in_pcbfree(). */
#define INP_INLBGROUP	0x01000000 /* Inserted into inpcblbgroup. */
113
114 /*
115 * These configure the range of local port addresses assigned to
116 * "unspecified" outgoing connections/packets/whatever.
117 */
118 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
119 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
120 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
121 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
122 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
123 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
124
125 /*
126 * Reserved ports accessible only to root. There are significant
127 * security considerations that must be accounted for when changing these,
128 * but the security benefits can be great. Please be careful.
129 */
130 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
131 VNET_DEFINE(int, ipport_reservedlow);
132
133 /* Enable random ephemeral port allocation by default. */
134 VNET_DEFINE(int, ipport_randomized) = 1;
135
136 #ifdef INET
137 static struct inpcb *in_pcblookup_internal(struct inpcbinfo *pcbinfo,
138 struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
139 u_int lport_arg, int lookupflags, uint8_t numa_domain, int fib);
140
/*
 * Clamp "var" into [min, max].  Expands to a bare if/else statement
 * (not do { } while (0) wrapped), so only use it as a full statement —
 * a following "else" would bind to it (dangling-else hazard).
 */
#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }
144
/*
 * Shared sysctl handler for the net.inet.ip.portrange.* knobs: let
 * sysctl_handle_int() store the new value, then clamp every port-range
 * variable back into its legal window.
 */
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		/* Re-validate all six ranges, not just the one written. */
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}
161
162 #undef RANGECHK
163
164 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
165 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
166 "IP Ports");
167
168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
169 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
170 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
171 "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
173 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
174 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
175 "");
176 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
177 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
178 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
179 "");
180 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
181 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
182 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
183 "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
185 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
186 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
187 "");
188 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
189 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
190 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
191 "");
192 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
193 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
194 &VNET_NAME(ipport_reservedhigh), 0, "");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
196 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
198 CTLFLAG_VNET | CTLFLAG_RW,
199 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
200
201 #ifdef RATELIMIT
202 counter_u64_t rate_limit_new;
203 counter_u64_t rate_limit_chg;
204 counter_u64_t rate_limit_active;
205 counter_u64_t rate_limit_alloc_fail;
206 counter_u64_t rate_limit_set_ok;
207
208 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
209 "IP Rate Limiting");
210 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
211 &rate_limit_active, "Active rate limited connections");
212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
213 &rate_limit_alloc_fail, "Rate limited connection failures");
214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
215 &rate_limit_set_ok, "Rate limited setting succeeded");
216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
217 &rate_limit_new, "Total Rate limit new attempts");
218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
219 &rate_limit_chg, "Total Rate limited change attempts");
220 #endif /* RATELIMIT */
221
222 #endif /* INET */
223
224 VNET_DEFINE(uint32_t, in_pcbhashseed);
/*
 * Seed the per-VNET inpcb hash at VNET start-up so that hash chain
 * placement is unpredictable across boots and jails.
 */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
231 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
232 in_pcbhashseed_init, NULL);
233
234 #ifdef INET
235 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
236 #define V_connect_inaddr_wild VNET(connect_inaddr_wild)
237 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
238 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
239 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
240 #endif
241
242 /*
243 * in_pcb.c: manage the Protocol Control Blocks.
244 *
245 * NOTE: It is assumed that most of these functions will be called with
246 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
247 * functions often modify hash chains or addresses in pcbs.
248 */
249
250 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred * cred,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain,int fib)251 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
252 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
253 {
254 struct inpcblbgroup *grp;
255 size_t bytes;
256
257 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
258 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
259 if (grp == NULL)
260 return (NULL);
261 LIST_INIT(&grp->il_pending);
262 grp->il_cred = crhold(cred);
263 grp->il_vflag = vflag;
264 grp->il_lport = port;
265 grp->il_numa_domain = numa_domain;
266 grp->il_fibnum = fib;
267 grp->il_dependladdr = *addr;
268 grp->il_inpsiz = size;
269 return (grp);
270 }
271
/*
 * Epoch callback performing the actual release of a load balance group
 * once no network-epoch reader can still be traversing it.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	/* Drop the credential reference taken in in_pcblbgroup_alloc(). */
	crfree(grp->il_cred);
	free(grp, M_PCB);
}
281
/*
 * Unlink a load balance group from its hash chain and defer the free
 * to the end of the current network epoch.  The group must have no
 * pending (pre-listen) PCBs left on it.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
291
/*
 * Return the load balance group that contains "inp", searching both the
 * active member array and the pending (pre-listen) list of each group
 * on inp's port hash chain.  Returns NULL when inp is in no group: the
 * outer loop then terminates with grp == NULL and falls through to the
 * "found" label.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		/* Active members, visible to lookups. */
		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		/* Members parked until listen() is called. */
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	return (grp);
}
321
/*
 * Add "inp" to a group already known to have a free slot.  Sockets of
 * protocols that support listen() but are not yet listening go onto the
 * pending list, invisible to lookups; all others are published into
 * il_inp[] with a release store on the count so lockless readers never
 * observe an incremented count before the slot is filled.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
350
/*
 * Replace "old_grp" with a new group of capacity "size": copy the
 * active members, move the pending list and count across, link the new
 * group into the chain, then unlink and epoch-free the old one.
 * Returns the new group, or NULL (old_grp left untouched) if the
 * allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	/* Pending inps must move too; old group must be empty to be freed. */
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
379
380 /*
381 * Add PCB to load balance group for SO_REUSEPORT_LB option.
382 */
383 static int
in_pcbinslbgrouphash(struct inpcb * inp,uint8_t numa_domain)384 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
385 {
386 const static struct timeval interval = { 60, 0 };
387 static struct timeval lastprint;
388 struct inpcbinfo *pcbinfo;
389 struct inpcblbgrouphead *hdr;
390 struct inpcblbgroup *grp;
391 uint32_t idx;
392 int fib;
393
394 pcbinfo = inp->inp_pcbinfo;
395
396 INP_WLOCK_ASSERT(inp);
397 INP_HASH_WLOCK_ASSERT(pcbinfo);
398
399 fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
400 inp->inp_inc.inc_fibnum : RT_ALL_FIBS;
401
402 #ifdef INET6
403 /*
404 * Don't allow IPv4 mapped INET6 wild socket.
405 */
406 if ((inp->inp_vflag & INP_IPV4) &&
407 inp->inp_laddr.s_addr == INADDR_ANY &&
408 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
409 return (0);
410 }
411 #endif
412
413 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
414 hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
415 CK_LIST_FOREACH(grp, hdr, il_list) {
416 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
417 grp->il_vflag == inp->inp_vflag &&
418 grp->il_lport == inp->inp_lport &&
419 grp->il_numa_domain == numa_domain &&
420 grp->il_fibnum == fib &&
421 memcmp(&grp->il_dependladdr,
422 &inp->inp_inc.inc_ie.ie_dependladdr,
423 sizeof(grp->il_dependladdr)) == 0) {
424 break;
425 }
426 }
427 if (grp == NULL) {
428 /* Create new load balance group. */
429 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
430 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
431 INPCBLBGROUP_SIZMIN, numa_domain, fib);
432 if (grp == NULL)
433 return (ENOMEM);
434 in_pcblbgroup_insert(grp, inp);
435 CK_LIST_INSERT_HEAD(hdr, grp, il_list);
436 } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
437 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
438 if (ratecheck(&lastprint, &interval))
439 printf("lb group port %d, limit reached\n",
440 ntohs(grp->il_lport));
441 return (0);
442 }
443
444 /* Expand this local group. */
445 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
446 if (grp == NULL)
447 return (ENOMEM);
448 in_pcblbgroup_insert(grp, inp);
449 } else {
450 in_pcblbgroup_insert(grp, inp);
451 }
452 return (0);
453 }
454
455 /*
456 * Remove PCB from load balance group.
457 */
458 static void
in_pcbremlbgrouphash(struct inpcb * inp)459 in_pcbremlbgrouphash(struct inpcb *inp)
460 {
461 struct inpcbinfo *pcbinfo;
462 struct inpcblbgrouphead *hdr;
463 struct inpcblbgroup *grp;
464 struct inpcb *inp1;
465 int i;
466
467 pcbinfo = inp->inp_pcbinfo;
468
469 INP_WLOCK_ASSERT(inp);
470 MPASS(inp->inp_flags & INP_INLBGROUP);
471 INP_HASH_WLOCK_ASSERT(pcbinfo);
472
473 hdr = &pcbinfo->ipi_lbgrouphashbase[
474 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
475 CK_LIST_FOREACH(grp, hdr, il_list) {
476 for (i = 0; i < grp->il_inpcnt; ++i) {
477 if (grp->il_inp[i] != inp)
478 continue;
479
480 if (grp->il_inpcnt == 1 &&
481 LIST_EMPTY(&grp->il_pending)) {
482 /* We are the last, free this local group. */
483 in_pcblbgroup_free(grp);
484 } else {
485 grp->il_inp[i] =
486 grp->il_inp[grp->il_inpcnt - 1];
487
488 /*
489 * Synchronize with in_pcblookup_lbgroup().
490 */
491 atomic_store_rel_int(&grp->il_inpcnt,
492 grp->il_inpcnt - 1);
493 }
494 inp->inp_flags &= ~INP_INLBGROUP;
495 return;
496 }
497 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
498 if (inp == inp1) {
499 LIST_REMOVE(inp, inp_lbgroup_list);
500 grp->il_pendcnt--;
501 inp->inp_flags &= ~INP_INLBGROUP;
502 return;
503 }
504 }
505 }
506 __assert_unreachable();
507 }
508
509 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)510 in_pcblbgroup_numa(struct inpcb *inp, int arg)
511 {
512 struct inpcbinfo *pcbinfo;
513 int error;
514 uint8_t numa_domain;
515
516 switch (arg) {
517 case TCP_REUSPORT_LB_NUMA_NODOM:
518 numa_domain = M_NODOM;
519 break;
520 case TCP_REUSPORT_LB_NUMA_CURDOM:
521 numa_domain = PCPU_GET(domain);
522 break;
523 default:
524 if (arg < 0 || arg >= vm_ndomains)
525 return (EINVAL);
526 numa_domain = arg;
527 }
528
529 pcbinfo = inp->inp_pcbinfo;
530 INP_WLOCK_ASSERT(inp);
531 INP_HASH_WLOCK(pcbinfo);
532 if (in_pcblbgroup_find(inp) != NULL) {
533 /* Remove it from the old group. */
534 in_pcbremlbgrouphash(inp);
535 /* Add it to the new group based on numa domain. */
536 in_pcbinslbgrouphash(inp, numa_domain);
537 error = 0;
538 } else {
539 error = ENOENT;
540 }
541 INP_HASH_WUNLOCK(pcbinfo);
542 return (error);
543 }
544
545 /*
546 * Initialize an inpcbinfo - a per-VNET instance of connections db.
547 */
548 void
in_pcbinfo_init(struct inpcbinfo * pcbinfo,struct inpcbstorage * pcbstor,u_int hash_nelements,u_int porthash_nelements,u_int lbgrouphash_nelements)549 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
550 u_int hash_nelements, u_int porthash_nelements, u_int lbgrouphash_nelements)
551 {
552 struct hashalloc_args ha = {
553 .mtype = M_PCB,
554 .mflags = M_WAITOK,
555 .head = HASH_HEAD_CK_LIST,
556 };
557
558 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
559 NULL, MTX_DEF);
560 CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
561 pcbinfo->ipi_count = 0;
562
563 ha.size = hash_nelements;
564 pcbinfo->ipi_hash_exact = hashalloc(&ha);
565 pcbinfo->ipi_hash_wild = hashalloc(&ha);
566 pcbinfo->ipi_hashmask = ha.size - 1;
567
568 if (porthash_nelements > 0) {
569 ha.size = imin(porthash_nelements, IPPORT_MAX + 1);
570 pcbinfo->ipi_porthashbase = hashalloc(&ha);
571 pcbinfo->ipi_porthashmask = ha.size - 1;
572 } else
573 pcbinfo->ipi_porthashbase = NULL;
574 if (lbgrouphash_nelements > 0) {
575 ha.size = imin(lbgrouphash_nelements, IPPORT_MAX + 1);
576 pcbinfo->ipi_lbgrouphashbase = hashalloc(&ha);
577 pcbinfo->ipi_lbgrouphashmask = ha.size - 1;
578 } else
579 pcbinfo->ipi_lbgrouphashbase = NULL;
580
581 pcbinfo->ipi_zone = pcbstor->ips_zone;
582 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
583 }
584
585 /*
586 * Destroy an inpcbinfo.
587 */
588 void
in_pcbinfo_destroy(struct inpcbinfo * pcbinfo)589 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
590 {
591 struct hashalloc_args ha = {
592 .mtype = M_PCB,
593 .head = HASH_HEAD_CK_LIST,
594 };
595
596 KASSERT(pcbinfo->ipi_count == 0,
597 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
598
599 ha.size = pcbinfo->ipi_hashmask + 1;
600 hashfree(pcbinfo->ipi_hash_exact, &ha);
601 hashfree(pcbinfo->ipi_hash_wild, &ha);
602 if (pcbinfo->ipi_porthashbase != NULL) {
603 ha.size = pcbinfo->ipi_porthashmask + 1;
604 hashfree(pcbinfo->ipi_porthashbase, &ha);
605 }
606 if (pcbinfo->ipi_lbgrouphashbase != NULL) {
607 ha.size = pcbinfo->ipi_lbgrouphashmask + 1;
608 hashfree(pcbinfo->ipi_lbgrouphashbase, &ha);
609 }
610 mtx_destroy(&pcbinfo->ipi_hash_lock);
611 }
612
613 /*
614 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
615 */
616 static void inpcb_fini(void *, int);
617 void
in_pcbstorage_init(void * arg)618 in_pcbstorage_init(void *arg)
619 {
620 struct inpcbstorage *pcbstor = arg;
621
622 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
623 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
624 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
625 }
626
627 /*
628 * Destroy a pcbstorage - used by unloadable protocols.
629 */
630 void
in_pcbstorage_destroy(void * arg)631 in_pcbstorage_destroy(void *arg)
632 {
633 struct inpcbstorage *pcbstor = arg;
634
635 uma_zdestroy(pcbstor->ips_zone);
636 }
637
638 /*
639 * Allocate a PCB and associate it with the socket.
640 * On success return with the PCB locked.
641 */
642 int
in_pcballoc(struct socket * so,struct inpcbinfo * pcbinfo)643 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
644 {
645 struct inpcb *inp;
646 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
647 int error;
648 #endif
649
650 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
651 if (inp == NULL)
652 return (ENOBUFS);
653 bzero(&inp->inp_start_zero, inp_zero_size);
654 #ifdef NUMA
655 inp->inp_numa_domain = M_NODOM;
656 #endif
657 inp->inp_pcbinfo = pcbinfo;
658 inp->inp_socket = so;
659 inp->inp_cred = crhold(so->so_cred);
660 inp->inp_inc.inc_fibnum = so->so_fibnum;
661 #ifdef MAC
662 error = mac_inpcb_init(inp, M_NOWAIT);
663 if (error != 0)
664 goto out;
665 mac_inpcb_create(so, inp);
666 #endif
667 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
668 error = ipsec_init_pcbpolicy(inp);
669 if (error != 0) {
670 #ifdef MAC
671 mac_inpcb_destroy(inp);
672 #endif
673 goto out;
674 }
675 #endif /*IPSEC*/
676 #ifdef INET6
677 if (INP_SOCKAF(so) == AF_INET6) {
678 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
679 if (V_ip6_v6only)
680 inp->inp_flags |= IN6P_IPV6_V6ONLY;
681 #ifdef INET
682 else
683 inp->inp_vflag |= INP_IPV4;
684 #endif
685 if (V_ip6_auto_flowlabel)
686 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
687 inp->in6p_hops = -1; /* use kernel default */
688 }
689 #endif
690 #if defined(INET) && defined(INET6)
691 else
692 #endif
693 #ifdef INET
694 inp->inp_vflag |= INP_IPV4;
695 #endif
696 inp->inp_smr = SMR_SEQ_INVALID;
697
698 /*
699 * Routes in inpcb's can cache L2 as well; they are guaranteed
700 * to be cleaned up.
701 */
702 inp->inp_route.ro_flags = RT_LLE_CACHE;
703 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
704 inp->inp_flags |= INP_UNCONNECTED;
705 INP_WLOCK(inp);
706 INP_HASH_WLOCK(pcbinfo);
707 pcbinfo->ipi_count++;
708 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
709 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
710 INP_HASH_WUNLOCK(pcbinfo);
711 so->so_pcb = inp;
712
713 return (0);
714
715 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
716 out:
717 crfree(inp->inp_cred);
718 #ifdef INVARIANTS
719 inp->inp_cred = NULL;
720 #endif
721 uma_zfree_smr(pcbinfo->ipi_zone, inp);
722 return (error);
723 #endif
724 }
725
726 #if defined(INET) || defined(INET6)
727 /*
728 * Assign a local port like in_pcb_lport(), but also used with connect()
729 * and a foreign address and port. If fsa is non-NULL, choose a local port
730 * that is unused with those, otherwise one that is completely unused.
731 * lsa can be NULL for IPv6.
732 */
733 int
in_pcb_lport_dest(const struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)734 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa,
735 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred,
736 int lookupflags)
737 {
738 struct inpcbinfo *pcbinfo;
739 struct inpcb *tmpinp;
740 unsigned short *lastport;
741 int count, error;
742 u_short aux, first, last, lport;
743 #ifdef INET
744 struct in_addr laddr, faddr;
745 #endif
746 #ifdef INET6
747 struct in6_addr *laddr6, *faddr6;
748 #endif
749
750 pcbinfo = inp->inp_pcbinfo;
751
752 /*
753 * Because no actual state changes occur here, a global write lock on
754 * the pcbinfo isn't required.
755 */
756 INP_LOCK_ASSERT(inp);
757 INP_HASH_LOCK_ASSERT(pcbinfo);
758
759 if (inp->inp_flags & INP_HIGHPORT) {
760 first = V_ipport_hifirstauto; /* sysctl */
761 last = V_ipport_hilastauto;
762 lastport = &pcbinfo->ipi_lasthi;
763 } else if (inp->inp_flags & INP_LOWPORT) {
764 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
765 if (error)
766 return (error);
767 first = V_ipport_lowfirstauto; /* 1023 */
768 last = V_ipport_lowlastauto; /* 600 */
769 lastport = &pcbinfo->ipi_lastlow;
770 } else {
771 first = V_ipport_firstauto; /* sysctl */
772 last = V_ipport_lastauto;
773 lastport = &pcbinfo->ipi_lastport;
774 }
775
776 /*
777 * Instead of having two loops further down counting up or down
778 * make sure that first is always <= last and go with only one
779 * code path implementing all logic.
780 */
781 if (first > last) {
782 aux = first;
783 first = last;
784 last = aux;
785 }
786
787 #ifdef INET
788 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */
789 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
790 if (lsa != NULL)
791 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
792 if (fsa != NULL)
793 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
794 }
795 #endif
796 #ifdef INET6
797 laddr6 = NULL;
798 if ((inp->inp_vflag & INP_IPV6) != 0) {
799 if (lsa != NULL)
800 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
801 if (fsa != NULL)
802 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
803 }
804 #endif
805
806 tmpinp = NULL;
807
808 if (V_ipport_randomized)
809 *lastport = first + (arc4random() % (last - first));
810
811 count = last - first;
812
813 do {
814 if (count-- < 0) /* completely used? */
815 return (EADDRNOTAVAIL);
816 ++*lastport;
817 if (*lastport < first || *lastport > last)
818 *lastport = first;
819 lport = htons(*lastport);
820
821 if (fsa != NULL) {
822 #ifdef INET
823 if (lsa->sa_family == AF_INET) {
824 tmpinp = in_pcblookup_internal(pcbinfo,
825 faddr, fport, laddr, lport, lookupflags,
826 M_NODOM, RT_ALL_FIBS);
827 }
828 #endif
829 #ifdef INET6
830 if (lsa->sa_family == AF_INET6) {
831 tmpinp = in6_pcblookup_internal(pcbinfo,
832 faddr6, fport, laddr6, lport, lookupflags,
833 M_NODOM, RT_ALL_FIBS);
834 }
835 #endif
836 } else {
837 #ifdef INET6
838 if ((inp->inp_vflag & INP_IPV6) != 0) {
839 tmpinp = in6_pcblookup_local(pcbinfo,
840 &inp->in6p_laddr, lport, RT_ALL_FIBS,
841 lookupflags, cred);
842 #ifdef INET
843 if (tmpinp == NULL &&
844 (inp->inp_vflag & INP_IPV4))
845 tmpinp = in_pcblookup_local(pcbinfo,
846 laddr, lport, RT_ALL_FIBS,
847 lookupflags, cred);
848 #endif
849 }
850 #endif
851 #if defined(INET) && defined(INET6)
852 else
853 #endif
854 #ifdef INET
855 tmpinp = in_pcblookup_local(pcbinfo, laddr,
856 lport, RT_ALL_FIBS, lookupflags, cred);
857 #endif
858 }
859 } while (tmpinp != NULL);
860
861 *lportp = lport;
862
863 return (0);
864 }
865
866 /*
867 * Select a local port (number) to use.
868 */
869 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)870 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
871 struct ucred *cred, int lookupflags)
872 {
873 struct sockaddr_in laddr;
874
875 if (laddrp) {
876 bzero(&laddr, sizeof(laddr));
877 laddr.sin_family = AF_INET;
878 laddr.sin_addr = *laddrp;
879 }
880 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
881 NULL, lportp, NULL, 0, cred, lookupflags));
882 }
883 #endif /* INET || INET6 */
884
#ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port
 * tuple.
 *
 * Checks, in order: multicast SO_REUSEADDR/SO_REUSEPORT equivalence,
 * that a specific (non-wildcard) address is locally assigned unless
 * INP_BINDANY is set, reserved-port privilege, the restriction that
 * SO_REUSE* sharing only works between sockets of the same owner, and
 * finally conflicts with existing bindings honoring the REUSE options.
 * Returns 0 when available, else EADDRNOTAVAIL/EACCES/EADDRINUSE.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Reserved port range is protected by privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail.  In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			    in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/* v4 wildcard vs. v6 socket (or vice versa) may coexist. */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
978
979 /*
980 * Set up a bind operation on a PCB, performing port allocation
981 * as required, but do not actually modify the PCB. Callers can
982 * either complete the bind by setting inp_laddr/inp_lport and
983 * calling in_pcbinshash(), or they can just use the resulting
984 * port and address to authorise the sending of a once-off packet.
985 *
986 * On error, the values of *laddrp and *lportp are not changed.
987 */
988 static int
in_pcbbind_setup_locked(struct inpcb * inp,struct sockaddr_in * sin,in_addr_t * laddrp,u_short * lportp,int flags,struct ucred * cred)989 in_pcbbind_setup_locked(struct inpcb *inp, struct sockaddr_in *sin,
990 in_addr_t *laddrp, u_short *lportp, int flags, struct ucred *cred)
991 {
992 struct socket *so = inp->inp_socket;
993 struct in_addr laddr;
994 u_short lport = 0;
995 int error, fib, lookupflags, sooptions;
996
997 /*
998 * No state changes, so read locks are sufficient here.
999 */
1000 INP_LOCK_ASSERT(inp);
1001 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1002
1003 laddr.s_addr = *laddrp;
1004 if (sin != NULL && laddr.s_addr != INADDR_ANY)
1005 return (EINVAL);
1006
1007 lookupflags = 0;
1008 sooptions = atomic_load_int(&so->so_options);
1009 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
1010 lookupflags = INPLOOKUP_WILDCARD;
1011 if (sin == NULL) {
1012 if ((error = prison_local_ip4(cred, &laddr)) != 0)
1013 return (error);
1014 } else {
1015 KASSERT(sin->sin_family == AF_INET,
1016 ("%s: invalid family for address %p", __func__, sin));
1017 KASSERT(sin->sin_len == sizeof(*sin),
1018 ("%s: invalid length for address %p", __func__, sin));
1019
1020 error = prison_local_ip4(cred, &sin->sin_addr);
1021 if (error)
1022 return (error);
1023 if (sin->sin_port != *lportp) {
1024 /* Don't allow the port to change. */
1025 if (*lportp != 0)
1026 return (EINVAL);
1027 lport = sin->sin_port;
1028 }
1029 laddr = sin->sin_addr;
1030
1031 fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
1032 RT_ALL_FIBS;
1033
1034 /* See if this address/port combo is available. */
1035 error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
1036 lookupflags, cred);
1037 if (error != 0)
1038 return (error);
1039 }
1040 if (*lportp != 0)
1041 lport = *lportp;
1042 if (lport == 0) {
1043 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
1044 if (error != 0)
1045 return (error);
1046 }
1047 *laddrp = laddr.s_addr;
1048 *lportp = lport;
1049 if ((flags & INPBIND_FIB) != 0)
1050 inp->inp_flags |= INP_BOUNDFIB;
1051 return (0);
1052 }
1053
1054 int
in_pcbbind_setup(struct inpcb * inp,struct sockaddr_in * sin,in_addr_t * laddrp,u_short * lportp,int flags,struct ucred * cred)1055 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
1056 u_short *lportp, int flags, struct ucred *cred)
1057 {
1058 int error;
1059
1060 INP_HASH_WLOCK(inp->inp_pcbinfo);
1061 error = in_pcbbind_setup_locked(inp, sin, laddrp, lportp, flags, cred);
1062 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1063
1064 return (error);
1065 }
1066
1067 #ifdef INET
/*
 * Bind an inpcb to the address/port in sin (or pick them if sin is NULL
 * or the port is zero) and insert the pcb into the hash tables.  On
 * failure the pcb's local address, port and FIB binding are left clear.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int error;
	bool anonport;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);

	/* Refuse to bind a pcb that is already bound. */
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;

	INP_HASH_WLOCK(inp->inp_pcbinfo);
	error = in_pcbbind_setup_locked(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error) {
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		return (error);
	}
	/*
	 * Per the MPASS below, hash insertion can fail only for
	 * SO_REUSEPORT_LB sockets; undo the bind in that case.
	 */
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1105 #endif
1106
1107 /*
1108 * Connect from a socket to a specified address.
1109 * Both address and port must be specified in argument sin.
1110 * If don't have a local address for this socket yet,
1111 * then pick one.
1112 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	/* Remember whether an ephemeral local port must be chosen below. */
	anonport = (inp->inp_lport == 0);

	/* Rewrite wildcard/broadcast destinations before connecting. */
	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address.  If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
		 */
		if (in_nullhost(sin->sin_addr)) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
		    CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags
		    & IFF_BROADCAST) {
			faddr = satosin(&CK_STAILQ_FIRST(
			    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		} else
			faddr = sin->sin_addr;
	} else
		faddr = sin->sin_addr;

	/* Pick a local address if the socket is not yet bound to one. */
	INP_HASH_WLOCK(inp->inp_pcbinfo);
	if (in_nullhost(inp->inp_laddr)) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		if (__predict_false(error)) {
			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
			return (error);
		}
	} else
		laddr = inp->inp_laddr;

	if (anonport) {
		/*
		 * Allocate an ephemeral port, taking the destination into
		 * account so the resulting 4-tuple is usable.
		 */
		struct sockaddr_in lsin = {
			.sin_family = AF_INET,
			.sin_addr = laddr,
		};
		struct sockaddr_in fsin = {
			.sin_family = AF_INET,
			.sin_addr = faddr,
		};

		error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin,
		    &lport, (struct sockaddr *)&fsin, sin->sin_port, cred,
		    INPLOOKUP_WILDCARD);
		if (__predict_false(error)) {
			INP_HASH_WUNLOCK(inp->inp_pcbinfo);
			return (error);
		}
	} else if (in_pcblookup_internal(inp->inp_pcbinfo, faddr,
	    sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) !=
	    NULL) {
		/* Already bound: the resulting 4-tuple must be unique. */
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		return (EADDRINUSE);
	} else
		lport = inp->inp_lport;

	MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 ||
	    (inp->inp_flags & INP_UNCONNECTED));

	/* Commit the connection and (re)insert into the connected hash. */
	inp->inp_faddr = faddr;
	inp->inp_fport = sin->sin_port;
	inp->inp_laddr = laddr;
	inp->inp_lport = lport;

	if (inp->inp_flags & INP_UNCONNECTED) {
		error = in_pcbinshash(inp);
		MPASS(error == 0);
	} else
		in_pcbrehash(inp);
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);

	/* Optionally precompute a software flow id for outbound hashing. */
	if (V_fib_hash_outbound) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(inp->inp_laddr,
		    inp->inp_faddr, 0, sin->sin_port,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1223
1224 /*
1225 * Do proper source address selection on an unbound socket in case
1226 * of connect. Take jails into account as well.
1227 */
int
in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr,
    struct in_addr *laddr, struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	/*
	 * If the destination address is multicast and an outgoing
	 * interface has been set as a multicast option, prefer the
	 * address of that interface as our source address.
	 */
	if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL &&
	    inp->inp_moptions->imo_multicast_ifp != NULL) {
		struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp;
		struct in_ifaddr *ia;

		CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
			if (ia->ia_ifp == ifp &&
			    prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)
				break;
		}
		if (ia == NULL)
			return (EADDRNOTAVAIL);
		*laddr = ia->ia_addr.sin_addr;
		return (0);
	}

	error = 0;

	/* Build a sockaddr_in for the interface-address lookups below. */
	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: use the matched interface address directly. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: look for an address on that interface that
		 * belongs to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 * belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 * belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Never hand back INADDR_ANY as the selected source address. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1455
/*
 * Disconnect a pcb: remove it from the connectivity hash and place it
 * on the pcbinfo's unconnected list.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	/* Nothing to do if already moved to the unconnected list. */
	if (inp->inp_flags & INP_UNCONNECTED)
		return;

	INP_HASH_WLOCK(inp->inp_pcbinfo);
	in_pcbremhash(inp);
	CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
	    inp_unconn_list);
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
	inp->inp_flags |= INP_UNCONNECTED;

	/*
	 * For protocols that don't require a connection (PR_CONNREQUIRED
	 * clear), also clear the foreign address/port.
	 */
	if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		/* See the comment in in_pcbinshash(). */
		inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
		inp->inp_faddr.s_addr = INADDR_ANY;
		inp->inp_fport = 0;
	}
}
1481 #endif /* INET */
1482
1483 void
in_pcblisten(struct inpcb * inp)1484 in_pcblisten(struct inpcb *inp)
1485 {
1486 struct inpcblbgroup *grp;
1487
1488 INP_WLOCK_ASSERT(inp);
1489
1490 if ((inp->inp_flags & INP_INLBGROUP) != 0) {
1491 struct inpcbinfo *pcbinfo;
1492
1493 pcbinfo = inp->inp_pcbinfo;
1494 INP_HASH_WLOCK(pcbinfo);
1495 grp = in_pcblbgroup_find(inp);
1496 LIST_REMOVE(inp, inp_lbgroup_list);
1497 grp->il_pendcnt--;
1498 in_pcblbgroup_insert(grp, inp);
1499 INP_HASH_WUNLOCK(pcbinfo);
1500 }
1501 }
1502
1503 /*
1504 * inpcb hash lookups are protected by SMR section.
1505 *
1506 * Once desired pcb has been found, switching from SMR section to a pcb
1507 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1508 * here because SMR is a critical section.
1509 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1510 */
1511 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1512 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1513 {
1514
1515 lock == INPLOOKUP_RLOCKPCB ?
1516 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1517 }
1518
1519 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1520 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1521 {
1522
1523 lock == INPLOOKUP_RLOCKPCB ?
1524 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1525 }
1526
1527 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1528 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1529 {
1530
1531 return (lock == INPLOOKUP_RLOCKPCB ?
1532 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1533 }
1534
/*
 * Transition from the SMR section to a locked pcb, skipping pcbs whose
 * inp_flags intersect ignflags.  On success returns true with the pcb
 * locked and the SMR section exited; on failure returns false with
 * neither held.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is uncontested. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/* Slow path: pin the pcb with a reference, then block on the lock. */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and have another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1573
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions shall ignore not only pcbs that
	 * had been freed that may be found due to lockless access to the hash,
	 * but also pcbs that were removed from the hash, but are still around.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_UNCONNECTED));
}
1585
1586 /*
1587 * inp_next() - inpcb hash/list traversal iterator
1588 *
1589 * Requires initialized struct inpcb_iterator for context.
1590 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1591 *
1592 * - Iterator can have either write-lock or read-lock semantics, that can not
1593 * be changed later.
1594 * - Iterator has three modes of operation, defined by value of .hash member
1595 * on the first call:
1596 * - .hash = INP_ALL_LIST: the iterator will go through the unconnected
1597 * list, then all wildcard hash slots and then all exact hash slots.
1598 * - .hash = INP_UNCONN_LIST: the iterator will go through the list of
1599 * unconnected pcbs only.
1600 * - .hash initialized with an arbitrary positive value: iterator will go
1601 * through this exact hash slot only.
1602 * Note: only rip_input() and sysctl_setsockopt() use the latter.
1603 * The interface may be extended for iteration over single wildcard hash
1604 * slot, but there is no use case for that today.
1605 * - Iterator may have optional bool matching function. The matching function
1606 * will be executed for each inpcb in the SMR context, so it can not acquire
1607 * locks and can safely access only immutable fields of inpcb.
1608 *
1609 * A fresh initialized iterator has NULL inpcb in its context and that
1610 * means that inp_next() call would return the very first inpcb on the list
1611 * locked with desired semantic. In all following calls the context pointer
1612 * shall hold the current inpcb pointer. The KPI user is not supposed to
1613 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1614 * and write NULL to its context. After end of traversal an iterator can be
1615 * reused.
1616 *
1617 * List traversals have the following features/constraints:
1618 * - New entries won't be seen, as they are always added to the head of a list.
1619 * - Removed entries won't stop traversal as long as they are not added to
1620 * a different list. This is violated by in_pcbrehash().
1621 */
1622 static inline struct inpcb *
ii_list_first(const struct inpcb_iterator * ii)1623 ii_list_first(const struct inpcb_iterator *ii)
1624 {
1625 const struct inpcbinfo *ipi = ii->ipi;
1626 const int hash = ii->hash;
1627
1628 if (hash < 0)
1629 return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
1630 else if (hash <= ipi->ipi_hashmask)
1631 return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
1632 else
1633 return (CK_LIST_FIRST(
1634 &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
1635 }
1636
1637 static inline struct inpcb *
ii_list_next(const struct inpcb_iterator * ii,struct inpcb * inp)1638 ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
1639 {
1640 if (ii->hash < 0)
1641 return (CK_LIST_NEXT(inp, inp_unconn_list));
1642 else if (ii->hash <= ii->ipi->ipi_hashmask)
1643 return (CK_LIST_NEXT(inp, inp_hash_wild));
1644 else
1645 return (CK_LIST_NEXT(inp, inp_hash_exact));
1646 }
1647
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	const int hashmax = (ipi->ipi_hashmask + 1) * 2;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		if ((ii->hash = ii->mode) >= 0) {
			/* Targeted iterators support only the exact hash. */
			MPASS(ii->hash <= ipi->ipi_hashmask);
			ii->hash += ipi->ipi_hashmask + 1;
		}
		smr_enter(ipi->ipi_smr);
next_first:
		/* This is unrolled CK_LIST_FOREACH() over different headers. */
		for (inp = ii_list_first(ii);
		    inp != NULL;
		    inp = ii_list_next(ii, inp)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * The failed lock attempt dropped the SMR
				 * section; re-enter and start the current
				 * list over from its head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != ii_list_first(ii));
				inp = ii_list_first(ii);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL) {
			/* Move on to the next hash slot, if any remain. */
			if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
				goto next_first;
			smr_exit(ipi->ipi_smr);
		} else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	rw_assert(&inp->inp_lock,
	    lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
next:
	inp = ii_list_next(ii, inp);
	if (inp == NULL) {
		if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
			inp_unlock(ii->inp, lock);
			ii->inp = NULL;
			goto next_first;
		}
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'next' should yield in the same result, but
			 * could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previously returned pcb and record the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1761
1762 /*
1763 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1764 * stability of an inpcb pointer despite the inpcb lock being released or
1765 * SMR section exited.
1766 *
1767 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
1768 */
1769 void
in_pcbref(struct inpcb * inp)1770 in_pcbref(struct inpcb *inp)
1771 {
1772 u_int old __diagused;
1773
1774 old = refcount_acquire(&inp->inp_refcount);
1775 KASSERT(old > 0, ("%s: refcount 0", __func__));
1776 }
1777
1778 /*
1779 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
1780 * freeing the pcb, if the reference was very last.
1781 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/*
	 * This was the last reference: the pcb has already gone through
	 * in_pcbfree() (INP_FREED set, socket detached).  Unlock before
	 * handing the memory back to the SMR-managed zone.
	 */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1801
/*
 * As in_pcbrele_rlocked(), but with the pcb write lock held.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: release credentials, unlock, and free the pcb. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1821
1822 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1823 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1824 {
1825
1826 return (lock == INPLOOKUP_RLOCKPCB ?
1827 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1828 }
1829
1830 /*
1831 * Dereference and rlock inp, for which the caller must own the
1832 * reference. Returns true if inp no longer usable, false otherwise.
1833 */
bool
in_pcbrele_rlock(struct inpcb *inp)
{
	INP_RLOCK(inp);
	/* Dropping the last reference frees and unlocks the pcb. */
	if (in_pcbrele_rlocked(inp))
		return (true);
	/* Still referenced but already freed: not usable, drop the lock. */
	if ((inp->inp_flags & INP_FREED) != 0) {
		INP_RUNLOCK(inp);
		return (true);
	}
	return (false);
}
1846
1847 /*
1848 * Unconditionally schedule an inpcb to be freed by decrementing its
1849 * reference count, which should occur only after the inpcb has been detached
1850 * from its socket. If another thread holds a temporary reference (acquired
1851 * using in_pcbref()) then the free is deferred until that reference is
1852 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
1853 * Almost all work, including removal from global lists, is done in this
1854 * context, where the pcbinfo lock is held.
1855 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	INP_HASH_WLOCK(pcbinfo);
	if (inp->inp_flags & INP_UNCONNECTED)
		CK_LIST_REMOVE(inp, inp_unconn_list);
	else
		in_pcbremhash(inp);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	INP_HASH_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark the pcb freed and detach it from its socket. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference; if temporary references remain the pcb
	 * lingers until the last holder calls in_pcbrele_*().
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
	/* Multicast options are freed only after the pcb lock is dropped. */
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1929
1930 /*
1931 * Different protocols initialize their inpcbs differently - giving
1932 * different name to the lock. But they all are disposed the same.
1933 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	/* Zone fini callback: only the pcb lock needs disposal here. */
	INP_LOCK_DESTROY(inp);
}
1941
1942 #ifdef INET
1943 /*
1944 * Common routines to return the socket addresses associated with inpcbs.
1945 */
1946 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1947 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1948 {
1949 struct inpcb *inp;
1950
1951 inp = sotoinpcb(so);
1952 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1953
1954 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1955 .sin_len = sizeof(struct sockaddr_in),
1956 .sin_family = AF_INET,
1957 .sin_port = inp->inp_lport,
1958 .sin_addr = inp->inp_laddr,
1959 };
1960
1961 return (0);
1962 }
1963
1964 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1965 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1966 {
1967 struct inpcb *inp;
1968
1969 inp = sotoinpcb(so);
1970 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1971
1972 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1973 .sin_len = sizeof(struct sockaddr_in),
1974 .sin_family = AF_INET,
1975 .sin_port = inp->inp_fport,
1976 .sin_addr = inp->inp_faddr,
1977 };
1978
1979 return (0);
1980 }
1981
1982 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1983 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1984 {
1985
1986 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1987 return (true);
1988 else
1989 return (false);
1990 }
1991
/*
 * Purge per-socket IPv4 multicast state that references an interface
 * being detached: clear the selected outgoing interface and drop any
 * group memberships joined via that interface.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Restart after each removal: removal mutates the list. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
2034
2035 /*
2036 * Lookup a PCB based on the local address and port. Caller must hold the
2037 * hash lock. No inpcb locks or references are acquired.
2038 */
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	/* A v4-mapped v6 PCB may add this extra cost (see below). */
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			/* Exact local address/port, and FIB if requested. */
			if (inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			/* Each wildcard mismatch makes the fit worse. */
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			/* Keep the lowest-cost candidate seen so far. */
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2147
2148 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2149 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2150 {
2151 return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2152 (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2153 }
2154
/*
 * Select a PCB from a SO_REUSEPORT_LB group matching { laddr, lport }.
 * The packet's 4-tuple hash distributes connections among the group's
 * member PCBs.  Returns NULL if no group matches or the chosen group is
 * empty.  Caller must hold the hash lock and be in a network epoch.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Jailed groups must be permitted to use laddr. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best candidate in preference order. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	/* Hash the 4-tuple to pick a member; modulo keeps it in range. */
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2238
2239 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2240 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2241 u_short fport, struct in_addr laddr, u_short lport)
2242 {
2243 #ifdef INET6
2244 /* XXX inp locking */
2245 if ((inp->inp_vflag & INP_IPV4) == 0)
2246 return (false);
2247 #endif
2248 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2249 inp->inp_laddr.s_addr == laddr.s_addr &&
2250 inp->inp_fport == fport &&
2251 inp->inp_lport == lport)
2252 return (true);
2253 return (false);
2254 }
2255
/*
 * Walk the exact-match (connected) hash chain for { faddr:fport,
 * laddr:lport } and return the first matching PCB, or NULL.  Caller must
 * hold the hash lock; no PCB lock or reference is taken.
 */
static struct inpcb *
in_pcblookup_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2273
/*
 * Result of matching a PCB against { laddr, lport } in a wildcard lookup:
 * no match, a match on a wildcard-bound PCB, or a stronger match on a PCB
 * bound to the exact local address.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* PCB does not match. */
	INPLOOKUP_MATCH_WILD = 1,	/* PCB is bound to INADDR_ANY. */
	INPLOOKUP_MATCH_LADDR = 2,	/* PCB is bound to the exact laddr. */
} inp_lookup_match_t;
2279
/*
 * Classify how well an unconnected PCB matches { laddr, lport } under the
 * optional FIB constraint.  Only PCBs with a wildcard foreign address are
 * eligible; IPv6-only PCBs never match.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	/* Connected PCBs and wrong local ports never match here. */
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}
2299
2300 #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)
2301
/*
 * Lock-free wildcard lookup of { laddr, lport } in the wild hash chain.
 * On success, returns the PCB locked per 'lockflags'.  Returns
 * INP_LOOKUP_AGAIN if the first candidate found could not be locked or
 * revalidated, telling the caller to retry with the hash lock held.
 * Must be called within an SMR read section.
 */
static struct inpcb *
in_pcblookup_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate under the PCB lock; it may have moved. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2338
/*
 * Wildcard lookup of { laddr, lport } with the hash lock held, applying
 * the full socket-preference policy (see the ordering comment below).
 * Returns the best-matching unconnected PCB, or NULL.  No PCB lock or
 * reference is taken.
 */
static struct inpcb *
in_pcblookup_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* A jailed PCB must be allowed to use laddr. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed exact match already beats anything
			 * else a non-jailed PCB could offer. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			/* Jailed + exact local address: best possible. */
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	/* Fall back through the preference order. */
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2411
/*
 * Lookup PCB in hash list, using pcbinfo tables.  The caller must hold
 * the pcbinfo hash lock exclusively (asserted below), which usually
 * happens for bind(2) operations.  The lock-free path for sorting out
 * incoming packets is handled separately by in_pcblookup_smr().
 */
/*
 * Core 4-tuple lookup: try an exact (connected) match first, then, if
 * INPLOOKUP_WILDCARD is set, the load-balance groups and finally the
 * wildcard chains.  Caller must hold the hash write lock (asserted
 * below); no PCB lock or reference is taken.
 */
static struct inpcb *
in_pcblookup_internal(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	inp = in_pcblookup_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		/* LB groups take precedence over plain wildcard PCBs. */
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
2449
2450 /*
2451 * Lookup inpcb using locks. Used by in_pcblookup_smr() in case inp_smr_lock()
2452 * failed.
2453 */
static struct inpcb *
in_pcblookup_with_lock(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_internal(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * Lock order is PCB before hash, so take a reference, drop
		 * the hash lock, and block on the PCB lock instead.  The
		 * reference keeps the PCB from being freed meanwhile.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2480
/*
 * Lock-free (SMR) 4-tuple lookup used on the packet input path.  Tries an
 * exact match, then LB groups, then wildcard PCBs, revalidating every
 * candidate after its PCB lock is taken.  Falls back to the serialized
 * in_pcblookup_with_lock() whenever a candidate cannot be locked or
 * revalidated.  On success the PCB is returned locked per the caller's
 * lock flags; the SMR section is exited on every return path.
 */
static struct inpcb *
in_pcblookup_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_with_lock(pcbinfo, faddr, fport, laddr,
		    lport, lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			/* Lock and revalidate the LB-group candidate. */
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_with_lock(pcbinfo, faddr, fport,
			    laddr, lport, lookupflags, numa_domain, fib));
		}
	}

	/* No PCB found and thus no PCB lock held: leave the SMR section. */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2543
/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which additional packet metadata (the FIB number and the NUMA domain
 * hint) may be extracted.
 */
2548 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2549 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2550 struct in_addr laddr, u_int lport, int lookupflags,
2551 struct ifnet *ifp)
2552 {
2553 int fib;
2554
2555 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2556 return (in_pcblookup_smr(pcbinfo, faddr, fport, laddr, lport,
2557 lookupflags, M_NODOM, fib));
2558 }
2559
2560 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2561 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2562 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2563 struct ifnet *ifp __unused, struct mbuf *m)
2564 {
2565 int fib;
2566
2567 M_ASSERTPKTHDR(m);
2568 fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2569 return (in_pcblookup_smr(pcbinfo, faddr, fport, laddr, lport,
2570 lookupflags, m->m_pkthdr.numa_domain, fib));
2571 }
2572 #endif /* INET */
2573
/* Is the given prison flag (e.g. PR_IP4, PR_IP6) set on the PCB's cred? */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	return (prison_flag(inp->inp_cred, flag) != 0);
}
2579
2580 /*
2581 * Insert the PCB into a hash chain using ordering rules which ensure that
2582 * in_pcblookup_wild_*() always encounter the highest-ranking PCB first.
2583 *
2584 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
2585 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
2586 * always appear last no matter whether they are jailed.
2587 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/*
	 * An unbound v4-mapped v6 PCB ranks lowest of all: append it to
	 * the tail of the chain unconditionally.
	 */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* Empty chain. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCBs always go before non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/*
		 * Wildcard-bound: continue past the exact-bound PCBs of
		 * the same jail class, resuming at 'last' from above.
		 */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2640
2641 #ifdef INET6
2642 /*
2643 * See the comment above _in_pcbinshash_wild().
2644 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCBs always go before non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/*
		 * Wildcard-bound: continue past the exact-bound PCBs of
		 * the same jail class, resuming at 'last' from above.
		 */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2686 #endif
2687
2688 /*
2689 * Insert PCB onto various hash lists.
2690 *
2691 * With normal sockets this function shall not fail, so it could return void.
2692 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
2693 * that's the only condition when it can fail.
2694 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash, *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	MPASS(inp->inp_flags & INP_UNCONNECTED);

	/* Pick the hash bucket and decide exact vs. wild chain. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Ignore SO_REUSEPORT_LB if the socket is connected. Really this case
	 * should be an error, but for UDP sockets it is not, and some
	 * applications erroneously set it on connected UDP sockets, so we can't
	 * change this without breaking compatibility.
	 */
	if (!connected &&
	    (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		/* The only failure mode: LB group allocation (see above). */
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	CK_LIST_REMOVE(inp, inp_unconn_list);

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wild chains are kept in lookup-preference order. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
	inp->inp_flags &= ~INP_UNCONNECTED;

	return (0);
}
2768
/*
 * Remove the PCB from the hash chains it was placed on by
 * in_pcbinshash(): the LB group (if any), the exact or wild 4-tuple
 * chain, and the port chain.
 */
void
in_pcbremhash(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(!(inp->inp_flags & INP_UNCONNECTED));

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
	/* An unspecified foreign address means the PCB is on a wild chain. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
}
2795
2796 /*
2797 * Move PCB to the proper hash bucket when { faddr, fport } have been
2798 * changed. NOTE: This does not handle the case of the lport changing (the
2799 * hashed port list would have to be updated as well), so the lport must
2800 * not change after in_pcbinshash() has been called.
2801 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	MPASS(!(inp->inp_flags & INP_UNCONNECTED));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* Recompute the bucket from the (new) foreign address/port. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* See the comment in in_pcbinshash(). */
	if (connected && (inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	if (connected) {
		/* Moving wild -> exact. */
		CK_LIST_REMOVE(inp, inp_hash_wild);
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		/* Moving exact -> wild. */
		CK_LIST_REMOVE(inp, inp_hash_exact);
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2847
/*
 * Move a raw-IP PCB from the unconnected list onto its exact-match hash
 * chain and clear INP_UNCONNECTED.  Caller holds the PCB write lock.
 */
void
ripcb_connect(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	uint32_t hash;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_UNCONNECTED);

	hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask;

	INP_HASH_WLOCK(pcbinfo);
	CK_LIST_REMOVE(inp, inp_unconn_list);
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp,
	    inp_hash_exact);
	INP_HASH_WUNLOCK(pcbinfo);
	inp->inp_flags &= ~INP_UNCONNECTED;
}
2866
/*
 * Move a raw-IP PCB from its exact-match hash chain back onto the
 * unconnected list and set INP_UNCONNECTED.  No-op if the PCB is already
 * unconnected.  Caller holds the PCB write lock.
 */
void
ripcb_disconnect(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);

	if (inp->inp_flags & INP_UNCONNECTED)
		return;

	INP_HASH_WLOCK(pcbinfo);
	CK_LIST_REMOVE(inp, inp_hash_exact);
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
	INP_HASH_WUNLOCK(pcbinfo);
	inp->inp_flags |= INP_UNCONNECTED;
}
2883
2884 /*
2885 * Check for alternatives when higher level complains
2886 * about service problems. For now, invalidate cached
2887 * routing information. If the route was created dynamically
2888 * (by a redirect), time to try a default gateway again.
2889 */
2890 void
in_losing(struct inpcb * inp)2891 in_losing(struct inpcb *inp)
2892 {
2893
2894 RO_INVALIDATE_CACHE(&inp->inp_route);
2895 return;
2896 }
2897
2898 /*
2899 * A set label operation has occurred at the socket layer, propagate the
2900 * label change into the in_pcb for the socket.
2901 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Both locks are needed: the label spans socket and PCB state. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2918
/* Exported wrapper: acquire the PCB write lock. */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2925
/* Exported wrapper: release the PCB write lock. */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2932
/* Exported wrapper: acquire the PCB read lock. */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2939
/* Exported wrapper: release the PCB read lock. */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2946
2947 #ifdef INVARIANT_SUPPORT
/* Exported wrapper: assert that the PCB write lock is held. */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2954
/* Exported wrapper: assert that the PCB lock is not held. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2961 #endif
2962
/*
 * Invoke 'func' with 'arg' on every PCB in 'pcbinfo'.  Each PCB is
 * write-locked for the duration of its callback.
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}
2974
/* Return the socket backing this PCB; the write lock must be held. */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2982
/*
 * Copy the PCB's 4-tuple — local and foreign addresses and ports — into
 * the caller-provided storage.  The PCB must be locked.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
2994
2995 /*
2996 * Create an external-format (``xinpcb'') structure using the information in
2997 * the kernel-format in_pcb structure pointed to by inp. This is done to
2998 * reduce the spew of irrelevant information over this interface, to isolate
2999 * user code from changes in the kernel structure, and potentially to provide
3000 * information-hiding if we decide that some of this information should be
3001 * hidden from users.
3002 */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	/* Zero first so fields not copied below are well-defined. */
	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
3026
3027 int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS,struct inpcbinfo * pcbinfo,int (* ctloutput_set)(struct inpcb *,struct sockopt *))3028 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
3029 int (*ctloutput_set)(struct inpcb *, struct sockopt *))
3030 {
3031 struct sockopt sopt;
3032 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
3033 INPLOOKUP_WLOCKPCB);
3034 struct inpcb *inp;
3035 struct sockopt_parameters *params;
3036 struct socket *so;
3037 int error;
3038 char buf[1024];
3039
3040 if (req->oldptr != NULL || req->oldlen != 0)
3041 return (EINVAL);
3042 if (req->newptr == NULL)
3043 return (EPERM);
3044 if (req->newlen > sizeof(buf))
3045 return (ENOMEM);
3046 error = SYSCTL_IN(req, buf, req->newlen);
3047 if (error != 0)
3048 return (error);
3049 if (req->newlen < sizeof(struct sockopt_parameters))
3050 return (EINVAL);
3051 params = (struct sockopt_parameters *)buf;
3052 sopt.sopt_level = params->sop_level;
3053 sopt.sopt_name = params->sop_optname;
3054 sopt.sopt_dir = SOPT_SET;
3055 sopt.sopt_val = params->sop_optval;
3056 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
3057 sopt.sopt_td = NULL;
3058 #ifdef INET6
3059 if (params->sop_inc.inc_flags & INC_ISIPV6) {
3060 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr))
3061 params->sop_inc.inc6_laddr.s6_addr16[1] =
3062 htons(params->sop_inc.inc6_zoneid & 0xffff);
3063 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr))
3064 params->sop_inc.inc6_faddr.s6_addr16[1] =
3065 htons(params->sop_inc.inc6_zoneid & 0xffff);
3066 }
3067 #endif
3068 if (params->sop_inc.inc_lport != htons(0) &&
3069 params->sop_inc.inc_fport != htons(0)) {
3070 #ifdef INET6
3071 if (params->sop_inc.inc_flags & INC_ISIPV6)
3072 inpi.hash = INP6_PCBHASH(
3073 ¶ms->sop_inc.inc6_faddr,
3074 params->sop_inc.inc_lport,
3075 params->sop_inc.inc_fport,
3076 pcbinfo->ipi_hashmask);
3077 else
3078 #endif
3079 inpi.hash = INP_PCBHASH(
3080 ¶ms->sop_inc.inc_faddr,
3081 params->sop_inc.inc_lport,
3082 params->sop_inc.inc_fport,
3083 pcbinfo->ipi_hashmask);
3084 }
3085 while ((inp = inp_next(&inpi)) != NULL)
3086 if (inp->inp_gencnt == params->sop_id) {
3087 /*
3088 * XXXGL
3089 * 1) the inp_next() that ignores INP_UNCONNECTED needs
3090 * to be generally supported.
3091 * 2) Why do we ECONNRESET instead of continueing?
3092 */
3093 if (inp->inp_flags & INP_UNCONNECTED) {
3094 INP_WUNLOCK(inp);
3095 return (ECONNRESET);
3096 }
3097 so = inp->inp_socket;
3098 KASSERT(so != NULL, ("inp_socket == NULL"));
3099 soref(so);
3100 if (params->sop_level == SOL_SOCKET) {
3101 INP_WUNLOCK(inp);
3102 error = sosetopt(so, &sopt);
3103 } else
3104 error = (*ctloutput_set)(inp, &sopt);
3105 sorele(so);
3106 break;
3107 }
3108 if (inp == NULL)
3109 error = ESRCH;
3110 return (error);
3111 }
3112
3113 #ifdef DDB
/*
 * Emit `indent' spaces to the debugger console (no-op when indent <= 0).
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
3122
/*
 * Pretty-print a struct in_conninfo for DDB: its address, then the
 * local and foreign address/port pairs, formatted per address family
 * and indented by `indent' (+2 for the members).
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	/* 48 bytes is enough for a printable IPv6 address. */
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3152
/*
 * Pretty-print an inpcb for DDB: flow/label, connection info, flag
 * words, and the per-address-family option state, indented by `indent'.
 */
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x inp_label: %p\n", inp->inp_flow,
	    inp->inp_label);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	/* %b decodes the flag words using the _BITS format strings. */
	db_print_indent(indent);
	db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS);

	db_print_indent(indent);
	db_printf("inp_sp: %p inp_vflag: 0x%b\n", inp->inp_sp,
	    inp->inp_vflag, INP_VFLAGS_BITS);

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		/* IPv6-specific pcb state. */
		db_print_indent(indent);
		db_printf("in6p_options: %p in6p_outputopts: %p "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_print_indent(indent);
		db_printf("in6p_icmp6filt: %p in6p_cksum %d "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		/* IPv4-specific pcb state. */
		db_print_indent(indent);
		db_printf("inp_ip_tos: %d inp_ip_options: %p "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
}
3204
/*
 * DDB command "show inpcb <addr>": dump the inpcb at the given kernel
 * address.  The address argument is required.
 */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
3217 #endif /* DDB */
3218
3219 #ifdef RATELIMIT
3220 /*
3221 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3222 * if any.
3223 */
3224 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3225 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3226 {
3227 union if_snd_tag_modify_params params = {
3228 .rate_limit.max_rate = max_pacing_rate,
3229 .rate_limit.flags = M_NOWAIT,
3230 };
3231 struct m_snd_tag *mst;
3232 int error;
3233
3234 mst = inp->inp_snd_tag;
3235 if (mst == NULL)
3236 return (EINVAL);
3237
3238 if (mst->sw->snd_tag_modify == NULL) {
3239 error = EOPNOTSUPP;
3240 } else {
3241 error = mst->sw->snd_tag_modify(mst, ¶ms);
3242 }
3243 return (error);
3244 }
3245
3246 /*
3247 * Query existing TX rate limit based on the existing
3248 * "inp->inp_snd_tag", if any.
3249 */
3250 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3251 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3252 {
3253 union if_snd_tag_query_params params = { };
3254 struct m_snd_tag *mst;
3255 int error;
3256
3257 mst = inp->inp_snd_tag;
3258 if (mst == NULL)
3259 return (EINVAL);
3260
3261 if (mst->sw->snd_tag_query == NULL) {
3262 error = EOPNOTSUPP;
3263 } else {
3264 error = mst->sw->snd_tag_query(mst, ¶ms);
3265 if (error == 0 && p_max_pacing_rate != NULL)
3266 *p_max_pacing_rate = params.rate_limit.max_rate;
3267 }
3268 return (error);
3269 }
3270
3271 /*
3272 * Query existing TX queue level based on the existing
3273 * "inp->inp_snd_tag", if any.
3274 */
3275 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3276 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3277 {
3278 union if_snd_tag_query_params params = { };
3279 struct m_snd_tag *mst;
3280 int error;
3281
3282 mst = inp->inp_snd_tag;
3283 if (mst == NULL)
3284 return (EINVAL);
3285
3286 if (mst->sw->snd_tag_query == NULL)
3287 return (EOPNOTSUPP);
3288
3289 error = mst->sw->snd_tag_query(mst, ¶ms);
3290 if (error == 0 && p_txqueue_level != NULL)
3291 *p_txqueue_level = params.rate_limit.queue_level;
3292 return (error);
3293 }
3294
3295 /*
3296 * Allocate a new TX rate limit send tag from the network interface
3297 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3298 */
3299 int
in_pcbattach_txrtlmt(struct inpcb * inp,struct ifnet * ifp,uint32_t flowtype,uint32_t flowid,uint32_t max_pacing_rate,struct m_snd_tag ** st)3300 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
3301 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
3302
3303 {
3304 union if_snd_tag_alloc_params params = {
3305 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
3306 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
3307 .rate_limit.hdr.flowid = flowid,
3308 .rate_limit.hdr.flowtype = flowtype,
3309 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
3310 .rate_limit.max_rate = max_pacing_rate,
3311 .rate_limit.flags = M_NOWAIT,
3312 };
3313 int error;
3314
3315 INP_WLOCK_ASSERT(inp);
3316
3317 /*
3318 * If there is already a send tag, or the INP is being torn
3319 * down, allocating a new send tag is not allowed. Else send
3320 * tags may leak.
3321 */
3322 if (*st != NULL || (inp->inp_flags & INP_UNCONNECTED))
3323 return (EINVAL);
3324
3325 error = m_snd_tag_alloc(ifp, ¶ms, st);
3326 #ifdef INET
3327 if (error == 0) {
3328 counter_u64_add(rate_limit_set_ok, 1);
3329 counter_u64_add(rate_limit_active, 1);
3330 } else if (error != EOPNOTSUPP)
3331 counter_u64_add(rate_limit_alloc_fail, 1);
3332 #endif
3333 return (error);
3334 }
3335
/*
 * Release a send tag reference that has already been detached from its
 * inpcb, and account for it in the RATELIMIT statistics.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}
3345
3346 /*
3347 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3348 * if any:
3349 */
3350 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3351 in_pcbdetach_txrtlmt(struct inpcb *inp)
3352 {
3353 struct m_snd_tag *mst;
3354
3355 INP_WLOCK_ASSERT(inp);
3356
3357 mst = inp->inp_snd_tag;
3358 inp->inp_snd_tag = NULL;
3359
3360 if (mst == NULL)
3361 return;
3362
3363 m_snd_tag_rele(mst);
3364 #ifdef INET
3365 counter_u64_add(rate_limit_active, -1);
3366 #endif
3367 }
3368
/*
 * Reconcile the TX rate limit send tag on "inp" with the requested
 * "max_pacing_rate" and the current egress interface "ifp": drop a tag
 * that belongs to the wrong interface, attach a new one when needed and
 * possible, or modify the existing one.  The inpcb must be write-locked.
 * Returns EAGAIN when attachment must wait for a valid RSS hash.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag. Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No rate limit requested and none attached. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate limit: drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	/* On success (or no driver support) the pending change is done. */
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3418
3419 /*
3420 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3421 * is set in the fast path and will attach/detach/modify the TX rate
3422 * limit send tag based on the socket's so_max_pacing_rate value.
3423 */
3424 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3425 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3426 {
3427 struct socket *socket;
3428 uint32_t max_pacing_rate;
3429 bool did_upgrade;
3430
3431 if (inp == NULL)
3432 return;
3433
3434 socket = inp->inp_socket;
3435 if (socket == NULL)
3436 return;
3437
3438 if (!INP_WLOCKED(inp)) {
3439 /*
3440 * NOTE: If the write locking fails, we need to bail
3441 * out and use the non-ratelimited ring for the
3442 * transmit until there is a new chance to get the
3443 * write lock.
3444 */
3445 if (!INP_TRY_UPGRADE(inp))
3446 return;
3447 did_upgrade = 1;
3448 } else {
3449 did_upgrade = 0;
3450 }
3451
3452 /*
3453 * NOTE: The so_max_pacing_rate value is read unlocked,
3454 * because atomic updates are not required since the variable
3455 * is checked at every mbuf we send. It is assumed that the
3456 * variable read itself will be atomic.
3457 */
3458 max_pacing_rate = socket->so_max_pacing_rate;
3459
3460 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3461
3462 if (did_upgrade)
3463 INP_DOWNGRADE(inp);
3464 }
3465
3466 /*
3467 * Track route changes for TX rate limiting.
3468 */
3469 void
in_pcboutput_eagain(struct inpcb * inp)3470 in_pcboutput_eagain(struct inpcb *inp)
3471 {
3472 bool did_upgrade;
3473
3474 if (inp == NULL)
3475 return;
3476
3477 if (inp->inp_snd_tag == NULL)
3478 return;
3479
3480 if (!INP_WLOCKED(inp)) {
3481 /*
3482 * NOTE: If the write locking fails, we need to bail
3483 * out and use the non-ratelimited ring for the
3484 * transmit until there is a new chance to get the
3485 * write lock.
3486 */
3487 if (!INP_TRY_UPGRADE(inp))
3488 return;
3489 did_upgrade = 1;
3490 } else {
3491 did_upgrade = 0;
3492 }
3493
3494 /* detach rate limiting */
3495 in_pcbdetach_txrtlmt(inp);
3496
3497 /* make sure new mbuf send tag allocation is made */
3498 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3499
3500 if (did_upgrade)
3501 INP_DOWNGRADE(inp);
3502 }
3503
3504 #ifdef INET
/*
 * Allocate the RATELIMIT statistics counters at boot time.  The
 * SYSINIT argument "st" is unused.
 */
static void
rl_init(void *st)
{
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}

/* Run after protocol domains are initialized. */
SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3516 #endif
3517 #endif /* RATELIMIT */
3518