1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org>
9 * All rights reserved.
10 *
11 * Portions of this software were developed by Robert N. M. Watson under
12 * contract to Juniper Networks, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #include "opt_ddb.h"
41 #include "opt_ipsec.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_ratelimit.h"
45 #include "opt_route.h"
46 #include "opt_rss.h"
47
48 #include <sys/param.h>
49 #include <sys/hash.h>
50 #include <sys/systm.h>
51 #include <sys/libkern.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/eventhandler.h>
56 #include <sys/domain.h>
57 #include <sys/proc.h>
58 #include <sys/protosw.h>
59 #include <sys/smp.h>
60 #include <sys/smr.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <sys/priv.h>
65 #include <sys/proc.h>
66 #include <sys/refcount.h>
67 #include <sys/jail.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74
75 #include <vm/uma.h>
76 #include <vm/vm.h>
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/if_private.h>
81 #include <net/if_types.h>
82 #include <net/if_llatbl.h>
83 #include <net/route.h>
84 #include <net/rss_config.h>
85 #include <net/vnet.h>
86
87 #if defined(INET) || defined(INET6)
88 #include <netinet/in.h>
89 #include <netinet/in_pcb.h>
90 #include <netinet/in_pcb_var.h>
91 #include <netinet/tcp.h>
92 #ifdef INET
93 #include <netinet/in_var.h>
94 #include <netinet/in_fib.h>
95 #endif
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/in6_pcb.h>
100 #include <netinet6/in6_var.h>
101 #include <netinet6/ip6_var.h>
102 #endif /* INET6 */
103 #include <net/route/nhop.h>
104 #endif
105
106 #include <netipsec/ipsec_support.h>
107
108 #include <security/mac/mac_framework.h>
109
110 #define INPCBLBGROUP_SIZMIN 8
111 #define INPCBLBGROUP_SIZMAX 256
112
113 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */
114 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */
115
116 /*
117 * These configure the range of local port addresses assigned to
118 * "unspecified" outgoing connections/packets/whatever.
119 */
120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
126
127 /*
128 * Reserved ports accessible only to root. There are significant
129 * security considerations that must be accounted for when changing these,
130 * but the security benefits can be great. Please be careful.
131 */
132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
133 VNET_DEFINE(int, ipport_reservedlow);
134
135 /* Enable random ephemeral port allocation by default. */
136 VNET_DEFINE(int, ipport_randomized) = 1;
137
138 #ifdef INET
139 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
140 struct in_addr faddr, u_int fport_arg,
141 struct in_addr laddr, u_int lport_arg,
142 int lookupflags, uint8_t numa_domain, int fib);
143
144 #define RANGECHK(var, min, max) \
145 if ((var) < (min)) { (var) = (min); } \
146 else if ((var) > (max)) { (var) = (max); }
147
148 static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
150 {
151 int error;
152
153 error = sysctl_handle_int(oidp, arg1, arg2, req);
154 if (error == 0) {
155 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
156 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
157 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
158 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
159 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
160 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
161 }
162 return (error);
163 }
164
165 #undef RANGECHK
166
167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
168 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
169 "IP Ports");
170
171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
172 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
173 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
174 "");
175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
177 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
178 "");
179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
181 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
182 "");
183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
185 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
186 "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
189 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
190 "");
191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
193 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
194 "");
195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
196 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
197 &VNET_NAME(ipport_reservedhigh), 0, "");
198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
199 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
201 CTLFLAG_VNET | CTLFLAG_RW,
202 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
203
204 #ifdef RATELIMIT
205 counter_u64_t rate_limit_new;
206 counter_u64_t rate_limit_chg;
207 counter_u64_t rate_limit_active;
208 counter_u64_t rate_limit_alloc_fail;
209 counter_u64_t rate_limit_set_ok;
210
211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
212 "IP Rate Limiting");
213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
214 &rate_limit_active, "Active rate limited connections");
215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
216 &rate_limit_alloc_fail, "Rate limited connection failures");
217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
218 &rate_limit_set_ok, "Rate limited setting succeeded");
219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
220 &rate_limit_new, "Total Rate limit new attempts");
221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
222 &rate_limit_chg, "Total Rate limited change attempts");
223 #endif /* RATELIMIT */
224
225 #endif /* INET */
226
/* Seed mixed into inpcb hash computations; one instance per vnet. */
VNET_DEFINE(uint32_t, in_pcbhashseed);
static void
in_pcbhashseed_init(void)
{

	/* Pick a random per-vnet seed at initialization time. */
	V_in_pcbhashseed = arc4random();
}
/* Runs at SI_SUB_PROTO_DOMAIN/SI_ORDER_FIRST during vnet bring-up. */
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);
236
237 #ifdef INET
238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
239 #define V_connect_inaddr_wild VNET(connect_inaddr_wild)
240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
241 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
242 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
243 #endif
244
245 static void in_pcbremhash(struct inpcb *);
246
247 /*
248 * in_pcb.c: manage the Protocol Control Blocks.
249 *
250 * NOTE: It is assumed that most of these functions will be called with
251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
252 * functions often modify hash chains or addresses in pcbs.
253 */
254
255 static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred * cred,u_char vflag,uint16_t port,const union in_dependaddr * addr,int size,uint8_t numa_domain,int fib)256 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
257 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
258 {
259 struct inpcblbgroup *grp;
260 size_t bytes;
261
262 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
263 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
264 if (grp == NULL)
265 return (NULL);
266 LIST_INIT(&grp->il_pending);
267 grp->il_cred = crhold(cred);
268 grp->il_vflag = vflag;
269 grp->il_lport = port;
270 grp->il_numa_domain = numa_domain;
271 grp->il_fibnum = fib;
272 grp->il_dependladdr = *addr;
273 grp->il_inpsiz = size;
274 return (grp);
275 }
276
277 static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)278 in_pcblbgroup_free_deferred(epoch_context_t ctx)
279 {
280 struct inpcblbgroup *grp;
281
282 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
283 crfree(grp->il_cred);
284 free(grp, M_PCB);
285 }
286
/*
 * Unlink a load balancing group from its hash chain and schedule its
 * memory for reclamation after the current network epoch.  The group
 * must have no pending (pre-listen) PCBs left.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
296
297 static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb * inp)298 in_pcblbgroup_find(struct inpcb *inp)
299 {
300 struct inpcbinfo *pcbinfo;
301 struct inpcblbgroup *grp;
302 struct inpcblbgrouphead *hdr;
303
304 INP_LOCK_ASSERT(inp);
305
306 pcbinfo = inp->inp_pcbinfo;
307 INP_HASH_LOCK_ASSERT(pcbinfo);
308
309 hdr = &pcbinfo->ipi_lbgrouphashbase[
310 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
311 CK_LIST_FOREACH(grp, hdr, il_list) {
312 struct inpcb *inp1;
313
314 for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
315 if (inp == grp->il_inp[i])
316 goto found;
317 }
318 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
319 if (inp == inp1)
320 goto found;
321 }
322 }
323 found:
324 return (grp);
325 }
326
/*
 * Add 'inp' to a load balancing group that has a free slot.  A TCP
 * socket that has not yet called listen() is parked on the group's
 * pending list instead of the lookup array so that incoming connections
 * cannot be dispatched to it before it is ready.
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
	} else {
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.  The release
		 * store orders the slot write above before the count update.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}
354
/*
 * Replace 'old_grp' with a new group of 'size' slots on the same hash
 * chain, migrating all active and pending members.  The new group is
 * linked in before the old one is freed, so epoch-protected lookups
 * always find one of the two.  Returns the new group, or NULL (leaving
 * the old group intact) if allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	    grp->il_inpsiz, old_grp->il_inpcnt));

	/* Copy the member array; pending PCBs move via LIST_SWAP below. */
	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	in_pcblbgroup_free(old_grp);
	return (grp);
}
381
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Finds (or creates) the group on the port's hash chain whose jail,
 * vflag, port, local address, NUMA domain and FIB all match this PCB,
 * growing the group geometrically up to INPCBLBGROUP_SIZMAX members.
 * Returns 0 on success (including the rate-limited "group full" case,
 * in which the PCB is simply not inserted), or ENOBUFS if allocation
 * fails.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* Only a socket bound to a single FIB constrains the group's FIB. */
	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		/* All key fields must match for PCBs to share a group. */
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOBUFS);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* Rate-limited complaint; the PCB is not inserted. */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}
456
/*
 * Remove PCB from load balance group.
 *
 * The PCB is either an active member (lookup array slot) or still on
 * the pending list.  An active member's slot is backfilled with the
 * last member so the array stays dense; the group itself is freed when
 * its last member leaves.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcb *inp1;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1 &&
			    LIST_EMPTY(&grp->il_pending)) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Backfill the hole with the last member. */
				grp->il_inp[i] =
				    grp->il_inp[grp->il_inpcnt - 1];

				/*
				 * Synchronize with in_pcblookup_lbgroup().
				 * The release store keeps the slot rewrite
				 * above visible before the count shrinks.
				 */
				atomic_store_rel_int(&grp->il_inpcnt,
				    grp->il_inpcnt - 1);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1) {
				LIST_REMOVE(inp, inp_lbgroup_list);
				inp->inp_flags &= ~INP_INLBGROUP;
				return;
			}
		}
	}
	/* INP_INLBGROUP (asserted above) guarantees membership somewhere. */
	__assert_unreachable();
}
509
510 int
in_pcblbgroup_numa(struct inpcb * inp,int arg)511 in_pcblbgroup_numa(struct inpcb *inp, int arg)
512 {
513 struct inpcbinfo *pcbinfo;
514 int error;
515 uint8_t numa_domain;
516
517 switch (arg) {
518 case TCP_REUSPORT_LB_NUMA_NODOM:
519 numa_domain = M_NODOM;
520 break;
521 case TCP_REUSPORT_LB_NUMA_CURDOM:
522 numa_domain = PCPU_GET(domain);
523 break;
524 default:
525 if (arg < 0 || arg >= vm_ndomains)
526 return (EINVAL);
527 numa_domain = arg;
528 }
529
530 pcbinfo = inp->inp_pcbinfo;
531 INP_WLOCK_ASSERT(inp);
532 INP_HASH_WLOCK(pcbinfo);
533 if (in_pcblbgroup_find(inp) != NULL) {
534 /* Remove it from the old group. */
535 in_pcbremlbgrouphash(inp);
536 /* Add it to the new group based on numa domain. */
537 in_pcbinslbgrouphash(inp, numa_domain);
538 error = 0;
539 } else {
540 error = ENOENT;
541 }
542 INP_HASH_WUNLOCK(pcbinfo);
543 return (error);
544 }
545
546 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
547 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
548
/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 *
 * Sets up the info and hash locks, the global PCB list, and four hash
 * tables: the exact-match and wildcard connection hashes (which share
 * ipi_hashmask, both being sized by hash_nelements), the local port
 * hash, and the SO_REUSEPORT_LB group hash.  The UMA zones come from
 * the protocol's pcbstorage.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* There are at most IPPORT_MAX + 1 distinct local ports. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}
578
/*
 * Destroy an inpcbinfo.
 *
 * All PCBs must already have been freed (ipi_count == 0).  Tears down
 * the hash tables and locks created by in_pcbinfo_init().
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}
598
/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 *
 * The inpcb zone is created with UMA_ZONE_SMR so lookups can proceed
 * locklessly; the port zone is attached to the same SMR domain so both
 * obey a single grace period.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_smr(pcbstor->ips_portzone,
	    uma_zone_get_smr(pcbstor->ips_zone));
}
616
617 /*
618 * Destroy a pcbstorage - used by unloadable protocols.
619 */
620 void
in_pcbstorage_destroy(void * arg)621 in_pcbstorage_destroy(void *arg)
622 {
623 struct inpcbstorage *pcbstor = arg;
624
625 uma_zdestroy(pcbstor->ips_zone);
626 uma_zdestroy(pcbstor->ips_portzone);
627 }
628
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns ENOBUFS if the zone allocation fails, or an error from the
 * MAC / IPsec per-PCB initialization hooks.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zero everything past the fields preserved across zone reuse. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
	/* This 'else' pairs with the INET6 'if' above when both are built. */
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}
715
716 #ifdef INET
717 int
in_pcbbind(struct inpcb * inp,struct sockaddr_in * sin,int flags,struct ucred * cred)718 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
719 struct ucred *cred)
720 {
721 int anonport, error;
722
723 KASSERT(sin == NULL || sin->sin_family == AF_INET,
724 ("%s: invalid address family for %p", __func__, sin));
725 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
726 ("%s: invalid address length for %p", __func__, sin));
727 INP_WLOCK_ASSERT(inp);
728 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
729
730 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
731 return (EINVAL);
732 anonport = sin == NULL || sin->sin_port == 0;
733 error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
734 &inp->inp_lport, flags, cred);
735 if (error)
736 return (error);
737 if (in_pcbinshash(inp) != 0) {
738 inp->inp_laddr.s_addr = INADDR_ANY;
739 inp->inp_lport = 0;
740 inp->inp_flags &= ~INP_BOUNDFIB;
741 return (EAGAIN);
742 }
743 if (anonport)
744 inp->inp_flags |= INP_ANONPORT;
745 return (0);
746 }
747 #endif
748
749 #if defined(INET) || defined(INET6)
750 /*
751 * Assign a local port like in_pcb_lport(), but also used with connect()
752 * and a foreign address and port. If fsa is non-NULL, choose a local port
753 * that is unused with those, otherwise one that is completely unused.
754 * lsa can be NULL for IPv6.
755 */
756 int
in_pcb_lport_dest(struct inpcb * inp,struct sockaddr * lsa,u_short * lportp,struct sockaddr * fsa,u_short fport,struct ucred * cred,int lookupflags)757 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
758 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
759 {
760 struct inpcbinfo *pcbinfo;
761 struct inpcb *tmpinp;
762 unsigned short *lastport;
763 int count, error;
764 u_short aux, first, last, lport;
765 #ifdef INET
766 struct in_addr laddr, faddr;
767 #endif
768 #ifdef INET6
769 struct in6_addr *laddr6, *faddr6;
770 #endif
771
772 pcbinfo = inp->inp_pcbinfo;
773
774 /*
775 * Because no actual state changes occur here, a global write lock on
776 * the pcbinfo isn't required.
777 */
778 INP_LOCK_ASSERT(inp);
779 INP_HASH_LOCK_ASSERT(pcbinfo);
780
781 if (inp->inp_flags & INP_HIGHPORT) {
782 first = V_ipport_hifirstauto; /* sysctl */
783 last = V_ipport_hilastauto;
784 lastport = &pcbinfo->ipi_lasthi;
785 } else if (inp->inp_flags & INP_LOWPORT) {
786 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
787 if (error)
788 return (error);
789 first = V_ipport_lowfirstauto; /* 1023 */
790 last = V_ipport_lowlastauto; /* 600 */
791 lastport = &pcbinfo->ipi_lastlow;
792 } else {
793 first = V_ipport_firstauto; /* sysctl */
794 last = V_ipport_lastauto;
795 lastport = &pcbinfo->ipi_lastport;
796 }
797
798 /*
799 * Instead of having two loops further down counting up or down
800 * make sure that first is always <= last and go with only one
801 * code path implementing all logic.
802 */
803 if (first > last) {
804 aux = first;
805 first = last;
806 last = aux;
807 }
808
809 #ifdef INET
810 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */
811 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
812 if (lsa != NULL)
813 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
814 if (fsa != NULL)
815 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
816 }
817 #endif
818 #ifdef INET6
819 laddr6 = NULL;
820 if ((inp->inp_vflag & INP_IPV6) != 0) {
821 if (lsa != NULL)
822 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
823 if (fsa != NULL)
824 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
825 }
826 #endif
827
828 tmpinp = NULL;
829 lport = *lportp;
830
831 if (V_ipport_randomized)
832 *lastport = first + (arc4random() % (last - first));
833
834 count = last - first;
835
836 do {
837 if (count-- < 0) /* completely used? */
838 return (EADDRNOTAVAIL);
839 ++*lastport;
840 if (*lastport < first || *lastport > last)
841 *lastport = first;
842 lport = htons(*lastport);
843
844 if (fsa != NULL) {
845 #ifdef INET
846 if (lsa->sa_family == AF_INET) {
847 tmpinp = in_pcblookup_hash_locked(pcbinfo,
848 faddr, fport, laddr, lport, lookupflags,
849 M_NODOM, RT_ALL_FIBS);
850 }
851 #endif
852 #ifdef INET6
853 if (lsa->sa_family == AF_INET6) {
854 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
855 faddr6, fport, laddr6, lport, lookupflags,
856 M_NODOM, RT_ALL_FIBS);
857 }
858 #endif
859 } else {
860 #ifdef INET6
861 if ((inp->inp_vflag & INP_IPV6) != 0) {
862 tmpinp = in6_pcblookup_local(pcbinfo,
863 &inp->in6p_laddr, lport, RT_ALL_FIBS,
864 lookupflags, cred);
865 #ifdef INET
866 if (tmpinp == NULL &&
867 (inp->inp_vflag & INP_IPV4))
868 tmpinp = in_pcblookup_local(pcbinfo,
869 laddr, lport, RT_ALL_FIBS,
870 lookupflags, cred);
871 #endif
872 }
873 #endif
874 #if defined(INET) && defined(INET6)
875 else
876 #endif
877 #ifdef INET
878 tmpinp = in_pcblookup_local(pcbinfo, laddr,
879 lport, RT_ALL_FIBS, lookupflags, cred);
880 #endif
881 }
882 } while (tmpinp != NULL);
883
884 *lportp = lport;
885
886 return (0);
887 }
888
889 /*
890 * Select a local port (number) to use.
891 */
892 int
in_pcb_lport(struct inpcb * inp,struct in_addr * laddrp,u_short * lportp,struct ucred * cred,int lookupflags)893 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
894 struct ucred *cred, int lookupflags)
895 {
896 struct sockaddr_in laddr;
897
898 if (laddrp) {
899 bzero(&laddr, sizeof(laddr));
900 laddr.sin_family = AF_INET;
901 laddr.sin_addr = *laddrp;
902 }
903 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
904 NULL, lportp, NULL, 0, cred, lookupflags));
905 }
906 #endif /* INET || INET6 */
907
908 #ifdef INET
/*
 * Determine whether the inpcb can be bound to the specified address/port tuple.
 *
 * Enforces the SO_REUSEADDR / SO_REUSEPORT / SO_REUSEPORT_LB sharing
 * rules (relaxed for multicast), local-address validity (unless
 * INP_BINDANY), reserved-port privilege, and cross-user port sharing
 * restrictions.  Returns 0 if the bind may proceed, otherwise
 * EADDRNOTAVAIL, EACCES or EADDRINUSE.
 */
static int
in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr,
    const u_short lport, const int fib, int sooptions, int lookupflags,
    struct ucred *cred)
{
	int reuseport, reuseport_lb;

	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	reuseport = (sooptions & SO_REUSEPORT);
	reuseport_lb = (sooptions & SO_REUSEPORT_LB);

	if (IN_MULTICAST(ntohl(laddr.s_addr))) {
		/*
		 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
		 * allow complete duplication of binding if
		 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
		 * and a multicast address is bound on both
		 * new and duplicated sockets.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0)
			reuseport = SO_REUSEADDR | SO_REUSEPORT;
		/*
		 * XXX: How to deal with SO_REUSEPORT_LB here?
		 * Treat same as SO_REUSEPORT for now.
		 */
		if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0)
			reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB;
	} else if (!in_nullhost(laddr)) {
		struct sockaddr_in sin;

		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = laddr;

		/*
		 * Is the address a local IP address?
		 * If INP_BINDANY is set, then the socket may be bound
		 * to any endpoint address, local or not.
		 */
		if ((inp->inp_flags & INP_BINDANY) == 0 &&
		    ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0)
			return (EADDRNOTAVAIL);
	}

	if (lport != 0) {
		struct inpcb *t;

		/* Binding into the reserved window requires privilege. */
		if (ntohs(lport) <= V_ipport_reservedhigh &&
		    ntohs(lport) >= V_ipport_reservedlow &&
		    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
			return (EACCES);

		if (!IN_MULTICAST(ntohl(laddr.s_addr)) &&
		    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
			/*
			 * If a socket owned by a different user is already
			 * bound to this port, fail. In particular, SO_REUSE*
			 * can only be used to share a port among sockets owned
			 * by the same user.
			 *
			 * However, we can share a port with a connected socket
			 * which has a unique 4-tuple.
			 */
			t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport,
			    RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred);
			if (t != NULL &&
			    (inp->inp_socket->so_type != SOCK_STREAM ||
			    in_nullhost(t->inp_faddr)) &&
			    (inp->inp_cred->cr_uid != t->inp_cred->cr_uid))
				return (EADDRINUSE);
		}
		t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib,
		    lookupflags, cred);
		if (t != NULL && ((reuseport | reuseport_lb) &
		    t->inp_socket->so_options) == 0) {
#ifdef INET6
			/*
			 * NOTE(review): two INP_IPV6PROTO sockets that both
			 * have a null v4 laddr skip this EADDRINUSE check —
			 * presumably the v6 path covers them; confirm.
			 */
			if (!in_nullhost(laddr) ||
			    !in_nullhost(t->inp_laddr) ||
			    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
			    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
		}
	}
	return (0);
}
1001
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB.  Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	laddr.s_addr = *laddrp;
	/* Reject an explicit address when a local address is already set. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/*
	 * Wildcard matching in the conflict lookup is only allowed when
	 * none of the SO_REUSE* options are set on this socket.
	 */
	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		/* No address supplied: just validate laddr against the jail. */
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		/* Restrict the conflict lookup to one FIB if requested. */
		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}
1076
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	/* Remember whether the port was auto-allocated for this connect. */
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport,
	    cred);
	if (error)
		return (error);

	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/*
			 * Hash insertion failed: undo the partial connect so
			 * the pcb is left unbound and unconnected.
			 */
			inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
			    INADDR_ANY;
			inp->inp_lport = inp->inp_fport = 0;
			return (EAGAIN);
		}
	} else {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		/* Move to (or insert into) the proper hash chain. */
		if ((inp->inp_flags & INP_INHASHLIST) != 0)
			in_pcbrehash(inp);
		else
			in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1129
/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	/* Build a destination sockaddr for the interface address lookups. */
	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* Try point-to-point destinations first, then shared nets. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, the interface address is good as-is. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: scan the interface for an IPv4 address that
		 * belongs to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Look for a jail-owned address on that interface. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Never hand back the unspecified address as a source. */
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}
1340
/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port. These are not updated in the error case.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* A foreign port of zero can never be connected to. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/* Pre-compute a software flow id for multipath routing. */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	} else if (faddr.s_addr == INADDR_ANY) {
		return (ENETUNREACH);
	}
	if (laddr.s_addr == INADDR_ANY) {
		/* No local address yet: perform source address selection. */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/* Bound port: the 4-tuple must not already be in use. */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM, RT_ALL_FIBS) != NULL)
			return (EADDRINUSE);
	} else {
		/* No local port yet: allocate an ephemeral one. */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}
1471
/*
 * Disconnect a pcb: remove it from the connection hash and clear the
 * foreign address/port so it reverts to an unconnected (bound) state.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	/* Remove from the hash first so lockless lookups stop finding us. */
	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
1489 #endif /* INET */
1490
1491 void
in_pcblisten(struct inpcb * inp)1492 in_pcblisten(struct inpcb *inp)
1493 {
1494 struct inpcblbgroup *grp;
1495
1496 INP_WLOCK_ASSERT(inp);
1497
1498 if ((inp->inp_flags & INP_INLBGROUP) != 0) {
1499 struct inpcbinfo *pcbinfo;
1500
1501 pcbinfo = inp->inp_pcbinfo;
1502 INP_HASH_WLOCK(pcbinfo);
1503 grp = in_pcblbgroup_find(inp);
1504 LIST_REMOVE(inp, inp_lbgroup_list);
1505 in_pcblbgroup_insert(grp, inp);
1506 INP_HASH_WUNLOCK(pcbinfo);
1507 }
1508 }
1509
1510 /*
1511 * inpcb hash lookups are protected by SMR section.
1512 *
1513 * Once desired pcb has been found, switching from SMR section to a pcb
1514 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
1515 * here because SMR is a critical section.
1516 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
1517 */
1518 void
inp_lock(struct inpcb * inp,const inp_lookup_t lock)1519 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
1520 {
1521
1522 lock == INPLOOKUP_RLOCKPCB ?
1523 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
1524 }
1525
1526 void
inp_unlock(struct inpcb * inp,const inp_lookup_t lock)1527 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
1528 {
1529
1530 lock == INPLOOKUP_RLOCKPCB ?
1531 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
1532 }
1533
1534 int
inp_trylock(struct inpcb * inp,const inp_lookup_t lock)1535 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
1536 {
1537
1538 return (lock == INPLOOKUP_RLOCKPCB ?
1539 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
1540 }
1541
/*
 * Transition from the SMR section to holding the pcb lock, skipping pcbs
 * whose flags intersect 'ignflags'.  Returns true with the pcb locked and
 * the SMR section exited, or false with the SMR section exited and no
 * lock held.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock was uncontested. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the pcb with a reference so it survives leaving
	 * the SMR section, then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock certainly has not
		 * gone through uma_zfree().  However, it may already have
		 * gone through in_pcbfree() and hold another reference that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Refcount already zero: the pcb is being freed. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
1580
1581 bool
inp_smr_lock(struct inpcb * inp,const inp_lookup_t lock)1582 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
1583 {
1584
1585 /*
1586 * in_pcblookup() family of functions ignore not only freed entries,
1587 * that may be found due to lockless access to the hash, but dropped
1588 * entries, too.
1589 */
1590 return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
1591 }
1592
1593 /*
1594 * inp_next() - inpcb hash/list traversal iterator
1595 *
1596 * Requires initialized struct inpcb_iterator for context.
1597 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
1598 *
1599 * - Iterator can have either write-lock or read-lock semantics, that can not
1600 * be changed later.
1601 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
1602 * a single hash slot. Note: only rip_input() does the latter.
1603 * - Iterator may have optional bool matching function. The matching function
1604 * will be executed for each inpcb in the SMR context, so it can not acquire
1605 * locks and can safely access only immutable fields of inpcb.
1606 *
1607 * A fresh initialized iterator has NULL inpcb in its context and that
1608 * means that inp_next() call would return the very first inpcb on the list
1609 * locked with desired semantic. In all following calls the context pointer
1610 * shall hold the current inpcb pointer. The KPI user is not supposed to
1611 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
1612 * and write NULL to its context. After end of traversal an iterator can be
1613 * reused.
1614 *
1615 * List traversals have the following features/constraints:
1616 * - New entries won't be seen, as they are always added to the head of a list.
1617 * - Removed entries won't stop traversal as long as they are not added to
1618 * a different list. This is violated by in_pcbrehash().
1619 */
1620 #define II_LIST_FIRST(ipi, hash) \
1621 (((hash) == INP_ALL_LIST) ? \
1622 CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
1623 CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
1624 #define II_LIST_NEXT(inp, hash) \
1625 (((hash) == INP_ALL_LIST) ? \
1626 CK_LIST_NEXT((inp), inp_list) : \
1627 CK_LIST_NEXT((inp), inp_hash_exact))
1628 #define II_LOCK_ASSERT(inp, lock) \
1629 rw_assert(&(inp)->inp_lock, \
1630 (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
/*
 * Return the next pcb on the list/hash described by the iterator, locked
 * per the iterator's lock semantics, or NULL at end of traversal.  See the
 * block comment above for the full contract.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * Locking failed and the SMR section was
				 * exited; re-enter and restart from the list
				 * head, which must have changed meanwhile.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping. Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard. Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous pcb and advance the iterator context. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
1730
/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	/* The caller must already hold a reference; zero means freed. */
	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}
1746
/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.  Returns true if the
 * pcb was freed (and the read lock released), false otherwise.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before the SMR-deferred free of the containing zone item. */
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1770
/*
 * Write-locked variant of in_pcbrele_rlocked(): drop a reference and free
 * the pcb if it was the last one.  Returns true if the pcb was freed (and
 * the write lock released), false otherwise.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be detached and marked. */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	/* Unlock before the SMR-deferred free of the containing zone item. */
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}
1790
1791 bool
in_pcbrele(struct inpcb * inp,const inp_lookup_t lock)1792 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
1793 {
1794
1795 return (lock == INPLOOKUP_RLOCKPCB ?
1796 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
1797 }
1798
/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Remove from the global pcb list under the pcbinfo lock. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark freed and sever the socket linkage in both directions. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Defer dropping multicast options until after the pcb is unlocked. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/* Drop our reference; unlock manually if another reference remains. */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1880
/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock. But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	/* UMA zone fini hook: tear down the per-pcb lock. */
	INP_LOCK_DESTROY(inp);
}
1892
/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Set the flag before unhashing so lookups reject this pcb. */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}
1917
1918 #ifdef INET
1919 /*
1920 * Common routines to return the socket addresses associated with inpcbs.
1921 */
1922 int
in_getsockaddr(struct socket * so,struct sockaddr * sa)1923 in_getsockaddr(struct socket *so, struct sockaddr *sa)
1924 {
1925 struct inpcb *inp;
1926
1927 inp = sotoinpcb(so);
1928 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1929
1930 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1931 .sin_len = sizeof(struct sockaddr_in),
1932 .sin_family = AF_INET,
1933 .sin_port = inp->inp_lport,
1934 .sin_addr = inp->inp_laddr,
1935 };
1936
1937 return (0);
1938 }
1939
1940 int
in_getpeeraddr(struct socket * so,struct sockaddr * sa)1941 in_getpeeraddr(struct socket *so, struct sockaddr *sa)
1942 {
1943 struct inpcb *inp;
1944
1945 inp = sotoinpcb(so);
1946 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1947
1948 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
1949 .sin_len = sizeof(struct sockaddr_in),
1950 .sin_family = AF_INET,
1951 .sin_port = inp->inp_fport,
1952 .sin_addr = inp->inp_faddr,
1953 };
1954
1955 return (0);
1956 }
1957
1958 static bool
inp_v4_multi_match(const struct inpcb * inp,void * v __unused)1959 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
1960 {
1961
1962 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
1963 return (true);
1964 else
1965 return (false);
1966 }
1967
/*
 * Purge references to a detaching interface from the multicast state of
 * every IPv4 pcb in 'pcbinfo': clear it as the outgoing multicast interface
 * and leave any groups joined through it.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/* Removal invalidates the iteration; restart after each one. */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}
2010
/*
 * Lookup a PCB based on the local address and port. Caller must hold the
 * hash lock. No inpcb locks or references are acquired.
 *
 * With INPLOOKUP_WILDCARD a best-fit search is performed over all PCBs
 * sharing 'lport': each candidate is assigned a "wildcard cost" and the
 * cheapest match wins.  Without it, only a PCB bound to exactly
 * (laddr, lport) with an unspecified foreign address matches.  A 'fib'
 * of RT_ALL_FIBS disables FIB filtering.
 */
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	/* A v4-mapped IPv6 PCB always costs more than any IPv4-only PCB. */
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
				if (fib != RT_ALL_FIBS &&
				    inp->inp_inc.inc_fibnum != fib)
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0. If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				/* Each mismatch against the query adds cost. */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				/* Keep the cheapest candidate seen so far. */
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
2137
2138 static bool
in_pcblookup_lb_match(const struct inpcblbgroup * grp,int domain,int fib)2139 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
2140 {
2141 return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
2142 (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
2143 }
2144
/*
 * Look up a SO_REUSEPORT_LB group for (laddr, lport) and pick one member
 * PCB by hashing the packet's source tuple, so that a given flow always
 * maps to the same member while the group membership is stable.  Returns
 * NULL if no group matches.  No PCB lock or reference is acquired.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Skip jailed groups whose jail doesn't own 'laddr'. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Preference order: jailed > non-jailed, exact > wildcard. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}
2228
2229 static bool
in_pcblookup_exact_match(const struct inpcb * inp,struct in_addr faddr,u_short fport,struct in_addr laddr,u_short lport)2230 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
2231 u_short fport, struct in_addr laddr, u_short lport)
2232 {
2233 #ifdef INET6
2234 /* XXX inp locking */
2235 if ((inp->inp_vflag & INP_IPV4) == 0)
2236 return (false);
2237 #endif
2238 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2239 inp->inp_laddr.s_addr == laddr.s_addr &&
2240 inp->inp_fport == fport &&
2241 inp->inp_lport == lport)
2242 return (true);
2243 return (false);
2244 }
2245
/*
 * Find the PCB exactly matching the connected 4-tuple on the "exact"
 * hash, or NULL.  No PCB lock or reference is acquired.  NOTE(review):
 * this is also called from within an SMR section; presumably
 * INP_HASH_LOCK_ASSERT covers that case — confirm against the macro
 * definition.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}
2263
/*
 * Outcome of matching a PCB against an unconnected (wildcard) lookup,
 * in increasing order of specificity.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* cannot receive the packet */
	INPLOOKUP_MATCH_WILD = 1,	/* bound to INADDR_ANY */
	INPLOOKUP_MATCH_LADDR = 2,	/* bound to exactly 'laddr' */
} inp_lookup_match_t;

/*
 * Classify how well 'inp' matches an unconnected lookup for
 * (laddr, lport) in 'fib'.  A 'fib' of RT_ALL_FIBS disables FIB
 * filtering.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	/* Connected PCBs and other ports never match a wildcard lookup. */
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}
2289
/*
 * Sentinel: a candidate PCB vanished or changed before it could be
 * locked and revalidated; the caller must retry under the hash lock.
 */
#define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1)

/*
 * Unconnected-lookup fast path, run inside an SMR read section.  Only
 * the first matching candidate on the chain is tried — insertion
 * ordering (see _in_pcbinshash_wild()) puts the highest-ranking PCB
 * first.  If that candidate cannot be locked and revalidated (including
 * the jail check), INP_LOOKUP_AGAIN is returned so the caller falls
 * back to a serialized lookup.  On success the PCB is returned locked
 * per 'lockflags'.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us. Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}
2328
/*
 * Unconnected lookup with the hash lock held: scan the entire wildcard
 * chain and return the best candidate per the preference order below.
 * No PCB lock or reference is acquired.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/* A non-jailed candidate can't beat local_exact. */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				/* Rank 1: jailed, non-wild — best possible. */
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
			if (injail)
				jail_wild = inp;
			else
				local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	/* v4-mapped IPv6 wildcard PCBs rank last. */
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}
2401
/*
 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 *
 * Exact (connected) matches take precedence; with INPLOOKUP_WILDCARD,
 * load balancing groups are consulted before the plain wildcard hash.
 * No PCB lock or reference is acquired.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}
2439
/*
 * Serialized lookup: perform the search under the hash write lock, then
 * acquire the PCB lock requested in 'lookupflags'.  If the PCB lock
 * cannot be taken without blocking, a reference is held across dropping
 * the hash lock to keep the PCB alive while sleeping for its lock.
 * Returns the PCB locked, or NULL.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		/* The PCB may have been freed while we slept on its lock. */
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}
2466
/*
 * Lock-free lookup fast path, run inside an SMR read section: try an
 * exact (connected) match, then LB groups, then the wildcard hash.
 * Every candidate found lock-free is locked via inp_smr_lock() and
 * revalidated; any failure falls back to the serialized
 * in_pcblookup_hash().  On success the PCB is returned locked per
 * 'lookupflags'.  NOTE(review): the SMR section is explicitly exited
 * here only on the NULL path; presumably inp_smr_lock() exits it on
 * success — confirm against its definition.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				/* Revalidate the LB group member. */
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}
2529
2530 /*
2531 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2532 * from which a pre-calculated hash value may be extracted.
2533 */
2534 struct inpcb *
in_pcblookup(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp)2535 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2536 struct in_addr laddr, u_int lport, int lookupflags,
2537 struct ifnet *ifp)
2538 {
2539 int fib;
2540
2541 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
2542 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2543 lookupflags, M_NODOM, fib));
2544 }
2545
2546 struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo * pcbinfo,struct in_addr faddr,u_int fport,struct in_addr laddr,u_int lport,int lookupflags,struct ifnet * ifp __unused,struct mbuf * m)2547 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2548 u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2549 struct ifnet *ifp __unused, struct mbuf *m)
2550 {
2551 int fib;
2552
2553 M_ASSERTPKTHDR(m);
2554 fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
2555 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
2556 lookupflags, m->m_pkthdr.numa_domain, fib));
2557 }
2558 #endif /* INET */
2559
2560 static bool
in_pcbjailed(const struct inpcb * inp,unsigned int flag)2561 in_pcbjailed(const struct inpcb *inp, unsigned int flag)
2562 {
2563 return (prison_flag(inp->inp_cred, flag) != 0);
2564 }
2565
/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/* Unbound v4-mapped v6 PCBs go to the very end of the chain. */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* Chain is empty. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the leading run of jailed PCBs. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* Jailed PCB and no jailed PCBs yet: head of the chain. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Within our jail class, wildcards go after exact binds. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
2626
#ifdef INET6
/*
 * See the comment above _in_pcbinshash_wild().
 *
 * IPv6 variant: same jailed-first, exact-before-wildcard ordering; there
 * is no trailing v4-mapped class here.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the leading run of jailed PCBs. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* Jailed PCB and no jailed PCBs yet: head of the chain. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Within our jail class, wildcards go after exact binds. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif
2673
/*
 * Insert PCB onto various hash lists.
 *
 * The PCB is added to the port hash (allocating a port head if needed),
 * optionally to its SO_REUSEPORT_LB group, and finally to either the
 * connected ("exact") or unconnected ("wild") 4-tuple hash, depending on
 * whether its foreign address is set.  Returns 0 or ENOMEM.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/* Undo the LB group insertion done above. */
			if ((inp->inp_flags & INP_INLBGROUP) != 0)
				in_pcbremlbgrouphash(inp);
			return (ENOMEM);
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
		/* Wild chains are kept in lookup-preference order. */
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}
2772
/*
 * Remove the PCB from all hash lists (LB group, 4-tuple hash, port
 * hash), freeing the port head if it becomes empty.  Caller holds both
 * the PCB write lock and the hash write lock.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{
	struct inpcbport *phd = inp->inp_phd;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
	/* The PCB lives on the wild or exact list per its foreign addr. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
		CK_LIST_REMOVE(phd, phd_hash);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
	}
	inp->inp_flags &= ~INP_INHASHLIST;
}
2805
2806 static void
in_pcbremhash(struct inpcb * inp)2807 in_pcbremhash(struct inpcb *inp)
2808 {
2809 INP_HASH_WLOCK(inp->inp_pcbinfo);
2810 in_pcbremhash_locked(inp);
2811 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
2812 }
2813
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

	/* 'connected' reflects the NEW (already updated) foreign address. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	/* Remove from the list matching the OLD state (the opposite one). */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}
2865
2866 /*
2867 * Check for alternatives when higher level complains
2868 * about service problems. For now, invalidate cached
2869 * routing information. If the route was created dynamically
2870 * (by a redirect), time to try a default gateway again.
2871 */
2872 void
in_losing(struct inpcb * inp)2873 in_losing(struct inpcb *inp)
2874 {
2875
2876 RO_INVALIDATE_CACHE(&inp->inp_route);
2877 return;
2878 }
2879
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 *
 * No-op unless the kernel is built with MAC support.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Hold both locks so socket and PCB labels change atomically. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2900
/*
 * Exported function wrappers around the inpcb locking macros, for
 * consumers that cannot use the macros directly.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}

void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}

void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}

void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2928
#ifdef INVARIANT_SUPPORT
/*
 * Exported wrappers around the inpcb lock assertion macros; compiled
 * only with INVARIANT_SUPPORT.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif
2944
/*
 * Invoke 'func(inp, arg)' on every PCB in 'pcbinfo'.  Each PCB is
 * write-locked by the iterator for the duration of the callback.
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}
2956
/*
 * Return the socket backing this PCB.  Caller must hold the PCB write
 * lock.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}
2964
/*
 * Copy out the PCB's 4-tuple.  Addresses and ports are returned in
 * their stored (network byte order) form.  Caller must hold a PCB lock.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
2976
/*
 * Create an external-format (``xinpcb'') structure using the information in
 * the kernel-format in_pcb structure pointed to by inp. This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	/* Zero first so fields we don't copy are not stack garbage. */
	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
3008
/*
 * Sysctl handler helper: apply a socket option, described by a struct
 * sockopt_parameters in the request's new data, to the single PCB in
 * 'pcbinfo' whose generation count matches sop_id.  Socket-level
 * options go through sosetopt(); protocol-level options go through the
 * supplied 'ctloutput_set' callback.  Returns 0, an errno from option
 * processing, ESRCH if no PCB matched, or ECONNRESET if it was dropped.
 */
int
sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
    int (*ctloutput_set)(struct inpcb *, struct sockopt *))
{
	struct sockopt sopt;
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;
	struct sockopt_parameters *params;
	struct socket *so;
	int error;
	char buf[1024];

	/* This node is write-only: reject any read attempt. */
	if (req->oldptr != NULL || req->oldlen != 0)
		return (EINVAL);
	if (req->newptr == NULL)
		return (EPERM);
	if (req->newlen > sizeof(buf))
		return (ENOMEM);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		return (error);
	if (req->newlen < sizeof(struct sockopt_parameters))
		return (EINVAL);
	params = (struct sockopt_parameters *)buf;
	sopt.sopt_level = params->sop_level;
	sopt.sopt_name = params->sop_optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = params->sop_optval;
	/* Option payload follows the parameters header in the buffer. */
	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
	sopt.sopt_td = NULL;
#ifdef INET6
	/* Embed the zone ID into link-local scoped addresses (KAME form). */
	if (params->sop_inc.inc_flags & INC_ISIPV6) {
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
			params->sop_inc.inc6_laddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
			params->sop_inc.inc6_faddr.s6_addr16[1] =
			    htons(params->sop_inc.inc6_zoneid & 0xffff);
	}
#endif
	/*
	 * If a full 4-tuple was supplied, restrict the iterator to the
	 * single hash bucket it maps to.
	 */
	if (params->sop_inc.inc_lport != htons(0) &&
	    params->sop_inc.inc_fport != htons(0)) {
#ifdef INET6
		if (params->sop_inc.inc_flags & INC_ISIPV6)
			inpi.hash = INP6_PCBHASH(
			    &params->sop_inc.inc6_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
		else
#endif
			inpi.hash = INP_PCBHASH(
			    &params->sop_inc.inc_faddr,
			    params->sop_inc.inc_lport,
			    params->sop_inc.inc_fport,
			    pcbinfo->ipi_hashmask);
	}
	while ((inp = inp_next(&inpi)) != NULL)
		if (inp->inp_gencnt == params->sop_id) {
			if (inp->inp_flags & INP_DROPPED) {
				INP_WUNLOCK(inp);
				return (ECONNRESET);
			}
			so = inp->inp_socket;
			KASSERT(so != NULL, ("inp_socket == NULL"));
			/* Keep the socket alive across the option call. */
			soref(so);
			if (params->sop_level == SOL_SOCKET) {
				INP_WUNLOCK(inp);
				error = sosetopt(so, &sopt);
			} else
				error = (*ctloutput_set)(inp, &sopt);
			sorele(so);
			break;
		}
	if (inp == NULL)
		error = ESRCH;
	return (error);
}
3088
3089 #ifdef DDB
/*
 * Emit "indent" spaces to position the next DDB output line.
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
3098
/*
 * Dump a struct in_conninfo to the DDB console: local/foreign address
 * and port, formatted per address family.
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	/* 48 bytes covers the longest textual IPv6 address form. */
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	/* Ports are stored in network byte order; convert for display. */
	db_print_indent(indent);
	db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3128
3129 static void
db_print_inpflags(int inp_flags)3130 db_print_inpflags(int inp_flags)
3131 {
3132 int comma;
3133
3134 comma = 0;
3135 if (inp_flags & INP_RECVOPTS) {
3136 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3137 comma = 1;
3138 }
3139 if (inp_flags & INP_RECVRETOPTS) {
3140 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3141 comma = 1;
3142 }
3143 if (inp_flags & INP_RECVDSTADDR) {
3144 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3145 comma = 1;
3146 }
3147 if (inp_flags & INP_ORIGDSTADDR) {
3148 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3149 comma = 1;
3150 }
3151 if (inp_flags & INP_HDRINCL) {
3152 db_printf("%sINP_HDRINCL", comma ? ", " : "");
3153 comma = 1;
3154 }
3155 if (inp_flags & INP_HIGHPORT) {
3156 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3157 comma = 1;
3158 }
3159 if (inp_flags & INP_LOWPORT) {
3160 db_printf("%sINP_LOWPORT", comma ? ", " : "");
3161 comma = 1;
3162 }
3163 if (inp_flags & INP_ANONPORT) {
3164 db_printf("%sINP_ANONPORT", comma ? ", " : "");
3165 comma = 1;
3166 }
3167 if (inp_flags & INP_RECVIF) {
3168 db_printf("%sINP_RECVIF", comma ? ", " : "");
3169 comma = 1;
3170 }
3171 if (inp_flags & INP_MTUDISC) {
3172 db_printf("%sINP_MTUDISC", comma ? ", " : "");
3173 comma = 1;
3174 }
3175 if (inp_flags & INP_RECVTTL) {
3176 db_printf("%sINP_RECVTTL", comma ? ", " : "");
3177 comma = 1;
3178 }
3179 if (inp_flags & INP_DONTFRAG) {
3180 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3181 comma = 1;
3182 }
3183 if (inp_flags & INP_RECVTOS) {
3184 db_printf("%sINP_RECVTOS", comma ? ", " : "");
3185 comma = 1;
3186 }
3187 if (inp_flags & IN6P_IPV6_V6ONLY) {
3188 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3189 comma = 1;
3190 }
3191 if (inp_flags & IN6P_PKTINFO) {
3192 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3193 comma = 1;
3194 }
3195 if (inp_flags & IN6P_HOPLIMIT) {
3196 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3197 comma = 1;
3198 }
3199 if (inp_flags & IN6P_HOPOPTS) {
3200 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3201 comma = 1;
3202 }
3203 if (inp_flags & IN6P_DSTOPTS) {
3204 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3205 comma = 1;
3206 }
3207 if (inp_flags & IN6P_RTHDR) {
3208 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3209 comma = 1;
3210 }
3211 if (inp_flags & IN6P_RTHDRDSTOPTS) {
3212 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3213 comma = 1;
3214 }
3215 if (inp_flags & IN6P_TCLASS) {
3216 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3217 comma = 1;
3218 }
3219 if (inp_flags & IN6P_AUTOFLOWLABEL) {
3220 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3221 comma = 1;
3222 }
3223 if (inp_flags & INP_ONESBCAST) {
3224 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3225 comma = 1;
3226 }
3227 if (inp_flags & INP_DROPPED) {
3228 db_printf("%sINP_DROPPED", comma ? ", " : "");
3229 comma = 1;
3230 }
3231 if (inp_flags & INP_SOCKREF) {
3232 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3233 comma = 1;
3234 }
3235 if (inp_flags & IN6P_RFC2292) {
3236 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3237 comma = 1;
3238 }
3239 if (inp_flags & IN6P_MTU) {
3240 db_printf("IN6P_MTU%s", comma ? ", " : "");
3241 comma = 1;
3242 }
3243 }
3244
3245 static void
db_print_inpvflag(u_char inp_vflag)3246 db_print_inpvflag(u_char inp_vflag)
3247 {
3248 int comma;
3249
3250 comma = 0;
3251 if (inp_vflag & INP_IPV4) {
3252 db_printf("%sINP_IPV4", comma ? ", " : "");
3253 comma = 1;
3254 }
3255 if (inp_vflag & INP_IPV6) {
3256 db_printf("%sINP_IPV6", comma ? ", " : "");
3257 comma = 1;
3258 }
3259 if (inp_vflag & INP_IPV6PROTO) {
3260 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3261 comma = 1;
3262 }
3263 }
3264
/*
 * Dump the interesting fields of an inpcb to the DDB console at the
 * given indentation level.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x\n", inp->inp_flow);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_label: %p inp_flags: 0x%x (",
	   inp->inp_label, inp->inp_flags);
	db_print_inpflags(inp->inp_flags);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
	    inp->inp_vflag);
	db_print_inpvflag(inp->inp_vflag);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	db_print_indent(indent);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_printf("in6p_options: %p in6p_outputopts: %p "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		/*
		 * NOTE(review): this second line is printed without a
		 * db_print_indent() call, so it starts at column 0 —
		 * confirm whether that is intentional.
		 */
		db_printf("in6p_icmp6filt: %p in6p_cksum %d "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_printf("inp_ip_tos: %d inp_ip_options: %p "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
	    (uintmax_t)inp->inp_gencnt);
}
3316
/*
 * DDB "show inpcb <addr>" command: dump the inpcb at the given kernel
 * address.  "have_addr" and "addr" are supplied by DB_SHOW_COMMAND.
 */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
3329 #endif /* DDB */
3330
3331 #ifdef RATELIMIT
3332 /*
3333 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3334 * if any.
3335 */
3336 int
in_pcbmodify_txrtlmt(struct inpcb * inp,uint32_t max_pacing_rate)3337 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3338 {
3339 union if_snd_tag_modify_params params = {
3340 .rate_limit.max_rate = max_pacing_rate,
3341 .rate_limit.flags = M_NOWAIT,
3342 };
3343 struct m_snd_tag *mst;
3344 int error;
3345
3346 mst = inp->inp_snd_tag;
3347 if (mst == NULL)
3348 return (EINVAL);
3349
3350 if (mst->sw->snd_tag_modify == NULL) {
3351 error = EOPNOTSUPP;
3352 } else {
3353 error = mst->sw->snd_tag_modify(mst, ¶ms);
3354 }
3355 return (error);
3356 }
3357
3358 /*
3359 * Query existing TX rate limit based on the existing
3360 * "inp->inp_snd_tag", if any.
3361 */
3362 int
in_pcbquery_txrtlmt(struct inpcb * inp,uint32_t * p_max_pacing_rate)3363 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3364 {
3365 union if_snd_tag_query_params params = { };
3366 struct m_snd_tag *mst;
3367 int error;
3368
3369 mst = inp->inp_snd_tag;
3370 if (mst == NULL)
3371 return (EINVAL);
3372
3373 if (mst->sw->snd_tag_query == NULL) {
3374 error = EOPNOTSUPP;
3375 } else {
3376 error = mst->sw->snd_tag_query(mst, ¶ms);
3377 if (error == 0 && p_max_pacing_rate != NULL)
3378 *p_max_pacing_rate = params.rate_limit.max_rate;
3379 }
3380 return (error);
3381 }
3382
3383 /*
3384 * Query existing TX queue level based on the existing
3385 * "inp->inp_snd_tag", if any.
3386 */
3387 int
in_pcbquery_txrlevel(struct inpcb * inp,uint32_t * p_txqueue_level)3388 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
3389 {
3390 union if_snd_tag_query_params params = { };
3391 struct m_snd_tag *mst;
3392 int error;
3393
3394 mst = inp->inp_snd_tag;
3395 if (mst == NULL)
3396 return (EINVAL);
3397
3398 if (mst->sw->snd_tag_query == NULL)
3399 return (EOPNOTSUPP);
3400
3401 error = mst->sw->snd_tag_query(mst, ¶ms);
3402 if (error == 0 && p_txqueue_level != NULL)
3403 *p_txqueue_level = params.rate_limit.queue_level;
3404 return (error);
3405 }
3406
3407 /*
3408 * Allocate a new TX rate limit send tag from the network interface
3409 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3410 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate,
    struct m_snd_tag **st)
{
	union if_snd_tag_alloc_params params = {
		/* A rate of -1U requests an unlimited tag. */
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		/* Allocation must not sleep: we hold the inp lock. */
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	/* Keep the rate-limit statistics counters up to date. */
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}
3447
/*
 * Release a detached TX rate limit send tag and account for it no
 * longer being active.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	/* One fewer connection holds an active rate-limit tag. */
	counter_u64_add(rate_limit_active, -1);
#endif
}
3457
3458 /*
3459 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3460 * if any:
3461 */
3462 void
in_pcbdetach_txrtlmt(struct inpcb * inp)3463 in_pcbdetach_txrtlmt(struct inpcb *inp)
3464 {
3465 struct m_snd_tag *mst;
3466
3467 INP_WLOCK_ASSERT(inp);
3468
3469 mst = inp->inp_snd_tag;
3470 inp->inp_snd_tag = NULL;
3471
3472 if (mst == NULL)
3473 return;
3474
3475 m_snd_tag_rele(mst);
3476 #ifdef INET
3477 counter_u64_add(rate_limit_active, -1);
3478 #endif
3479 }
3480
/*
 * Bring the pcb's TX rate limit send tag in line with
 * "max_pacing_rate" on interface "ifp": allocate, modify, or detach
 * the tag as needed.  Caller holds the inpcb write lock.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp,
    struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag. Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No rate requested and no tag held: nothing to do. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* Interface cannot rate-limit; drop any stale tag. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		/* A tag already exists; just adjust its rate. */
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	/* EOPNOTSUPP is final: stop retrying on every transmit. */
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}
3530
3531 /*
3532 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3533 * is set in the fast path and will attach/detach/modify the TX rate
3534 * limit send tag based on the socket's so_max_pacing_rate value.
3535 */
3536 void
in_pcboutput_txrtlmt(struct inpcb * inp,struct ifnet * ifp,struct mbuf * mb)3537 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3538 {
3539 struct socket *socket;
3540 uint32_t max_pacing_rate;
3541 bool did_upgrade;
3542
3543 if (inp == NULL)
3544 return;
3545
3546 socket = inp->inp_socket;
3547 if (socket == NULL)
3548 return;
3549
3550 if (!INP_WLOCKED(inp)) {
3551 /*
3552 * NOTE: If the write locking fails, we need to bail
3553 * out and use the non-ratelimited ring for the
3554 * transmit until there is a new chance to get the
3555 * write lock.
3556 */
3557 if (!INP_TRY_UPGRADE(inp))
3558 return;
3559 did_upgrade = 1;
3560 } else {
3561 did_upgrade = 0;
3562 }
3563
3564 /*
3565 * NOTE: The so_max_pacing_rate value is read unlocked,
3566 * because atomic updates are not required since the variable
3567 * is checked at every mbuf we send. It is assumed that the
3568 * variable read itself will be atomic.
3569 */
3570 max_pacing_rate = socket->so_max_pacing_rate;
3571
3572 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
3573
3574 if (did_upgrade)
3575 INP_DOWNGRADE(inp);
3576 }
3577
3578 /*
3579 * Track route changes for TX rate limiting.
3580 */
3581 void
in_pcboutput_eagain(struct inpcb * inp)3582 in_pcboutput_eagain(struct inpcb *inp)
3583 {
3584 bool did_upgrade;
3585
3586 if (inp == NULL)
3587 return;
3588
3589 if (inp->inp_snd_tag == NULL)
3590 return;
3591
3592 if (!INP_WLOCKED(inp)) {
3593 /*
3594 * NOTE: If the write locking fails, we need to bail
3595 * out and use the non-ratelimited ring for the
3596 * transmit until there is a new chance to get the
3597 * write lock.
3598 */
3599 if (!INP_TRY_UPGRADE(inp))
3600 return;
3601 did_upgrade = 1;
3602 } else {
3603 did_upgrade = 0;
3604 }
3605
3606 /* detach rate limiting */
3607 in_pcbdetach_txrtlmt(inp);
3608
3609 /* make sure new mbuf send tag allocation is made */
3610 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3611
3612 if (did_upgrade)
3613 INP_DOWNGRADE(inp);
3614 }
3615
3616 #ifdef INET
/*
 * Allocate the rate-limit statistics counters at boot.  The "st"
 * argument is the unused SYSINIT cookie (NULL below).
 */
static void
rl_init(void *st)
{
	/* M_WAITOK: allocation at SYSINIT time may sleep and cannot fail. */
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}

SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
3628 #endif
3629 #endif /* RATELIMIT */
3630