1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24 * Copyright 2022 Joyent, Inc.
25 * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org>
26 */
27
28 /*
29 * IP PACKET CLASSIFIER
30 *
31 * The IP packet classifier provides mapping between IP packets and persistent
32 * connection state for connection-oriented protocols. It also provides
33 * interface for managing connection states.
34 *
35 * The connection state is kept in conn_t data structure and contains, among
36 * other things:
37 *
38 * o local/remote address and ports
39 * o Transport protocol
40 * o squeue for the connection (for TCP only)
41 * o reference counter
42 * o Connection state
43 * o hash table linkage
44 * o interface/ire information
45 * o credentials
46 * o ipsec policy
47 * o send and receive functions.
48 * o mutex lock.
49 *
50 * Connections use a reference counting scheme. They are freed when the
51 * reference counter drops to zero. A reference is incremented when connection
52 * is placed in a list or table, when incoming packet for the connection arrives
53 * and when connection is processed via squeue (squeue processing may be
54 * asynchronous and the reference protects the connection from being destroyed
55 * before its processing is finished).
56 *
57 * conn_recv is used to pass up packets to the ULP.
58 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
59 * a listener, and changes to tcp_input_listener as the listener has picked a
60 * good squeue. For other cases it is set to tcp_input_data.
61 *
62 * conn_recvicmp is used to pass up ICMP errors to the ULP.
63 *
64 * Classifier uses several hash tables:
65 *
66 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
67 * ipcl_bind_fanout: contains all connections in BOUND state
68 * ipcl_proto_fanout: IPv4 protocol fanout
69 * ipcl_proto_fanout_v6: IPv6 protocol fanout
70 * ipcl_udp_fanout: contains all UDP connections
71 * ipcl_iptun_fanout: contains all IP tunnel connections
72 * ipcl_globalhash_fanout: contains all connections
73 *
74 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
75 * which need to view all existing connections.
76 *
77 * All tables are protected by per-bucket locks. When both per-bucket lock and
78 * connection lock need to be held, the per-bucket lock should be acquired
79 * first, followed by the connection lock.
80 *
81 * All functions doing search in one of these tables increment a reference
82 * counter on the connection found (if any). This reference should be dropped
83 * when the caller has finished processing the connection.
84 *
85 *
86 * INTERFACES:
87 * ===========
88 *
89 * Connection Lookup:
90 * ------------------
91 *
92 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
93 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
94 *
95 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
96 * it can't find any associated connection. If the connection is found, its
97 * reference counter is incremented.
98 *
99 * mp: mblock, containing packet header. The full header should fit
100 * into a single mblock. It should also contain at least full IP
101 * and TCP or UDP header.
102 *
103 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
104 *
105 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
106 * the packet.
107 *
108 * ira->ira_zoneid: The zone in which the returned connection must be; the
109 * zoneid corresponding to the ire_zoneid on the IRE located for
110 * the packet's destination address.
111 *
112 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
113 * IRAF_TX_SHARED_ADDR flags
114 *
115 * For TCP connections, the lookup order is as follows:
116 * 5-tuple {src, dst, protocol, local port, remote port}
117 * lookup in ipcl_conn_fanout table.
118 * 3-tuple {dst, remote port, protocol} lookup in
119 * ipcl_bind_fanout table.
120 *
121 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *		remote port} lookup is done on ipcl_udp_fanout. Note that
 *		these interfaces do not handle cases where a packet belongs
 *		to multiple UDP clients, which is handled in IP itself.
125 *
126 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
127 * determine which actual zone gets the segment. This is used only in a
128 * labeled environment. The matching rules are:
129 *
130 * - If it's not a multilevel port, then the label on the packet selects
131 * the zone. Unlabeled packets are delivered to the global zone.
132 *
133 * - If it's a multilevel port, then only the zone registered to receive
134 * packets on that port matches.
135 *
136 * Also, in a labeled environment, packet labels need to be checked. For fully
137 * bound TCP connections, we can assume that the packet label was checked
138 * during connection establishment, and doesn't need to be checked on each
139 * packet. For others, though, we need to check for strict equality or, for
140 * multilevel ports, membership in the range or set. This part currently does
141 * a tnrh lookup on each packet, but could be optimized to use cached results
142 * if that were necessary. (SCTP doesn't come through here, but if it did,
143 * we would apply the same rules as TCP.)
144 *
145 * An implication of the above is that fully-bound TCP sockets must always use
146 * distinct 4-tuples; they can't be discriminated by label alone.
147 *
148 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
149 * as there's no connection set-up handshake and no shared state.
150 *
151 * Labels on looped-back packets within a single zone do not need to be
152 * checked, as all processes in the same zone have the same label.
153 *
154 * Finally, for unlabeled packets received by a labeled system, special rules
155 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
156 * socket in the zone whose label matches the default label of the sender, if
157 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
158 * receiver's label must dominate the sender's default label.
159 *
160 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
161 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
162 * ip_stack);
163 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
167 *
168 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
169 * zoneid, ip_stack);
170 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
171 * zoneid, ip_stack);
172 *
173 * Lookup routine to find a listener with the tuple {lport, laddr,
174 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
175 * parameter interface index is also compared.
176 *
177 * void ipcl_walk(func, arg, ip_stack)
178 *
179 * Apply 'func' to every connection available. The 'func' is called as
180 * (*func)(connp, arg). The walk is non-atomic so connections may be
181 * created and destroyed during the walk. The CONN_CONDEMNED and
182 * CONN_INCIPIENT flags ensure that connections which are newly created
183 * or being destroyed are not selected by the walker.
184 *
185 * Table Updates
186 * -------------
187 *
188 * int ipcl_conn_insert(connp);
189 * int ipcl_conn_insert_v4(connp);
190 * int ipcl_conn_insert_v6(connp);
191 *
192 * Insert 'connp' in the ipcl_conn_fanout.
193 * Arguments :
194 * connp conn_t to be inserted
195 *
196 * Return value :
197 * 0 if connp was inserted
198 * EADDRINUSE if the connection with the same tuple
199 * already exists.
200 *
201 * int ipcl_bind_insert(connp);
202 * int ipcl_bind_insert_v4(connp);
203 * int ipcl_bind_insert_v6(connp);
204 *
205 * Insert 'connp' in ipcl_bind_fanout.
206 * Arguments :
207 * connp conn_t to be inserted
208 *
209 *
210 * void ipcl_hash_remove(connp);
211 *
212 * Removes the 'connp' from the connection fanout table.
213 *
214 * Connection Creation/Destruction
215 * -------------------------------
216 *
217 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
218 *
219 * Creates a new conn based on the type flag, inserts it into
220 * globalhash table.
221 *
222 * type: This flag determines the type of conn_t which needs to be
223 * created i.e., which kmem_cache it comes from.
224 * IPCL_TCPCONN indicates a TCP connection
225 * IPCL_SCTPCONN indicates a SCTP connection
226 * IPCL_UDPCONN indicates a UDP conn_t.
227 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
228 * IPCL_RTSCONN indicates a RTS conn_t.
229 * IPCL_IPCCONN indicates all other connections.
230 *
231 * void ipcl_conn_destroy(connp)
232 *
233 * Destroys the connection state, removes it from the global
234 * connection hash table and frees its memory.
235 */
236
237 #include <sys/types.h>
238 #include <sys/stream.h>
239 #include <sys/stropts.h>
240 #include <sys/sysmacros.h>
241 #include <sys/strsubr.h>
242 #include <sys/strsun.h>
243 #define _SUN_TPI_VERSION 2
244 #include <sys/ddi.h>
245 #include <sys/cmn_err.h>
246 #include <sys/debug.h>
247
248 #include <sys/systm.h>
249 #include <sys/param.h>
250 #include <sys/kmem.h>
251 #include <sys/isa_defs.h>
252 #include <inet/common.h>
253 #include <netinet/ip6.h>
254 #include <netinet/icmp6.h>
255
256 #include <inet/ip.h>
257 #include <inet/ip_if.h>
258 #include <inet/ip_ire.h>
259 #include <inet/ip6.h>
260 #include <inet/ip_ndp.h>
261 #include <inet/ip_impl.h>
262 #include <inet/udp_impl.h>
263 #include <inet/sctp_ip.h>
264 #include <inet/sctp/sctp_impl.h>
265 #include <inet/rawip_impl.h>
266 #include <inet/rts_impl.h>
267 #include <inet/iptun/iptun_impl.h>
268
269 #include <sys/cpuvar.h>
270
271 #include <inet/ipclassifier.h>
272 #include <inet/tcp.h>
273 #include <inet/ipsec_impl.h>
274
275 #include <sys/tsol/tnet.h>
276 #include <sys/sockio.h>
277
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Bytes of free memory per conn fanout bucket when sizing automatically */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound on the automatically computed conn fanout size */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size. Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;
299
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 */

/*
 * Candidate conn fanout sizes used by ipcl_init().  Leading zeros mark
 * values of N too small to be useful; the trailing zero is an
 * end-of-table sentinel.
 */
#define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
	6143, 12281, 24571, 49139, 98299, 196597, 393209, \
	786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
	50331599, 100663291, 201326557, 0}
309
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
typedef union itc_s {
	conn_t itc_conn;
	/* pads the union out to a cache-line multiple */
	char itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
318
/* kmem caches for each conn_t flavor; created in ipcl_g_init() */
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
extern struct kmem_cache *sctp_conn_cache;
struct kmem_cache *udp_conn_cache;
struct kmem_cache *rawip_conn_cache;
struct kmem_cache *rts_conn_cache;

extern void tcp_timermp_free(tcp_t *);
extern mblk_t *tcp_timermp_alloc(int);

/* kmem cache constructor/destructor pairs registered in ipcl_g_init() */
static int ip_conn_constructor(void *, void *, int);
static void ip_conn_destructor(void *, void *);

static int tcp_conn_constructor(void *, void *, int);
static void tcp_conn_destructor(void *, void *);

static int udp_conn_constructor(void *, void *, int);
static void udp_conn_destructor(void *, void *);

static int rawip_conn_constructor(void *, void *, int);
static void rawip_conn_destructor(void *, void *);

static int rts_conn_constructor(void *, void *, int);
static void rts_conn_destructor(void *, void *);
343
/*
 * Global (for all stack instances) init routine
 */
void
ipcl_g_init(void)
{
	/* Generic conn_t allocations (no transport-specific state). */
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	/*
	 * The transport caches embed the per-protocol state (tcp_t etc.)
	 * after the cache-aligned conn_t (see itc_t above).  TCP is the
	 * only cache registering a reclaim callback (tcp_conn_reclaim,
	 * defined in tcp) for use when memory is short.
	 */
	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}
375
376 /*
 * ipclassifier initialization routine, sets up hash tables.
378 */
379 void
ipcl_init(ip_stack_t * ipst)380 ipcl_init(ip_stack_t *ipst)
381 {
382 int i;
383 int sizes[] = P2Ps();
384
385 /*
386 * Calculate size of conn fanout table from /etc/system settings
387 */
388 if (ipcl_conn_hash_size != 0) {
389 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
390 } else if (tcp_conn_hash_size != 0) {
391 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
392 } else {
393 extern pgcnt_t freemem;
394
395 ipst->ips_ipcl_conn_fanout_size =
396 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
397
398 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
399 ipst->ips_ipcl_conn_fanout_size =
400 ipcl_conn_hash_maxsize;
401 }
402 }
403
404 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
405 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
406 break;
407 }
408 }
409 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
410 /* Out of range, use the 2^16 value */
411 ipst->ips_ipcl_conn_fanout_size = sizes[16];
412 }
413
414 /* Take values from /etc/system */
415 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
416 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
417 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
418 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
419
420 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
421
422 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
423 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
424
425 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
426 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
427 MUTEX_DEFAULT, NULL);
428 }
429
430 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
431 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
432
433 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
434 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
435 MUTEX_DEFAULT, NULL);
436 }
437
438 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
439 sizeof (connf_t), KM_SLEEP);
440 for (i = 0; i < IPPROTO_MAX; i++) {
441 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
442 MUTEX_DEFAULT, NULL);
443 }
444
445 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
446 sizeof (connf_t), KM_SLEEP);
447 for (i = 0; i < IPPROTO_MAX; i++) {
448 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
449 MUTEX_DEFAULT, NULL);
450 }
451
452 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
453 mutex_init(&ipst->ips_rts_clients->connf_lock,
454 NULL, MUTEX_DEFAULT, NULL);
455
456 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
457 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
458 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
459 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
460 MUTEX_DEFAULT, NULL);
461 }
462
463 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
464 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
465 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
466 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
467 MUTEX_DEFAULT, NULL);
468 }
469
470 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
471 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
472 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
473 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
474 MUTEX_DEFAULT, NULL);
475 }
476
477 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
478 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
479 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
480 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
481 NULL, MUTEX_DEFAULT, NULL);
482 }
483 }
484
485 void
ipcl_g_destroy(void)486 ipcl_g_destroy(void)
487 {
488 kmem_cache_destroy(ip_conn_cache);
489 kmem_cache_destroy(tcp_conn_cache);
490 kmem_cache_destroy(udp_conn_cache);
491 kmem_cache_destroy(rawip_conn_cache);
492 kmem_cache_destroy(rts_conn_cache);
493 }
494
495 /*
496 * All user-level and kernel use of the stack must be gone
497 * by now.
498 */
499 void
ipcl_destroy(ip_stack_t * ipst)500 ipcl_destroy(ip_stack_t *ipst)
501 {
502 int i;
503
504 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
505 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
506 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
507 }
508 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
509 sizeof (connf_t));
510 ipst->ips_ipcl_conn_fanout = NULL;
511
512 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
513 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
514 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
515 }
516 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
517 sizeof (connf_t));
518 ipst->ips_ipcl_bind_fanout = NULL;
519
520 for (i = 0; i < IPPROTO_MAX; i++) {
521 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
522 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
523 }
524 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
525 IPPROTO_MAX * sizeof (connf_t));
526 ipst->ips_ipcl_proto_fanout_v4 = NULL;
527
528 for (i = 0; i < IPPROTO_MAX; i++) {
529 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
530 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
531 }
532 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
533 IPPROTO_MAX * sizeof (connf_t));
534 ipst->ips_ipcl_proto_fanout_v6 = NULL;
535
536 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
537 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
538 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
539 }
540 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
541 sizeof (connf_t));
542 ipst->ips_ipcl_udp_fanout = NULL;
543
544 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
545 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
546 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
547 }
548 kmem_free(ipst->ips_ipcl_iptun_fanout,
549 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
550 ipst->ips_ipcl_iptun_fanout = NULL;
551
552 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
553 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
554 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
555 }
556 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
557 sizeof (connf_t));
558 ipst->ips_ipcl_raw_fanout = NULL;
559
560 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
561 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
562 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
563 }
564 kmem_free(ipst->ips_ipcl_globalhash_fanout,
565 sizeof (connf_t) * CONN_G_HASH_SIZE);
566 ipst->ips_ipcl_globalhash_fanout = NULL;
567
568 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
569 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
570 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
571 ipst->ips_rts_clients = NULL;
572 }
573
/*
 * conn creation routine. initialize the conn, sets the reference
 * and inserts it in the global hash table.
 */
conn_t *
ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
{
	conn_t *connp;
	struct kmem_cache *conn_cache;

	switch (type) {
	case IPCL_SCTPCONN:
		/*
		 * SCTP is special-cased: its conns come from sctp's own
		 * cache and are set up by sctp_conn_init().  Note that
		 * conn_ref is not set to 1 here, unlike the common path
		 * below -- presumably handled by the sctp code; TODO
		 * confirm.
		 */
		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
			return (NULL);
		sctp_conn_init(connp);
		netstack_hold(ns);
		connp->conn_netstack = ns;
		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
		connp->conn_ixa->ixa_conn_id = (long)connp;
		ipcl_globalhash_insert(connp);
		return (connp);

	case IPCL_TCPCONN:
		conn_cache = tcp_conn_cache;
		break;

	case IPCL_UDPCONN:
		conn_cache = udp_conn_cache;
		break;

	case IPCL_RAWIPCONN:
		conn_cache = rawip_conn_cache;
		break;

	case IPCL_RTSCONN:
		conn_cache = rts_conn_cache;
		break;

	case IPCL_IPCCONN:
		conn_cache = ip_conn_cache;
		break;

	default:
		/*
		 * Unknown type: DEBUG kernels assert here.  NOTE(review):
		 * on non-DEBUG kernels this falls through and passes a
		 * NULL cache to kmem_cache_alloc() below.
		 */
		conn_cache = NULL;
		connp = NULL;
		ASSERT(0);
	}

	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
		return (NULL);

	/* Initial reference for the caller, plus a hold on the netstack. */
	connp->conn_ref = 1;
	netstack_hold(ns);
	connp->conn_netstack = ns;
	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
	connp->conn_ixa->ixa_conn_id = (long)connp;
	ipcl_globalhash_insert(connp);
	return (connp);
}
633
/*
 * Final teardown of a conn_t, called once conn_ref has dropped to zero.
 * Releases credentials, cached headers, IPsec state and the global-hash
 * linkage, drops the netstack hold taken at create time, and returns
 * the conn to the kmem cache indicated by conn_flags.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t *mp;
	netstack_t *ns = connp->conn_netstack;

	/* No one may hold the lock, a reference, or a pending ioctl. */
	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached IP/transport header template, if any. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release latched IPsec state and policy references. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t *tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Save the timer mblk cache across the bzero below. */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/*
		 * Zero the tcp_t, then restore the two fields that persist
		 * across frees before handing the conn back to the cache.
		 */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	/* SCTP conns are returned through sctp's own free path. */
	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* Raw conns revert to their default protocol, ICMP. */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
753
754 /*
755 * Running in cluster mode - deregister listener information
756 */
757 static void
ipcl_conn_unlisten(conn_t * connp)758 ipcl_conn_unlisten(conn_t *connp)
759 {
760 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
761 ASSERT(connp->conn_lport != 0);
762
763 if (cl_inet_unlisten != NULL) {
764 sa_family_t addr_family;
765 uint8_t *laddrp;
766
767 if (connp->conn_ipversion == IPV6_VERSION) {
768 addr_family = AF_INET6;
769 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
770 } else {
771 addr_family = AF_INET;
772 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
773 }
774 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
775 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
776 }
777 connp->conn_flags &= ~IPCL_CL_LISTENER;
778 }
779
780 /*
781 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
782 * which table the conn belonged to). So for debugging we can see which hash
783 * table this connection was in.
784 */
/*
 * Unlink 'connp' from whatever fanout bucket it is currently on, taking
 * the bucket lock (conn_lock must NOT be held, per the lock ordering
 * rules above), dropping the classifier's reference, and deregistering
 * a clustering listener if needed.  A no-op if the conn is not hashed
 * (conn_fanout == NULL).
 */
#define IPCL_HASH_REMOVE(connp) { \
	connf_t *connfp = (connp)->conn_fanout; \
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
	if (connfp != NULL) { \
		mutex_enter(&connfp->connf_lock); \
		if ((connp)->conn_next != NULL) \
			(connp)->conn_next->conn_prev = \
			    (connp)->conn_prev; \
		if ((connp)->conn_prev != NULL) \
			(connp)->conn_prev->conn_next = \
			    (connp)->conn_next; \
		else \
			connfp->connf_head = (connp)->conn_next; \
		(connp)->conn_fanout = NULL; \
		(connp)->conn_next = NULL; \
		(connp)->conn_prev = NULL; \
		(connp)->conn_flags |= IPCL_REMOVED; \
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
			ipcl_conn_unlisten((connp)); \
		CONN_DEC_REF((connp)); \
		mutex_exit(&connfp->connf_lock); \
	} \
}
808
809 void
ipcl_hash_remove(conn_t * connp)810 ipcl_hash_remove(conn_t *connp)
811 {
812 uint8_t protocol = connp->conn_proto;
813
814 IPCL_HASH_REMOVE(connp);
815 if (protocol == IPPROTO_RSVP)
816 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
817 }
818
819 /*
820 * The whole purpose of this function is allow removal of
821 * a conn_t from the connected hash for timewait reclaim.
822 * This is essentially a TW reclaim fastpath where timewait
823 * collector checks under fanout lock (so no one else can
824 * get access to the conn_t) that refcnt is 2 i.e. one for
825 * TCP and one for the classifier hash list. If ref count
826 * is indeed 2, we can just remove the conn under lock and
827 * avoid cleaning up the conn under squeue. This gives us
828 * improved performance.
829 */
830 void
ipcl_hash_remove_locked(conn_t * connp,connf_t * connfp)831 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
832 {
833 ASSERT(MUTEX_HELD(&connfp->connf_lock));
834 ASSERT(MUTEX_HELD(&connp->conn_lock));
835 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
836
837 if ((connp)->conn_next != NULL) {
838 (connp)->conn_next->conn_prev = (connp)->conn_prev;
839 }
840 if ((connp)->conn_prev != NULL) {
841 (connp)->conn_prev->conn_next = (connp)->conn_next;
842 } else {
843 connfp->connf_head = (connp)->conn_next;
844 }
845 (connp)->conn_fanout = NULL;
846 (connp)->conn_next = NULL;
847 (connp)->conn_prev = NULL;
848 (connp)->conn_flags |= IPCL_REMOVED;
849 ASSERT((connp)->conn_ref == 2);
850 (connp)->conn_ref--;
851 }
852
/*
 * Prepend 'connp' onto bucket 'connfp' and mark it IPCL_CONNECTED.
 * Caller already holds connf_lock; a classifier reference is taken.
 */
#define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \
	ASSERT((connp)->conn_fanout == NULL); \
	ASSERT((connp)->conn_next == NULL); \
	ASSERT((connp)->conn_prev == NULL); \
	if ((connfp)->connf_head != NULL) { \
		(connfp)->connf_head->conn_prev = (connp); \
		(connp)->conn_next = (connfp)->connf_head; \
	} \
	(connp)->conn_fanout = (connfp); \
	(connfp)->connf_head = (connp); \
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
	    IPCL_CONNECTED; \
	CONN_INC_REF(connp); \
}
867
/*
 * Move 'connp' (possibly hashed elsewhere) onto connected bucket
 * 'connfp', taking the bucket lock around the insertion.
 */
#define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \
	IPCL_HASH_REMOVE((connp)); \
	mutex_enter(&(connfp)->connf_lock); \
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \
	mutex_exit(&(connfp)->connf_lock); \
}
874
/*
 * Insert 'connp' into bound bucket 'connfp', keeping conns bound to a
 * specific local address ahead of wildcard (v4 "any") binds: the new
 * entry goes immediately before the first v4-any entry, or at the tail
 * if there is none.  Takes the bucket lock and a classifier reference,
 * and marks the conn IPCL_BOUND.
 */
#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
	conn_t *pconnp = NULL, *nconnp; \
	IPCL_HASH_REMOVE((connp)); \
	mutex_enter(&(connfp)->connf_lock); \
	nconnp = (connfp)->connf_head; \
	while (nconnp != NULL && \
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
		pconnp = nconnp; \
		nconnp = nconnp->conn_next; \
	} \
	if (pconnp != NULL) { \
		pconnp->conn_next = (connp); \
		(connp)->conn_prev = pconnp; \
	} else { \
		(connfp)->connf_head = (connp); \
	} \
	if (nconnp != NULL) { \
		(connp)->conn_next = nconnp; \
		nconnp->conn_prev = (connp); \
	} \
	(connp)->conn_fanout = (connfp); \
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
	    IPCL_BOUND; \
	CONN_INC_REF(connp); \
	mutex_exit(&(connfp)->connf_lock); \
}
901
/*
 * Insert a wildcard-bound conn.  An IPv4-mapped conn is placed just
 * before the first unspecified-address conn of the same zone, so that
 * v4-specific binds are found first; otherwise the conn is appended at
 * the tail.  Takes the bucket lock and a classifier reference, and
 * marks the conn IPCL_BOUND.
 * NOTE(review): the "prev = next->conn_prev" assignment below appears
 * to be a no-op (prev already equals next->conn_prev at that point);
 * preserved as-is.
 */
#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
	conn_t **list, *prev, *next; \
	boolean_t isv4mapped = \
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
	IPCL_HASH_REMOVE((connp)); \
	mutex_enter(&(connfp)->connf_lock); \
	list = &(connfp)->connf_head; \
	prev = NULL; \
	while ((next = *list) != NULL) { \
		if (isv4mapped && \
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
		    connp->conn_zoneid == next->conn_zoneid) { \
			(connp)->conn_next = next; \
			if (prev != NULL) \
				prev = next->conn_prev; \
			next->conn_prev = (connp); \
			break; \
		} \
		list = &next->conn_next; \
		prev = next; \
	} \
	(connp)->conn_prev = prev; \
	*list = (connp); \
	(connp)->conn_fanout = (connfp); \
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
	    IPCL_BOUND; \
	CONN_INC_REF((connp)); \
	mutex_exit(&(connfp)->connf_lock); \
}
931
/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD so the insertion
 * logic can be used by callers outside this file.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
937
/*
 * Because the classifier is used to classify inbound packets, the destination
 * address is meant to be our local tunnel address (tunnel source), and the
 * source the remote tunnel address (tunnel destination).
 *
 * Note that conn_proto can't be used for fanout since the upper protocol
 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 *
 * Returns the matching conn_t with a reference held, or NULL.
 */
conn_t *
ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*connp;

	/* first look for IPv4 tunnel links */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
			break;
	}
	/* connf_lock is intentionally still held when jumping to "done" */
	if (connp != NULL)
		goto done;

	mutex_exit(&connfp->connf_lock);

	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
	    INADDR_ANY)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
			break;
	}
done:
	/* Take a hold on the match before dropping the bucket lock. */
	if (connp != NULL)
		CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
980
981 conn_t *
ipcl_iptun_classify_v6(in6_addr_t * src,in6_addr_t * dst,ip_stack_t * ipst)982 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
983 {
984 connf_t *connfp;
985 conn_t *connp;
986
987 /* Look for an IPv6 tunnel link */
988 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
989 mutex_enter(&connfp->connf_lock);
990 for (connp = connfp->connf_head; connp != NULL;
991 connp = connp->conn_next) {
992 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
993 CONN_INC_REF(connp);
994 break;
995 }
996 }
997 mutex_exit(&connfp->connf_lock);
998 return (connp);
999 }
1000
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 *
 * Returns 0 on success, or EADDRNOTAVAIL if another raw socket in the
 * same zone and family already claims the port on an overlapping
 * local address.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Local addresses overlap when either side is a wildcard
		 * (v6 unspecified or v4-mapped any) or when the two
		 * addresses are equal.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Pick the insert flavor from how much of the address pair is
	 * specified: connected (both), bound (local only), else wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1050
1051 static int
ipcl_iptun_hash_insert(conn_t * connp,ip_stack_t * ipst)1052 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1053 {
1054 connf_t *connfp;
1055 conn_t *tconnp;
1056 ipaddr_t laddr = connp->conn_laddr_v4;
1057 ipaddr_t faddr = connp->conn_faddr_v4;
1058
1059 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1060 mutex_enter(&connfp->connf_lock);
1061 for (tconnp = connfp->connf_head; tconnp != NULL;
1062 tconnp = tconnp->conn_next) {
1063 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1064 /* A tunnel is already bound to these addresses. */
1065 mutex_exit(&connfp->connf_lock);
1066 return (EADDRINUSE);
1067 }
1068 }
1069 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1070 mutex_exit(&connfp->connf_lock);
1071 return (0);
1072 }
1073
1074 static int
ipcl_iptun_hash_insert_v6(conn_t * connp,ip_stack_t * ipst)1075 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1076 {
1077 connf_t *connfp;
1078 conn_t *tconnp;
1079 in6_addr_t *laddr = &connp->conn_laddr_v6;
1080 in6_addr_t *faddr = &connp->conn_faddr_v6;
1081
1082 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1083 mutex_enter(&connfp->connf_lock);
1084 for (tconnp = connfp->connf_head; tconnp != NULL;
1085 tconnp = tconnp->conn_next) {
1086 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1087 /* A tunnel is already bound to these addresses. */
1088 mutex_exit(&connfp->connf_lock);
1089 return (EADDRINUSE);
1090 }
1091 }
1092 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1093 mutex_exit(&connfp->connf_lock);
1094 return (0);
1095 }
1096
1097 /*
1098 * Check for a MAC exemption conflict on a labeled system. Note that for
1099 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1100 * transport layer. This check is for binding all other protocols.
1101 *
1102 * Returns true if there's a conflict.
1103 */
1104 static boolean_t
check_exempt_conflict_v4(conn_t * connp,ip_stack_t * ipst)1105 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1106 {
1107 connf_t *connfp;
1108 conn_t *tconn;
1109
1110 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1111 mutex_enter(&connfp->connf_lock);
1112 for (tconn = connfp->connf_head; tconn != NULL;
1113 tconn = tconn->conn_next) {
1114 /* We don't allow v4 fallback for v6 raw socket */
1115 if (connp->conn_family != tconn->conn_family)
1116 continue;
1117 /* If neither is exempt, then there's no conflict */
1118 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1119 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1120 continue;
1121 /* We are only concerned about sockets for a different zone */
1122 if (connp->conn_zoneid == tconn->conn_zoneid)
1123 continue;
1124 /* If both are bound to different specific addrs, ok */
1125 if (connp->conn_laddr_v4 != INADDR_ANY &&
1126 tconn->conn_laddr_v4 != INADDR_ANY &&
1127 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1128 continue;
1129 /* These two conflict; fail */
1130 break;
1131 }
1132 mutex_exit(&connfp->connf_lock);
1133 return (tconn != NULL);
1134 }
1135
1136 static boolean_t
check_exempt_conflict_v6(conn_t * connp,ip_stack_t * ipst)1137 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1138 {
1139 connf_t *connfp;
1140 conn_t *tconn;
1141
1142 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1143 mutex_enter(&connfp->connf_lock);
1144 for (tconn = connfp->connf_head; tconn != NULL;
1145 tconn = tconn->conn_next) {
1146 /* We don't allow v4 fallback for v6 raw socket */
1147 if (connp->conn_family != tconn->conn_family)
1148 continue;
1149 /* If neither is exempt, then there's no conflict */
1150 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1151 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1152 continue;
1153 /* We are only concerned about sockets for a different zone */
1154 if (connp->conn_zoneid == tconn->conn_zoneid)
1155 continue;
1156 /* If both are bound to different addrs, ok */
1157 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1158 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1159 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1160 &tconn->conn_laddr_v6))
1161 continue;
1162 /* These two conflict; fail */
1163 break;
1164 }
1165 mutex_exit(&connfp->connf_lock);
1166 return (tconn != NULL);
1167 }
1168
1169 /*
1170 * (v4, v6) bind hash insertion routines
1171 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1172 */
1173
1174 int
ipcl_bind_insert(conn_t * connp)1175 ipcl_bind_insert(conn_t *connp)
1176 {
1177 if (connp->conn_ipversion == IPV6_VERSION)
1178 return (ipcl_bind_insert_v6(connp));
1179 else
1180 return (ipcl_bind_insert_v4(connp));
1181 }
1182
/*
 * Insert an IPv4 conn into the fanout appropriate for its protocol; the
 * caller has already set conn_proto, conn_laddr_v6 and conn_lport.
 * IP tunnels go to the iptun fanout, TCP to the bind fanout, SCTP to the
 * raw fanout, and UDP/other protocols to the udp/protocol fanouts with
 * the insert flavor (connected/bound/wildcard) chosen by how much of the
 * address pair is specified.  Returns 0 or an errno such as EADDRINUSE.
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Non-port protocols check MAC-exempt bind conflicts here,
		 * then share the UDP insertion logic below.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * An RSVP bind affects how the ills deliver input packets;
		 * refresh their input functions.
		 */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Invoke the cluster listen callback, if one is registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets go to the raw fanout; at most one per port. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1247
/*
 * IPv6 counterpart of ipcl_bind_insert_v4; same fanout selection, but
 * address specificity is tested with the IN6_IS_ADDR_UNSPECIFIED macros.
 * Returns 0 or an errno such as EADDRINUSE.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Non-port protocols check MAC-exempt bind conflicts here,
		 * then share the UDP insertion logic below.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Invoke the cluster listen callback, if one is registered. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			/*
			 * An AF_INET6 socket may actually be bound over
			 * IPv4; pick the family from conn_ipversion.
			 */
			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets go to the raw fanout; at most one per port. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1320
1321 /*
1322 * ipcl_conn_hash insertion routines.
1323 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1324 */
1325
1326 int
ipcl_conn_insert(conn_t * connp)1327 ipcl_conn_insert(conn_t *connp)
1328 {
1329 if (connp->conn_ipversion == IPV6_VERSION)
1330 return (ipcl_conn_insert_v6(connp));
1331 else
1332 return (ipcl_conn_insert_v4(connp));
1333 }
1334
/*
 * Insert a fully-specified (connected) IPv4 conn into the appropriate
 * fanout; conn_proto and the addresses/ports have already been set by
 * the caller.  TCP goes to the conn fanout after a duplicate-tuple
 * check; SCTP to the raw fanout; everything else to the udp/protocol
 * fanouts.  Returns 0 or an errno such as EADDRINUSE.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The bucket lock is dropped around the removal;
			 * IPCL_HASH_REMOVE operates on the conn's current
			 * fanout, which may be a different bucket.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insert flavor follows how much of the pair is specified. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1432
/*
 * IPv6 counterpart of ipcl_conn_insert_v4.  The TCP duplicate-tuple
 * check additionally honours conn_bound_if: an existing conn only
 * conflicts if it is unbound or bound to the same interface.
 * Returns 0 or an errno such as EADDRINUSE.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The bucket lock is dropped around the removal;
			 * IPCL_HASH_REMOVE operates on the conn's current
			 * fanout, which may be a different bucket.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings of
		 * non-port protocols; port transports do this upstairs.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insert flavor follows how much of the pair is specified. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1520
1521 /*
1522 * v4 packet classifying function. looks up the fanout table to
1523 * find the conn, the packet belongs to. returns the conn with
1524 * the reference held, null otherwise.
1525 *
1526 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1527 * Lookup" comment block are applied. Labels are also checked as described
1528 * above. If the packet is from the inside (looped back), and is from the same
1529 * zone, then label checks are omitted.
1530 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;
	int	ifindex = ira->ira_ruifindex;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * Both the TCP and UDP headers start with the source/destination
	 * port pair, so "up" can be computed once for either protocol.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First look for a fully-bound (connected) TCP match. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Besides the 5-tuple, the conn must either be in
			 * the packet's zone, be an all-zones conn, or be
			 * MAC-exempt on a shared address.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the listener bind hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* UDP classifies directly on the local-port fanout. */
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled packets are classified by the iptun fanout. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	/* Unhandled protocol or no match found. */
	return (NULL);
}
1677
/*
 * v6 counterpart of ipcl_classify_v4: map an inbound IPv6 packet to its
 * conn_t, returned with a reference held, or NULL.  The same search and
 * label-check rules apply.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t	*tcpha;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;
	int	ifindex = ira->ira_ruifindex;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* First look for a fully-bound (connected) TCP match. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Besides the 5-tuple, the conn must either be in
			 * the packet's zone, be an all-zones conn, or be
			 * MAC-exempt on a shared address.
			 */
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the listener bind hash. */
		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Label check for the listener, as in the v4 path. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* UDP classifies directly on the local-port fanout. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled packets are classified by the iptun fanout. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	/* Unhandled protocol or no match found. */
	return (NULL);
}
1819
1820 /*
1821 * wrapper around ipcl_classify_(v4,v6) routines.
1822 */
1823 conn_t *
ipcl_classify(mblk_t * mp,ip_recv_attr_t * ira,ip_stack_t * ipst)1824 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1825 {
1826 if (ira->ira_flags & IRAF_IS_IPV4) {
1827 return (ipcl_classify_v4(mp, ira->ira_protocol,
1828 ira->ira_ip_hdr_length, ira, ipst));
1829 } else {
1830 return (ipcl_classify_v6(mp, ira->ira_protocol,
1831 ira->ira_ip_hdr_length, ira, ipst));
1832 }
1833 }
1834
/*
 * Only used to classify SCTP RAW sockets.
 *
 * Two passes are made over the raw fanout: first the bucket hashed on
 * the packet's local port, matching connected or bound sockets; then the
 * zero-port bucket for wildcard matches.  The result is returned with a
 * reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*connp;
	in_port_t	lport;
	int	ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* "ports" holds both 16-bit ports; the local port is the second. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		/*
		 * A conn with a specific foreign address must match the
		 * full tuple; otherwise a bound (local-only) match will do.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/* Zone check, with the usual MAC-exempt shared-addr escape. */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On a labeled system the packet's label must admit this conn. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	/* "found" is reached with connfp->connf_lock still held. */
	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1941
/*
 * kmem cache constructor for TCP conns.  The cache item is an itc_t
 * immediately followed by the tcp_t; the two are zeroed and cross-linked
 * (conn_tcp / tcp_connp), the conn's locks and cvs are initialized, and
 * a timer mblk and transmit attributes (conn_ixa) are preallocated.
 * Returns 0, or ENOMEM if either allocation fails.
 *
 * NOTE(review): on the ENOMEM paths the mutex/cvs/rwlock initialized
 * above are not destroyed here — presumably safe because kmem will not
 * call the destructor for a failed constructor and these primitives
 * need no teardown on this platform; confirm.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];	/* tcp_t follows the itc_t */

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Release the timer mblk allocated above. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1975
1976 /* ARGSUSED */
1977 static void
tcp_conn_destructor(void * buf,void * cdrarg)1978 tcp_conn_destructor(void *buf, void *cdrarg)
1979 {
1980 itc_t *itc = (itc_t *)buf;
1981 conn_t *connp = &itc->itc_conn;
1982 tcp_t *tcp = (tcp_t *)&itc[1];
1983
1984 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1985 ASSERT(tcp->tcp_connp == connp);
1986 ASSERT(connp->conn_tcp == tcp);
1987 tcp_timermp_free(tcp);
1988 mutex_destroy(&connp->conn_lock);
1989 cv_destroy(&connp->conn_cv);
1990 cv_destroy(&connp->conn_sq_cv);
1991 rw_destroy(&connp->conn_ilg_lock);
1992
1993 /* Can be NULL if constructor failed */
1994 if (connp->conn_ixa != NULL) {
1995 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1996 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1997 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1998 ixa_refrele(connp->conn_ixa);
1999 }
2000 }
2001
2002 /* ARGSUSED */
2003 static int
ip_conn_constructor(void * buf,void * cdrarg,int kmflags)2004 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2005 {
2006 itc_t *itc = (itc_t *)buf;
2007 conn_t *connp = &itc->itc_conn;
2008
2009 bzero(connp, sizeof (conn_t));
2010 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2011 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2012 connp->conn_flags = IPCL_IPCCONN;
2013 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2014
2015 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2016 if (connp->conn_ixa == NULL)
2017 return (ENOMEM);
2018 connp->conn_ixa->ixa_refcnt = 1;
2019 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2020 return (0);
2021 }
2022
2023 /* ARGSUSED */
2024 static void
ip_conn_destructor(void * buf,void * cdrarg)2025 ip_conn_destructor(void *buf, void *cdrarg)
2026 {
2027 itc_t *itc = (itc_t *)buf;
2028 conn_t *connp = &itc->itc_conn;
2029
2030 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2031 ASSERT(connp->conn_priv == NULL);
2032 mutex_destroy(&connp->conn_lock);
2033 cv_destroy(&connp->conn_cv);
2034 rw_destroy(&connp->conn_ilg_lock);
2035
2036 /* Can be NULL if constructor failed */
2037 if (connp->conn_ixa != NULL) {
2038 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2039 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2040 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2041 ixa_refrele(connp->conn_ixa);
2042 }
2043 }
2044
2045 /* ARGSUSED */
2046 static int
udp_conn_constructor(void * buf,void * cdrarg,int kmflags)2047 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2048 {
2049 itc_t *itc = (itc_t *)buf;
2050 conn_t *connp = &itc->itc_conn;
2051 udp_t *udp = (udp_t *)&itc[1];
2052
2053 bzero(connp, sizeof (conn_t));
2054 bzero(udp, sizeof (udp_t));
2055
2056 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2057 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2058 connp->conn_udp = udp;
2059 connp->conn_flags = IPCL_UDPCONN;
2060 connp->conn_proto = IPPROTO_UDP;
2061 udp->udp_connp = connp;
2062 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2063 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2064 if (connp->conn_ixa == NULL)
2065 return (ENOMEM);
2066 connp->conn_ixa->ixa_refcnt = 1;
2067 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2068 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2069 return (0);
2070 }
2071
2072 /* ARGSUSED */
2073 static void
udp_conn_destructor(void * buf,void * cdrarg)2074 udp_conn_destructor(void *buf, void *cdrarg)
2075 {
2076 itc_t *itc = (itc_t *)buf;
2077 conn_t *connp = &itc->itc_conn;
2078 udp_t *udp = (udp_t *)&itc[1];
2079
2080 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2081 ASSERT(udp->udp_connp == connp);
2082 ASSERT(connp->conn_udp == udp);
2083 mutex_destroy(&connp->conn_lock);
2084 cv_destroy(&connp->conn_cv);
2085 rw_destroy(&connp->conn_ilg_lock);
2086
2087 /* Can be NULL if constructor failed */
2088 if (connp->conn_ixa != NULL) {
2089 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2090 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2091 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2092 ixa_refrele(connp->conn_ixa);
2093 }
2094 }
2095
2096 /* ARGSUSED */
2097 static int
rawip_conn_constructor(void * buf,void * cdrarg,int kmflags)2098 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2099 {
2100 itc_t *itc = (itc_t *)buf;
2101 conn_t *connp = &itc->itc_conn;
2102 icmp_t *icmp = (icmp_t *)&itc[1];
2103
2104 bzero(connp, sizeof (conn_t));
2105 bzero(icmp, sizeof (icmp_t));
2106
2107 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2108 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2109 connp->conn_icmp = icmp;
2110 connp->conn_flags = IPCL_RAWIPCONN;
2111 connp->conn_proto = IPPROTO_ICMP;
2112 icmp->icmp_connp = connp;
2113 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2114 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2115 if (connp->conn_ixa == NULL)
2116 return (ENOMEM);
2117 connp->conn_ixa->ixa_refcnt = 1;
2118 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2119 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2120 return (0);
2121 }
2122
2123 /* ARGSUSED */
2124 static void
rawip_conn_destructor(void * buf,void * cdrarg)2125 rawip_conn_destructor(void *buf, void *cdrarg)
2126 {
2127 itc_t *itc = (itc_t *)buf;
2128 conn_t *connp = &itc->itc_conn;
2129 icmp_t *icmp = (icmp_t *)&itc[1];
2130
2131 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2132 ASSERT(icmp->icmp_connp == connp);
2133 ASSERT(connp->conn_icmp == icmp);
2134 mutex_destroy(&connp->conn_lock);
2135 cv_destroy(&connp->conn_cv);
2136 rw_destroy(&connp->conn_ilg_lock);
2137
2138 /* Can be NULL if constructor failed */
2139 if (connp->conn_ixa != NULL) {
2140 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2141 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2142 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2143 ixa_refrele(connp->conn_ixa);
2144 }
2145 }
2146
2147 /* ARGSUSED */
2148 static int
rts_conn_constructor(void * buf,void * cdrarg,int kmflags)2149 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2150 {
2151 itc_t *itc = (itc_t *)buf;
2152 conn_t *connp = &itc->itc_conn;
2153 rts_t *rts = (rts_t *)&itc[1];
2154
2155 bzero(connp, sizeof (conn_t));
2156 bzero(rts, sizeof (rts_t));
2157
2158 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2159 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2160 connp->conn_rts = rts;
2161 connp->conn_flags = IPCL_RTSCONN;
2162 rts->rts_connp = connp;
2163 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2164 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2165 if (connp->conn_ixa == NULL)
2166 return (ENOMEM);
2167 connp->conn_ixa->ixa_refcnt = 1;
2168 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2169 return (0);
2170 }
2171
2172 /* ARGSUSED */
2173 static void
rts_conn_destructor(void * buf,void * cdrarg)2174 rts_conn_destructor(void *buf, void *cdrarg)
2175 {
2176 itc_t *itc = (itc_t *)buf;
2177 conn_t *connp = &itc->itc_conn;
2178 rts_t *rts = (rts_t *)&itc[1];
2179
2180 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2181 ASSERT(rts->rts_connp == connp);
2182 ASSERT(connp->conn_rts == rts);
2183 mutex_destroy(&connp->conn_lock);
2184 cv_destroy(&connp->conn_cv);
2185 rw_destroy(&connp->conn_ilg_lock);
2186
2187 /* Can be NULL if constructor failed */
2188 if (connp->conn_ixa != NULL) {
2189 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2190 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2191 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2192 ixa_refrele(connp->conn_ixa);
2193 }
2194 }
2195
2196 /*
2197 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2198 * in the conn_t.
2199 *
2200 * Below we list all the pointers in the conn_t as a documentation aid.
2201 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2202 * If you add any pointers to the conn_t please add an ASSERT here
2203 * and #ifdef it out if it can't be actually asserted to be NULL.
2204 * In any case, we bzero most of the conn_t at the end of the function.
2205 */
2206 void
ipcl_conn_cleanup(conn_t * connp)2207 ipcl_conn_cleanup(conn_t *connp)
2208 {
2209 ip_xmit_attr_t *ixa;
2210
2211 ASSERT(connp->conn_latch == NULL);
2212 ASSERT(connp->conn_latch_in_policy == NULL);
2213 ASSERT(connp->conn_latch_in_action == NULL);
2214 #ifdef notdef
2215 ASSERT(connp->conn_rq == NULL);
2216 ASSERT(connp->conn_wq == NULL);
2217 #endif
2218 ASSERT(connp->conn_cred == NULL);
2219 ASSERT(connp->conn_g_fanout == NULL);
2220 ASSERT(connp->conn_g_next == NULL);
2221 ASSERT(connp->conn_g_prev == NULL);
2222 ASSERT(connp->conn_policy == NULL);
2223 ASSERT(connp->conn_fanout == NULL);
2224 ASSERT(connp->conn_next == NULL);
2225 ASSERT(connp->conn_prev == NULL);
2226 ASSERT(connp->conn_oper_pending_ill == NULL);
2227 ASSERT(connp->conn_ilg == NULL);
2228 ASSERT(connp->conn_drain_next == NULL);
2229 ASSERT(connp->conn_drain_prev == NULL);
2230 #ifdef notdef
2231 /* conn_idl is not cleared when removed from idl list */
2232 ASSERT(connp->conn_idl == NULL);
2233 #endif
2234 ASSERT(connp->conn_ipsec_opt_mp == NULL);
2235 #ifdef notdef
2236 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2237 ASSERT(connp->conn_netstack == NULL);
2238 #endif
2239
2240 ASSERT(connp->conn_helper_info == NULL);
2241 ASSERT(connp->conn_ixa != NULL);
2242 ixa = connp->conn_ixa;
2243 ASSERT(ixa->ixa_refcnt == 1);
2244 /* Need to preserve ixa_protocol */
2245 ixa_cleanup(ixa);
2246 ixa->ixa_flags = 0;
2247
2248 /* Clear out the conn_t fields that are not preserved */
2249 bzero(&connp->conn_start_clr,
2250 sizeof (conn_t) -
2251 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2252 }
2253
2254 /*
2255 * All conns are inserted in a global multi-list for the benefit of
2256 * walkers. The walk is guaranteed to walk all open conns at the time
2257 * of the start of the walk exactly once. This property is needed to
2258 * achieve some cleanups during unplumb of interfaces. This is achieved
2259 * as follows.
2260 *
2261 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2262 * call the insert and delete functions below at creation and deletion
2263 * time respectively. The conn never moves or changes its position in this
2264 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2265 * won't increase due to walkers, once the conn deletion has started. Note
2266 * that we can't remove the conn from the global list and then wait for
2267 * the refcnt to drop to zero, since walkers would then see a truncated
2268 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2269 * conns until ip_open is ready to make them globally visible.
2270 * The global round robin multi-list locks are held only to get the
2271 * next member/insertion/deletion and contention should be negligible
2272 * if the multi-list is much greater than the number of cpus.
2273 */
2274 void
ipcl_globalhash_insert(conn_t * connp)2275 ipcl_globalhash_insert(conn_t *connp)
2276 {
2277 int index;
2278 struct connf_s *connfp;
2279 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
2280
2281 /*
2282 * No need for atomic here. Approximate even distribution
2283 * in the global lists is sufficient.
2284 */
2285 ipst->ips_conn_g_index++;
2286 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2287
2288 connp->conn_g_prev = NULL;
2289 /*
2290 * Mark as INCIPIENT, so that walkers will ignore this
2291 * for now, till ip_open is ready to make it visible globally.
2292 */
2293 connp->conn_state_flags |= CONN_INCIPIENT;
2294
2295 connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2296 /* Insert at the head of the list */
2297 mutex_enter(&connfp->connf_lock);
2298 connp->conn_g_next = connfp->connf_head;
2299 if (connp->conn_g_next != NULL)
2300 connp->conn_g_next->conn_g_prev = connp;
2301 connfp->connf_head = connp;
2302
2303 /* The fanout bucket this conn points to */
2304 connp->conn_g_fanout = connfp;
2305
2306 mutex_exit(&connfp->connf_lock);
2307 }
2308
2309 void
ipcl_globalhash_remove(conn_t * connp)2310 ipcl_globalhash_remove(conn_t *connp)
2311 {
2312 struct connf_s *connfp;
2313
2314 /*
2315 * We were never inserted in the global multi list.
2316 * IPCL_NONE variety is never inserted in the global multilist
2317 * since it is presumed to not need any cleanup and is transient.
2318 */
2319 if (connp->conn_g_fanout == NULL)
2320 return;
2321
2322 connfp = connp->conn_g_fanout;
2323 mutex_enter(&connfp->connf_lock);
2324 if (connp->conn_g_prev != NULL)
2325 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2326 else
2327 connfp->connf_head = connp->conn_g_next;
2328 if (connp->conn_g_next != NULL)
2329 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2330 mutex_exit(&connfp->connf_lock);
2331
2332 /* Better to stumble on a null pointer than to corrupt memory */
2333 connp->conn_g_next = NULL;
2334 connp->conn_g_prev = NULL;
2335 connp->conn_g_fanout = NULL;
2336 }
2337
2338 /*
2339 * Walk the list of all conn_t's in the system, calling the function provided
2340 * With the specified argument for each.
2341 * Applies to both IPv4 and IPv6.
2342 *
2343 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2344 * conn_oper_pending_ill). To guard against stale pointers
2345 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2346 * unplumbed or removed. New conn_t's that are created while we are walking
2347 * may be missed by this walk, because they are not necessarily inserted
2348 * at the tail of the list. They are new conn_t's and thus don't have any
2349 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2350 * is created to the struct that is going away.
2351 */
2352 void
ipcl_walk(pfv_t func,void * arg,ip_stack_t * ipst)2353 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2354 {
2355 int i;
2356 conn_t *connp;
2357 conn_t *prev_connp;
2358
2359 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2360 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2361 prev_connp = NULL;
2362 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2363 while (connp != NULL) {
2364 mutex_enter(&connp->conn_lock);
2365 if (connp->conn_state_flags &
2366 (CONN_CONDEMNED | CONN_INCIPIENT)) {
2367 mutex_exit(&connp->conn_lock);
2368 connp = connp->conn_g_next;
2369 continue;
2370 }
2371 CONN_INC_REF_LOCKED(connp);
2372 mutex_exit(&connp->conn_lock);
2373 mutex_exit(
2374 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2375 (*func)(connp, arg);
2376 if (prev_connp != NULL)
2377 CONN_DEC_REF(prev_connp);
2378 mutex_enter(
2379 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2380 prev_connp = connp;
2381 connp = connp->conn_g_next;
2382 }
2383 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2384 if (prev_connp != NULL)
2385 CONN_DEC_REF(prev_connp);
2386 }
2387 }
2388
2389 /*
2390 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2391 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2392 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2393 * (peer tcp in ESTABLISHED state).
2394 */
2395 conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t * connp,ipha_t * ipha,tcpha_t * tcpha,ip_stack_t * ipst)2396 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2397 ip_stack_t *ipst)
2398 {
2399 uint32_t ports;
2400 uint16_t *pports = (uint16_t *)&ports;
2401 connf_t *connfp;
2402 conn_t *tconnp;
2403 boolean_t zone_chk;
2404
2405 /*
2406 * If either the source of destination address is loopback, then
2407 * both endpoints must be in the same Zone. Otherwise, both of
2408 * the addresses are system-wide unique (tcp is in ESTABLISHED
2409 * state) and the endpoints may reside in different Zones.
2410 */
2411 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2412 ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2413
2414 pports[0] = tcpha->tha_fport;
2415 pports[1] = tcpha->tha_lport;
2416
2417 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2418 ports, ipst)];
2419
2420 mutex_enter(&connfp->connf_lock);
2421 for (tconnp = connfp->connf_head; tconnp != NULL;
2422 tconnp = tconnp->conn_next) {
2423
2424 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2425 ipha->ipha_dst, ipha->ipha_src, ports) &&
2426 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2427 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2428
2429 ASSERT(tconnp != connp);
2430 CONN_INC_REF(tconnp);
2431 mutex_exit(&connfp->connf_lock);
2432 return (tconnp);
2433 }
2434 }
2435 mutex_exit(&connfp->connf_lock);
2436 return (NULL);
2437 }
2438
2439 /*
2440 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2441 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2442 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2443 * (peer tcp in ESTABLISHED state).
2444 */
2445 conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t * connp,ip6_t * ip6h,tcpha_t * tcpha,ip_stack_t * ipst)2446 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2447 ip_stack_t *ipst)
2448 {
2449 uint32_t ports;
2450 uint16_t *pports = (uint16_t *)&ports;
2451 connf_t *connfp;
2452 conn_t *tconnp;
2453 boolean_t zone_chk;
2454
2455 /*
2456 * If either the source of destination address is loopback, then
2457 * both endpoints must be in the same Zone. Otherwise, both of
2458 * the addresses are system-wide unique (tcp is in ESTABLISHED
2459 * state) and the endpoints may reside in different Zones. We
2460 * don't do Zone check for link local address(es) because the
2461 * current Zone implementation treats each link local address as
2462 * being unique per system node, i.e. they belong to global Zone.
2463 */
2464 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2465 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2466
2467 pports[0] = tcpha->tha_fport;
2468 pports[1] = tcpha->tha_lport;
2469
2470 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2471 ports, ipst)];
2472
2473 mutex_enter(&connfp->connf_lock);
2474 for (tconnp = connfp->connf_head; tconnp != NULL;
2475 tconnp = tconnp->conn_next) {
2476
2477 /* We skip conn_bound_if check here as this is loopback tcp */
2478 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2479 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2480 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2481 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2482
2483 ASSERT(tconnp != connp);
2484 CONN_INC_REF(tconnp);
2485 mutex_exit(&connfp->connf_lock);
2486 return (tconnp);
2487 }
2488 }
2489 mutex_exit(&connfp->connf_lock);
2490 return (NULL);
2491 }
2492
2493 /*
2494 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2495 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2496 * Only checks for connected entries i.e. no INADDR_ANY checks.
2497 */
2498 conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t * ipha,tcpha_t * tcpha,int min_state,ip_stack_t * ipst)2499 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2500 ip_stack_t *ipst)
2501 {
2502 uint32_t ports;
2503 uint16_t *pports;
2504 connf_t *connfp;
2505 conn_t *tconnp;
2506
2507 pports = (uint16_t *)&ports;
2508 pports[0] = tcpha->tha_fport;
2509 pports[1] = tcpha->tha_lport;
2510
2511 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2512 ports, ipst)];
2513
2514 mutex_enter(&connfp->connf_lock);
2515 for (tconnp = connfp->connf_head; tconnp != NULL;
2516 tconnp = tconnp->conn_next) {
2517
2518 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2519 ipha->ipha_dst, ipha->ipha_src, ports) &&
2520 tconnp->conn_tcp->tcp_state >= min_state) {
2521
2522 CONN_INC_REF(tconnp);
2523 mutex_exit(&connfp->connf_lock);
2524 return (tconnp);
2525 }
2526 }
2527 mutex_exit(&connfp->connf_lock);
2528 return (NULL);
2529 }
2530
2531 /*
2532 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2533 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2534 * Only checks for connected entries i.e. no INADDR_ANY checks.
2535 * Match on ifindex in addition to addresses.
2536 */
2537 conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t * ip6h,tcpha_t * tcpha,int min_state,uint_t ifindex,ip_stack_t * ipst)2538 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2539 uint_t ifindex, ip_stack_t *ipst)
2540 {
2541 tcp_t *tcp;
2542 uint32_t ports;
2543 uint16_t *pports;
2544 connf_t *connfp;
2545 conn_t *tconnp;
2546
2547 pports = (uint16_t *)&ports;
2548 pports[0] = tcpha->tha_fport;
2549 pports[1] = tcpha->tha_lport;
2550
2551 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2552 ports, ipst)];
2553
2554 mutex_enter(&connfp->connf_lock);
2555 for (tconnp = connfp->connf_head; tconnp != NULL;
2556 tconnp = tconnp->conn_next) {
2557
2558 tcp = tconnp->conn_tcp;
2559 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2560 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2561 tcp->tcp_state >= min_state &&
2562 (tconnp->conn_bound_if == 0 ||
2563 tconnp->conn_bound_if == ifindex)) {
2564
2565 CONN_INC_REF(tconnp);
2566 mutex_exit(&connfp->connf_lock);
2567 return (tconnp);
2568 }
2569 }
2570 mutex_exit(&connfp->connf_lock);
2571 return (NULL);
2572 }
2573
2574 /*
2575 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2576 * a listener when changing state.
2577 */
2578 conn_t *
ipcl_lookup_listener_v4(uint16_t lport,ipaddr_t laddr,zoneid_t zoneid,ip_stack_t * ipst)2579 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2580 ip_stack_t *ipst)
2581 {
2582 connf_t *bind_connfp;
2583 conn_t *connp;
2584 tcp_t *tcp;
2585
2586 /*
2587 * Avoid false matches for packets sent to an IP destination of
2588 * all zeros.
2589 */
2590 if (laddr == 0)
2591 return (NULL);
2592
2593 ASSERT(zoneid != ALL_ZONES);
2594
2595 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2596 mutex_enter(&bind_connfp->connf_lock);
2597 for (connp = bind_connfp->connf_head; connp != NULL;
2598 connp = connp->conn_next) {
2599 tcp = connp->conn_tcp;
2600 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2601 IPCL_ZONE_MATCH(connp, zoneid) &&
2602 (tcp->tcp_listener == NULL)) {
2603 CONN_INC_REF(connp);
2604 mutex_exit(&bind_connfp->connf_lock);
2605 return (connp);
2606 }
2607 }
2608 mutex_exit(&bind_connfp->connf_lock);
2609 return (NULL);
2610 }
2611
2612 /*
2613 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2614 * a listener when changing state.
2615 */
2616 conn_t *
ipcl_lookup_listener_v6(uint16_t lport,in6_addr_t * laddr,uint_t ifindex,zoneid_t zoneid,ip_stack_t * ipst)2617 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2618 zoneid_t zoneid, ip_stack_t *ipst)
2619 {
2620 connf_t *bind_connfp;
2621 conn_t *connp = NULL;
2622 tcp_t *tcp;
2623
2624 /*
2625 * Avoid false matches for packets sent to an IP destination of
2626 * all zeros.
2627 */
2628 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2629 return (NULL);
2630
2631 ASSERT(zoneid != ALL_ZONES);
2632
2633 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2634 mutex_enter(&bind_connfp->connf_lock);
2635 for (connp = bind_connfp->connf_head; connp != NULL;
2636 connp = connp->conn_next) {
2637 tcp = connp->conn_tcp;
2638 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2639 IPCL_ZONE_MATCH(connp, zoneid) &&
2640 (connp->conn_bound_if == 0 ||
2641 connp->conn_bound_if == ifindex) &&
2642 tcp->tcp_listener == NULL) {
2643 CONN_INC_REF(connp);
2644 mutex_exit(&bind_connfp->connf_lock);
2645 return (connp);
2646 }
2647 }
2648 mutex_exit(&bind_connfp->connf_lock);
2649 return (NULL);
2650 }
2651
2652 /*
2653 * ipcl_get_next_conn
2654 * get the next entry in the conn global list
2655 * and put a reference on the next_conn.
2656 * decrement the reference on the current conn.
2657 *
2658 * This is an iterator based walker function that also provides for
2659 * some selection by the caller. It walks through the conn_hash bucket
2660 * searching for the next valid connp in the list, and selects connections
2661 * that are neither closed nor condemned. It also REFHOLDS the conn
2662 * thus ensuring that the conn exists when the caller uses the conn.
2663 */
2664 conn_t *
ipcl_get_next_conn(connf_t * connfp,conn_t * connp,uint32_t conn_flags)2665 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2666 {
2667 conn_t *next_connp;
2668
2669 if (connfp == NULL)
2670 return (NULL);
2671
2672 mutex_enter(&connfp->connf_lock);
2673
2674 next_connp = (connp == NULL) ?
2675 connfp->connf_head : connp->conn_g_next;
2676
2677 while (next_connp != NULL) {
2678 mutex_enter(&next_connp->conn_lock);
2679 if (!(next_connp->conn_flags & conn_flags) ||
2680 (next_connp->conn_state_flags &
2681 (CONN_CONDEMNED | CONN_INCIPIENT))) {
2682 /*
2683 * This conn has been condemned or
2684 * is closing, or the flags don't match
2685 */
2686 mutex_exit(&next_connp->conn_lock);
2687 next_connp = next_connp->conn_g_next;
2688 continue;
2689 }
2690 CONN_INC_REF_LOCKED(next_connp);
2691 mutex_exit(&next_connp->conn_lock);
2692 break;
2693 }
2694
2695 mutex_exit(&connfp->connf_lock);
2696
2697 if (connp != NULL)
2698 CONN_DEC_REF(connp);
2699
2700 return (next_connp);
2701 }
2702
2703 #ifdef CONN_DEBUG
2704 /*
2705 * Trace of the last NBUF refhold/refrele
2706 */
2707 int
conn_trace_ref(conn_t * connp)2708 conn_trace_ref(conn_t *connp)
2709 {
2710 int last;
2711 conn_trace_t *ctb;
2712
2713 ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 last = connp->conn_trace_last;
2715 last++;
2716 if (last == CONN_TRACE_MAX)
2717 last = 0;
2718
2719 ctb = &connp->conn_trace_buf[last];
2720 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 connp->conn_trace_last = last;
2722 return (1);
2723 }
2724
2725 int
conn_untrace_ref(conn_t * connp)2726 conn_untrace_ref(conn_t *connp)
2727 {
2728 int last;
2729 conn_trace_t *ctb;
2730
2731 ASSERT(MUTEX_HELD(&connp->conn_lock));
2732 last = connp->conn_trace_last;
2733 last++;
2734 if (last == CONN_TRACE_MAX)
2735 last = 0;
2736
2737 ctb = &connp->conn_trace_buf[last];
2738 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2739 connp->conn_trace_last = last;
2740 return (1);
2741 }
2742 #endif
2743
/*
 * Fill in *sie with information about the socket underlying connp for
 * MIB reporting: the vnode's inode number and device, plus a flag noting
 * whether the conn is a STREAMS socket.
 *
 * Returns sie on success, or NULL if the conn is closing, has no
 * associated vnode, or the vnode's attributes cannot be obtained.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;
	sock_upcalls_t *upcalls;
	sock_upper_handle_t upper_handle;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	/*
	 * Continue to hold conn_lock because we don't want to race with an
	 * in-progress close, which will have set-to-NULL (and destroyed
	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
	 *
	 * There is still a race with an in-progress OPEN, however, where
	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
	 * codepaths) WITHOUT conn_lock being held. We address that race
	 * HERE, however, given that both are going from NULL to non-NULL,
	 * if we lose the race, we don't get any data for the in-progress-OPEN
	 * socket.
	 */

	upcalls = connp->conn_upcalls;
	upper_handle = connp->conn_upper_handle;
	/* Check BOTH for non-NULL before attempting an upcall. */
	if (upper_handle != NULL && upcalls != NULL) {
		/* su_get_vnode() returns one with VN_HOLD() already done. */
		vn = upcalls->su_get_vnode(upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		/* STREAMS socket: use the stream head's private vnode. */
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	mutex_exit(&connp->conn_lock);

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}
2810