xref: /illumos-gate/usr/src/uts/common/inet/ip/ipclassifier.c (revision c0586b874d9179e81ca8a124fa6caf98fddb7696)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24  * Copyright 2022 Joyent, Inc.
25  */
26 
27 /*
28  * IP PACKET CLASSIFIER
29  *
30  * The IP packet classifier provides mapping between IP packets and persistent
31  * connection state for connection-oriented protocols. It also provides
32  * interface for managing connection states.
33  *
34  * The connection state is kept in conn_t data structure and contains, among
35  * other things:
36  *
37  *	o local/remote address and ports
38  *	o Transport protocol
39  *	o squeue for the connection (for TCP only)
40  *	o reference counter
41  *	o Connection state
42  *	o hash table linkage
43  *	o interface/ire information
44  *	o credentials
45  *	o ipsec policy
46  *	o send and receive functions.
47  *	o mutex lock.
48  *
49  * Connections use a reference counting scheme. They are freed when the
50  * reference counter drops to zero. A reference is incremented when connection
51  * is placed in a list or table, when incoming packet for the connection arrives
52  * and when connection is processed via squeue (squeue processing may be
53  * asynchronous and the reference protects the connection from being destroyed
54  * before its processing is finished).
55  *
56  * conn_recv is used to pass up packets to the ULP.
57  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
58  * a listener, and changes to tcp_input_listener as the listener has picked a
59  * good squeue. For other cases it is set to tcp_input_data.
60  *
61  * conn_recvicmp is used to pass up ICMP errors to the ULP.
62  *
63  * Classifier uses several hash tables:
64  *
65  *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
66  *	ipcl_bind_fanout:	contains all connections in BOUND state
67  *	ipcl_proto_fanout:	IPv4 protocol fanout
68  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
69  *	ipcl_udp_fanout:	contains all UDP connections
70  *	ipcl_iptun_fanout:	contains all IP tunnel connections
71  *	ipcl_globalhash_fanout:	contains all connections
72  *
73  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
74  * which need to view all existing connections.
75  *
76  * All tables are protected by per-bucket locks. When both per-bucket lock and
77  * connection lock need to be held, the per-bucket lock should be acquired
78  * first, followed by the connection lock.
79  *
80  * All functions doing search in one of these tables increment a reference
81  * counter on the connection found (if any). This reference should be dropped
82  * when the caller has finished processing the connection.
83  *
84  *
85  * INTERFACES:
86  * ===========
87  *
88  * Connection Lookup:
89  * ------------------
90  *
91  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
92  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
93  *
94  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
95  * it can't find any associated connection. If the connection is found, its
96  * reference counter is incremented.
97  *
98  *	mp:	mblock, containing packet header. The full header should fit
99  *		into a single mblock. It should also contain at least full IP
100  *		and TCP or UDP header.
101  *
102  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
103  *
104  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
105  *		 the packet.
106  *
107  *	ira->ira_zoneid: The zone in which the returned connection must be; the
108  *		zoneid corresponding to the ire_zoneid on the IRE located for
109  *		the packet's destination address.
110  *
111  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
112  *		IRAF_TX_SHARED_ADDR flags
113  *
114  *	For TCP connections, the lookup order is as follows:
115  *		5-tuple {src, dst, protocol, local port, remote port}
116  *			lookup in ipcl_conn_fanout table.
117  *		3-tuple {dst, remote port, protocol} lookup in
118  *			ipcl_bind_fanout table.
119  *
120  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
121  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
123  *	to multiple UDP clients, which is handled in IP itself.
124  *
125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
126  * determine which actual zone gets the segment.  This is used only in a
127  * labeled environment.  The matching rules are:
128  *
129  *	- If it's not a multilevel port, then the label on the packet selects
130  *	  the zone.  Unlabeled packets are delivered to the global zone.
131  *
132  *	- If it's a multilevel port, then only the zone registered to receive
133  *	  packets on that port matches.
134  *
135  * Also, in a labeled environment, packet labels need to be checked.  For fully
136  * bound TCP connections, we can assume that the packet label was checked
137  * during connection establishment, and doesn't need to be checked on each
138  * packet.  For others, though, we need to check for strict equality or, for
139  * multilevel ports, membership in the range or set.  This part currently does
140  * a tnrh lookup on each packet, but could be optimized to use cached results
141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
142  * we would apply the same rules as TCP.)
143  *
144  * An implication of the above is that fully-bound TCP sockets must always use
145  * distinct 4-tuples; they can't be discriminated by label alone.
146  *
147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
148  * as there's no connection set-up handshake and no shared state.
149  *
150  * Labels on looped-back packets within a single zone do not need to be
151  * checked, as all processes in the same zone have the same label.
152  *
153  * Finally, for unlabeled packets received by a labeled system, special rules
154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
155  * socket in the zone whose label matches the default label of the sender, if
156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
157  * receiver's label must dominate the sender's default label.
158  *
159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161  *					 ip_stack);
162  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
165  *	ports are read from the IP and TCP header respectively.
166  *
167  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
168  *					 zoneid, ip_stack);
169  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170  *					 zoneid, ip_stack);
171  *
172  *	Lookup routine to find a listener with the tuple {lport, laddr,
173  *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174  *	parameter interface index is also compared.
175  *
176  * void ipcl_walk(func, arg, ip_stack)
177  *
178  *	Apply 'func' to every connection available. The 'func' is called as
179  *	(*func)(connp, arg). The walk is non-atomic so connections may be
180  *	created and destroyed during the walk. The CONN_CONDEMNED and
181  *	CONN_INCIPIENT flags ensure that connections which are newly created
182  *	or being destroyed are not selected by the walker.
183  *
184  * Table Updates
185  * -------------
186  *
187  * int ipcl_conn_insert(connp);
188  * int ipcl_conn_insert_v4(connp);
189  * int ipcl_conn_insert_v6(connp);
190  *
191  *	Insert 'connp' in the ipcl_conn_fanout.
192  *	Arguments :
193  *		connp		conn_t to be inserted
194  *
195  *	Return value :
196  *		0		if connp was inserted
197  *		EADDRINUSE	if the connection with the same tuple
198  *				already exists.
199  *
200  * int ipcl_bind_insert(connp);
201  * int ipcl_bind_insert_v4(connp);
202  * int ipcl_bind_insert_v6(connp);
203  *
204  *	Insert 'connp' in ipcl_bind_fanout.
205  *	Arguments :
206  *		connp		conn_t to be inserted
207  *
208  *
209  * void ipcl_hash_remove(connp);
210  *
211  *	Removes the 'connp' from the connection fanout table.
212  *
213  * Connection Creation/Destruction
214  * -------------------------------
215  *
216  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
217  *
218  *	Creates a new conn based on the type flag, inserts it into
219  *	globalhash table.
220  *
221  *	type:	This flag determines the type of conn_t which needs to be
222  *		created i.e., which kmem_cache it comes from.
223  *		IPCL_TCPCONN	indicates a TCP connection
224  *		IPCL_SCTPCONN	indicates a SCTP connection
225  *		IPCL_UDPCONN	indicates a UDP conn_t.
226  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
227  *		IPCL_RTSCONN	indicates a RTS conn_t.
228  *		IPCL_IPCCONN	indicates all other connections.
229  *
230  * void ipcl_conn_destroy(connp)
231  *
232  *	Destroys the connection state, removes it from the global
233  *	connection hash table and frees its memory.
234  */
235 
236 #include <sys/types.h>
237 #include <sys/stream.h>
238 #include <sys/stropts.h>
239 #include <sys/sysmacros.h>
240 #include <sys/strsubr.h>
241 #include <sys/strsun.h>
242 #define	_SUN_TPI_VERSION 2
243 #include <sys/ddi.h>
244 #include <sys/cmn_err.h>
245 #include <sys/debug.h>
246 
247 #include <sys/systm.h>
248 #include <sys/param.h>
249 #include <sys/kmem.h>
250 #include <sys/isa_defs.h>
251 #include <inet/common.h>
252 #include <netinet/ip6.h>
253 #include <netinet/icmp6.h>
254 
255 #include <inet/ip.h>
256 #include <inet/ip_if.h>
257 #include <inet/ip_ire.h>
258 #include <inet/ip6.h>
259 #include <inet/ip_ndp.h>
260 #include <inet/ip_impl.h>
261 #include <inet/udp_impl.h>
262 #include <inet/sctp_ip.h>
263 #include <inet/sctp/sctp_impl.h>
264 #include <inet/rawip_impl.h>
265 #include <inet/rts_impl.h>
266 #include <inet/iptun/iptun_impl.h>
267 
268 #include <sys/cpuvar.h>
269 
270 #include <inet/ipclassifier.h>
271 #include <inet/tcp.h>
272 #include <inet/ipsec_impl.h>
273 
274 #include <sys/tsol/tnet.h>
275 #include <sys/sockio.h>
276 
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/*
 * New value. Zero means choose automatically.  Settable in /etc/system.
 *
 * When both of the sizes above are zero, ipcl_init() derives the size of the
 * conn fanout from (freemem * PAGESIZE) / ipcl_conn_hash_memfactor and caps
 * it at ipcl_conn_hash_maxsize before rounding it to an entry of P2Ps().
 */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size; copied into the ip_stack_t by ipcl_init() */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The entries for N < 3 are 0, and the list ends with an extra 0 entry.
 * ipcl_init() starts its search at index 9 and treats a selected value of 0
 * as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
308 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * The union is as large as the bigger of a conn_t and the filler array, whose
 * length is CACHE_ALIGN(conn_s).  ipcl_g_init() sizes the TCP, UDP, raw IP
 * and RTS cache objects as sizeof (itc_t) plus the protocol structure.
 */
typedef union itc_s {
	conn_t	itc_conn;	/* the connection state proper */
	char	itcu_filler[CACHE_ALIGN(conn_s)];	/* padding only */
} itc_t;
317 
/*
 * kmem caches backing the different kinds of conn_t.  All but
 * sctp_conn_cache are created in ipcl_g_init() and destroyed in
 * ipcl_g_destroy(); sctp_conn_cache is only declared extern here, so it is
 * presumably owned by the SCTP code.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk helpers, defined outside this section of the file. */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/*
 * Constructor/destructor pairs handed to kmem_cache_create() in
 * ipcl_g_init(), one pair per cache.
 */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);
342 
343 /*
344  * Global (for all stack instances) init routine
345  */
346 void
347 ipcl_g_init(void)
348 {
349 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
350 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
351 	    ip_conn_constructor, ip_conn_destructor,
352 	    NULL, NULL, NULL, 0);
353 
354 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
355 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
356 	    tcp_conn_constructor, tcp_conn_destructor,
357 	    tcp_conn_reclaim, NULL, NULL, 0);
358 
359 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
360 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
361 	    udp_conn_constructor, udp_conn_destructor,
362 	    NULL, NULL, NULL, 0);
363 
364 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
365 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
366 	    rawip_conn_constructor, rawip_conn_destructor,
367 	    NULL, NULL, NULL, 0);
368 
369 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
370 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
371 	    rts_conn_constructor, rts_conn_destructor,
372 	    NULL, NULL, NULL, 0);
373 }
374 
375 /*
376  * ipclassifier intialization routine, sets up hash tables.
377  */
378 void
379 ipcl_init(ip_stack_t *ipst)
380 {
381 	int i;
382 	int sizes[] = P2Ps();
383 
384 	/*
385 	 * Calculate size of conn fanout table from /etc/system settings
386 	 */
387 	if (ipcl_conn_hash_size != 0) {
388 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
389 	} else if (tcp_conn_hash_size != 0) {
390 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
391 	} else {
392 		extern pgcnt_t freemem;
393 
394 		ipst->ips_ipcl_conn_fanout_size =
395 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
396 
397 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
398 			ipst->ips_ipcl_conn_fanout_size =
399 			    ipcl_conn_hash_maxsize;
400 		}
401 	}
402 
403 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
404 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
405 			break;
406 		}
407 	}
408 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
409 		/* Out of range, use the 2^16 value */
410 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
411 	}
412 
413 	/* Take values from /etc/system */
414 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
415 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
416 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
417 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
418 
419 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
420 
421 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
422 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
423 
424 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
425 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
426 		    MUTEX_DEFAULT, NULL);
427 	}
428 
429 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
430 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
431 
432 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
433 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
434 		    MUTEX_DEFAULT, NULL);
435 	}
436 
437 	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
438 	    sizeof (connf_t), KM_SLEEP);
439 	for (i = 0; i < IPPROTO_MAX; i++) {
440 		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
441 		    MUTEX_DEFAULT, NULL);
442 	}
443 
444 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
445 	    sizeof (connf_t), KM_SLEEP);
446 	for (i = 0; i < IPPROTO_MAX; i++) {
447 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
448 		    MUTEX_DEFAULT, NULL);
449 	}
450 
451 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
452 	mutex_init(&ipst->ips_rts_clients->connf_lock,
453 	    NULL, MUTEX_DEFAULT, NULL);
454 
455 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
456 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
457 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
458 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
459 		    MUTEX_DEFAULT, NULL);
460 	}
461 
462 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
463 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
464 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
465 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
466 		    MUTEX_DEFAULT, NULL);
467 	}
468 
469 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
470 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
471 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
472 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
473 		    MUTEX_DEFAULT, NULL);
474 	}
475 
476 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
477 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
478 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
479 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
480 		    NULL, MUTEX_DEFAULT, NULL);
481 	}
482 }
483 
484 void
485 ipcl_g_destroy(void)
486 {
487 	kmem_cache_destroy(ip_conn_cache);
488 	kmem_cache_destroy(tcp_conn_cache);
489 	kmem_cache_destroy(udp_conn_cache);
490 	kmem_cache_destroy(rawip_conn_cache);
491 	kmem_cache_destroy(rts_conn_cache);
492 }
493 
494 /*
495  * All user-level and kernel use of the stack must be gone
496  * by now.
497  */
498 void
499 ipcl_destroy(ip_stack_t *ipst)
500 {
501 	int i;
502 
503 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
504 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
505 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
506 	}
507 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
508 	    sizeof (connf_t));
509 	ipst->ips_ipcl_conn_fanout = NULL;
510 
511 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
512 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
513 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
514 	}
515 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
516 	    sizeof (connf_t));
517 	ipst->ips_ipcl_bind_fanout = NULL;
518 
519 	for (i = 0; i < IPPROTO_MAX; i++) {
520 		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
521 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
522 	}
523 	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
524 	    IPPROTO_MAX * sizeof (connf_t));
525 	ipst->ips_ipcl_proto_fanout_v4 = NULL;
526 
527 	for (i = 0; i < IPPROTO_MAX; i++) {
528 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
529 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
530 	}
531 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
532 	    IPPROTO_MAX * sizeof (connf_t));
533 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
534 
535 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
536 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
537 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
538 	}
539 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
540 	    sizeof (connf_t));
541 	ipst->ips_ipcl_udp_fanout = NULL;
542 
543 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
544 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
545 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
546 	}
547 	kmem_free(ipst->ips_ipcl_iptun_fanout,
548 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
549 	ipst->ips_ipcl_iptun_fanout = NULL;
550 
551 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
552 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
553 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
554 	}
555 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
556 	    sizeof (connf_t));
557 	ipst->ips_ipcl_raw_fanout = NULL;
558 
559 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
560 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
561 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
562 	}
563 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
564 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
565 	ipst->ips_ipcl_globalhash_fanout = NULL;
566 
567 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
568 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
569 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
570 	ipst->ips_rts_clients = NULL;
571 }
572 
573 /*
574  * conn creation routine. initialize the conn, sets the reference
575  * and inserts it in the global hash table.
576  */
577 conn_t *
578 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
579 {
580 	conn_t	*connp;
581 	struct kmem_cache *conn_cache;
582 
583 	switch (type) {
584 	case IPCL_SCTPCONN:
585 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
586 			return (NULL);
587 		sctp_conn_init(connp);
588 		netstack_hold(ns);
589 		connp->conn_netstack = ns;
590 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
591 		connp->conn_ixa->ixa_conn_id = (long)connp;
592 		ipcl_globalhash_insert(connp);
593 		return (connp);
594 
595 	case IPCL_TCPCONN:
596 		conn_cache = tcp_conn_cache;
597 		break;
598 
599 	case IPCL_UDPCONN:
600 		conn_cache = udp_conn_cache;
601 		break;
602 
603 	case IPCL_RAWIPCONN:
604 		conn_cache = rawip_conn_cache;
605 		break;
606 
607 	case IPCL_RTSCONN:
608 		conn_cache = rts_conn_cache;
609 		break;
610 
611 	case IPCL_IPCCONN:
612 		conn_cache = ip_conn_cache;
613 		break;
614 
615 	default:
616 		conn_cache = NULL;
617 		connp = NULL;
618 		ASSERT(0);
619 	}
620 
621 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
622 		return (NULL);
623 
624 	connp->conn_ref = 1;
625 	netstack_hold(ns);
626 	connp->conn_netstack = ns;
627 	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
628 	connp->conn_ixa->ixa_conn_id = (long)connp;
629 	ipcl_globalhash_insert(connp);
630 	return (connp);
631 }
632 
633 void
634 ipcl_conn_destroy(conn_t *connp)
635 {
636 	mblk_t	*mp;
637 	netstack_t	*ns = connp->conn_netstack;
638 
639 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
640 	ASSERT(connp->conn_ref == 0);
641 	ASSERT(connp->conn_ioctlref == 0);
642 
643 	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
644 
645 	if (connp->conn_cred != NULL) {
646 		crfree(connp->conn_cred);
647 		connp->conn_cred = NULL;
648 		/* ixa_cred done in ipcl_conn_cleanup below */
649 	}
650 
651 	if (connp->conn_ht_iphc != NULL) {
652 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
653 		connp->conn_ht_iphc = NULL;
654 		connp->conn_ht_iphc_allocated = 0;
655 		connp->conn_ht_iphc_len = 0;
656 		connp->conn_ht_ulp = NULL;
657 		connp->conn_ht_ulp_len = 0;
658 	}
659 	ip_pkt_free(&connp->conn_xmit_ipp);
660 
661 	ipcl_globalhash_remove(connp);
662 
663 	if (connp->conn_latch != NULL) {
664 		IPLATCH_REFRELE(connp->conn_latch);
665 		connp->conn_latch = NULL;
666 	}
667 	if (connp->conn_latch_in_policy != NULL) {
668 		IPPOL_REFRELE(connp->conn_latch_in_policy);
669 		connp->conn_latch_in_policy = NULL;
670 	}
671 	if (connp->conn_latch_in_action != NULL) {
672 		IPACT_REFRELE(connp->conn_latch_in_action);
673 		connp->conn_latch_in_action = NULL;
674 	}
675 	if (connp->conn_policy != NULL) {
676 		IPPH_REFRELE(connp->conn_policy, ns);
677 		connp->conn_policy = NULL;
678 	}
679 
680 	if (connp->conn_ipsec_opt_mp != NULL) {
681 		freemsg(connp->conn_ipsec_opt_mp);
682 		connp->conn_ipsec_opt_mp = NULL;
683 	}
684 
685 	if (connp->conn_flags & IPCL_TCPCONN) {
686 		tcp_t *tcp = connp->conn_tcp;
687 
688 		tcp_free(tcp);
689 		mp = tcp->tcp_timercache;
690 
691 		tcp->tcp_tcps = NULL;
692 
693 		/*
694 		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
695 		 * the mblk.
696 		 */
697 		if (tcp->tcp_rsrv_mp != NULL) {
698 			freeb(tcp->tcp_rsrv_mp);
699 			tcp->tcp_rsrv_mp = NULL;
700 			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
701 		}
702 
703 		ipcl_conn_cleanup(connp);
704 		connp->conn_flags = IPCL_TCPCONN;
705 		if (ns != NULL) {
706 			ASSERT(tcp->tcp_tcps == NULL);
707 			connp->conn_netstack = NULL;
708 			connp->conn_ixa->ixa_ipst = NULL;
709 			netstack_rele(ns);
710 		}
711 
712 		bzero(tcp, sizeof (tcp_t));
713 
714 		tcp->tcp_timercache = mp;
715 		tcp->tcp_connp = connp;
716 		kmem_cache_free(tcp_conn_cache, connp);
717 		return;
718 	}
719 
720 	if (connp->conn_flags & IPCL_SCTPCONN) {
721 		ASSERT(ns != NULL);
722 		sctp_free(connp);
723 		return;
724 	}
725 
726 	ipcl_conn_cleanup(connp);
727 	if (ns != NULL) {
728 		connp->conn_netstack = NULL;
729 		connp->conn_ixa->ixa_ipst = NULL;
730 		netstack_rele(ns);
731 	}
732 
733 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
734 	if (connp->conn_flags & IPCL_UDPCONN) {
735 		connp->conn_flags = IPCL_UDPCONN;
736 		kmem_cache_free(udp_conn_cache, connp);
737 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
738 		connp->conn_flags = IPCL_RAWIPCONN;
739 		connp->conn_proto = IPPROTO_ICMP;
740 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
741 		kmem_cache_free(rawip_conn_cache, connp);
742 	} else if (connp->conn_flags & IPCL_RTSCONN) {
743 		connp->conn_flags = IPCL_RTSCONN;
744 		kmem_cache_free(rts_conn_cache, connp);
745 	} else {
746 		connp->conn_flags = IPCL_IPCCONN;
747 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
748 		ASSERT(connp->conn_priv == NULL);
749 		kmem_cache_free(ip_conn_cache, connp);
750 	}
751 }
752 
753 /*
754  * Running in cluster mode - deregister listener information
755  */
756 static void
757 ipcl_conn_unlisten(conn_t *connp)
758 {
759 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
760 	ASSERT(connp->conn_lport != 0);
761 
762 	if (cl_inet_unlisten != NULL) {
763 		sa_family_t	addr_family;
764 		uint8_t		*laddrp;
765 
766 		if (connp->conn_ipversion == IPV6_VERSION) {
767 			addr_family = AF_INET6;
768 			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
769 		} else {
770 			addr_family = AF_INET;
771 			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
772 		}
773 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
774 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
775 	}
776 	connp->conn_flags &= ~IPCL_CL_LISTENER;
777 }
778 
/*
 * Unlink 'connp' from the fanout bucket recorded in conn_fanout, if any.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * The caller must not hold conn_lock (checked with ASSERT).  With the bucket
 * lock held, the macro unlinks the conn, clears conn_fanout, conn_next and
 * conn_prev, calls ipcl_conn_unlisten() if IPCL_CL_LISTENER is set, and drops
 * one reference with CONN_DEC_REF() before releasing the bucket lock.
 * Nothing happens when conn_fanout is NULL.
 *
 * Note that the macro expands to a brace-enclosed block that declares a local
 * named 'connfp', and that it evaluates 'connp' several times.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
807 
808 void
809 ipcl_hash_remove(conn_t *connp)
810 {
811 	uint8_t		protocol = connp->conn_proto;
812 
813 	IPCL_HASH_REMOVE(connp);
814 	if (protocol == IPPROTO_RSVP)
815 		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
816 }
817 
818 /*
819  * The whole purpose of this function is allow removal of
820  * a conn_t from the connected hash for timewait reclaim.
821  * This is essentially a TW reclaim fastpath where timewait
822  * collector checks under fanout lock (so no one else can
823  * get access to the conn_t) that refcnt is 2 i.e. one for
824  * TCP and one for the classifier hash list. If ref count
825  * is indeed 2, we can just remove the conn under lock and
826  * avoid cleaning up the conn under squeue. This gives us
827  * improved performance.
828  */
829 void
830 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
831 {
832 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
833 	ASSERT(MUTEX_HELD(&connp->conn_lock));
834 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
835 
836 	if ((connp)->conn_next != NULL) {
837 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
838 	}
839 	if ((connp)->conn_prev != NULL) {
840 		(connp)->conn_prev->conn_next = (connp)->conn_next;
841 	} else {
842 		connfp->connf_head = (connp)->conn_next;
843 	}
844 	(connp)->conn_fanout = NULL;
845 	(connp)->conn_next = NULL;
846 	(connp)->conn_prev = NULL;
847 	(connp)->conn_flags |= IPCL_REMOVED;
848 	ASSERT((connp)->conn_ref == 2);
849 	(connp)->conn_ref--;
850 }
851 
/*
 * Insert 'connp' at the head of bucket 'connfp'.  The macro itself takes no
 * lock; IPCL_HASH_INSERT_CONNECTED() below wraps it with the bucket lock.
 * The conn must not be on any list (conn_fanout, conn_next and conn_prev all
 * NULL, checked with ASSERT).  IPCL_REMOVED is cleared, IPCL_CONNECTED is
 * set, and a reference is taken with CONN_INC_REF().
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
866 
/*
 * Move 'connp' to the head of bucket 'connfp': first remove it from whatever
 * bucket it is on (IPCL_HASH_REMOVE), then insert it under the lock of the
 * new bucket.  The two steps are done under separate lock acquisitions.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
873 
/*
 * Move 'connp' into bucket 'connfp' as a bound conn.
 *
 * After removing the conn from its current bucket, the list is walked from
 * the head until the first entry for which _IPCL_V4_MATCH_ANY(conn_laddr_v6)
 * is true; 'connp' is linked in immediately before that entry, or at the tail
 * when there is none.  IPCL_REMOVED is cleared, IPCL_BOUND is set and a
 * reference is taken with CONN_INC_REF(), all under the bucket lock.
 *
 * The expansion declares locals named 'pconnp' and 'nconnp'.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
900 
/*
 * Insert connp into connfp's chain as a wildcard-bound conn.
 *
 * After IPCL_HASH_REMOVE, and under connf_lock, the chain is walked from the
 * head.  If connp's local address is v4-mapped and an entry with an
 * unspecified (::) local address in the same zone is found, connp is linked
 * in directly ahead of that entry.  In every other case connp is appended at
 * the tail.  The conn is then flagged IPCL_BOUND (IPCL_REMOVED cleared) and
 * its reference count is bumped.
 *
 * Throughout the walk "list" addresses the pointer that must be redirected
 * to connp (connf_head or the predecessor's conn_next) and "prev" is that
 * predecessor (NULL at the head), so the final two link assignments are
 * correct for both the insert-before and the append case.
 *
 * Every use of the connp argument is parenthesized so that non-trivial
 * argument expressions expand correctly.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			/* prev already equals next->conn_prev here */	\
			(connp)->conn_next = next;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
930 
931 void
932 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
933 {
934 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
935 }
936 
937 /*
938  * Because the classifier is used to classify inbound packets, the destination
939  * address is meant to be our local tunnel address (tunnel source), and the
940  * source the remote tunnel address (tunnel destination).
941  *
942  * Note that conn_proto can't be used for fanout since the upper protocol
943  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
944  */
945 conn_t *
946 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
947 {
948 	connf_t	*connfp;
949 	conn_t	*connp;
950 
951 	/* first look for IPv4 tunnel links */
952 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
953 	mutex_enter(&connfp->connf_lock);
954 	for (connp = connfp->connf_head; connp != NULL;
955 	    connp = connp->conn_next) {
956 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
957 			break;
958 	}
959 	if (connp != NULL)
960 		goto done;
961 
962 	mutex_exit(&connfp->connf_lock);
963 
964 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
965 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
966 	    INADDR_ANY)];
967 	mutex_enter(&connfp->connf_lock);
968 	for (connp = connfp->connf_head; connp != NULL;
969 	    connp = connp->conn_next) {
970 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
971 			break;
972 	}
973 done:
974 	if (connp != NULL)
975 		CONN_INC_REF(connp);
976 	mutex_exit(&connfp->connf_lock);
977 	return (connp);
978 }
979 
980 conn_t *
981 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
982 {
983 	connf_t	*connfp;
984 	conn_t	*connp;
985 
986 	/* Look for an IPv6 tunnel link */
987 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
988 	mutex_enter(&connfp->connf_lock);
989 	for (connp = connfp->connf_head; connp != NULL;
990 	    connp = connp->conn_next) {
991 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
992 			CONN_INC_REF(connp);
993 			break;
994 		}
995 	}
996 	mutex_exit(&connfp->connf_lock);
997 	return (connp);
998 }
999 
1000 /*
1001  * This function is used only for inserting SCTP raw socket now.
1002  * This may change later.
1003  *
1004  * Note that only one raw socket can be bound to a port.  The param
1005  * lport is in network byte order.
1006  */
1007 static int
1008 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1009 {
1010 	connf_t	*connfp;
1011 	conn_t	*oconnp;
1012 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1013 
1014 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1015 
1016 	/* Check for existing raw socket already bound to the port. */
1017 	mutex_enter(&connfp->connf_lock);
1018 	for (oconnp = connfp->connf_head; oconnp != NULL;
1019 	    oconnp = oconnp->conn_next) {
1020 		if (oconnp->conn_lport == lport &&
1021 		    oconnp->conn_zoneid == connp->conn_zoneid &&
1022 		    oconnp->conn_family == connp->conn_family &&
1023 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1024 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1025 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1026 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1027 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1028 		    &connp->conn_laddr_v6))) {
1029 			break;
1030 		}
1031 	}
1032 	mutex_exit(&connfp->connf_lock);
1033 	if (oconnp != NULL)
1034 		return (EADDRNOTAVAIL);
1035 
1036 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1037 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1038 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1039 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1040 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1041 		} else {
1042 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1043 		}
1044 	} else {
1045 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1046 	}
1047 	return (0);
1048 }
1049 
1050 static int
1051 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1052 {
1053 	connf_t	*connfp;
1054 	conn_t	*tconnp;
1055 	ipaddr_t laddr = connp->conn_laddr_v4;
1056 	ipaddr_t faddr = connp->conn_faddr_v4;
1057 
1058 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1059 	mutex_enter(&connfp->connf_lock);
1060 	for (tconnp = connfp->connf_head; tconnp != NULL;
1061 	    tconnp = tconnp->conn_next) {
1062 		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1063 			/* A tunnel is already bound to these addresses. */
1064 			mutex_exit(&connfp->connf_lock);
1065 			return (EADDRINUSE);
1066 		}
1067 	}
1068 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1069 	mutex_exit(&connfp->connf_lock);
1070 	return (0);
1071 }
1072 
1073 static int
1074 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1075 {
1076 	connf_t	*connfp;
1077 	conn_t	*tconnp;
1078 	in6_addr_t *laddr = &connp->conn_laddr_v6;
1079 	in6_addr_t *faddr = &connp->conn_faddr_v6;
1080 
1081 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1082 	mutex_enter(&connfp->connf_lock);
1083 	for (tconnp = connfp->connf_head; tconnp != NULL;
1084 	    tconnp = tconnp->conn_next) {
1085 		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1086 			/* A tunnel is already bound to these addresses. */
1087 			mutex_exit(&connfp->connf_lock);
1088 			return (EADDRINUSE);
1089 		}
1090 	}
1091 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1092 	mutex_exit(&connfp->connf_lock);
1093 	return (0);
1094 }
1095 
1096 /*
1097  * Check for a MAC exemption conflict on a labeled system.  Note that for
1098  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1099  * transport layer.  This check is for binding all other protocols.
1100  *
1101  * Returns true if there's a conflict.
1102  */
1103 static boolean_t
1104 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1105 {
1106 	connf_t	*connfp;
1107 	conn_t *tconn;
1108 
1109 	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1110 	mutex_enter(&connfp->connf_lock);
1111 	for (tconn = connfp->connf_head; tconn != NULL;
1112 	    tconn = tconn->conn_next) {
1113 		/* We don't allow v4 fallback for v6 raw socket */
1114 		if (connp->conn_family != tconn->conn_family)
1115 			continue;
1116 		/* If neither is exempt, then there's no conflict */
1117 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1118 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1119 			continue;
1120 		/* We are only concerned about sockets for a different zone */
1121 		if (connp->conn_zoneid == tconn->conn_zoneid)
1122 			continue;
1123 		/* If both are bound to different specific addrs, ok */
1124 		if (connp->conn_laddr_v4 != INADDR_ANY &&
1125 		    tconn->conn_laddr_v4 != INADDR_ANY &&
1126 		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1127 			continue;
1128 		/* These two conflict; fail */
1129 		break;
1130 	}
1131 	mutex_exit(&connfp->connf_lock);
1132 	return (tconn != NULL);
1133 }
1134 
1135 static boolean_t
1136 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1137 {
1138 	connf_t	*connfp;
1139 	conn_t *tconn;
1140 
1141 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1142 	mutex_enter(&connfp->connf_lock);
1143 	for (tconn = connfp->connf_head; tconn != NULL;
1144 	    tconn = tconn->conn_next) {
1145 		/* We don't allow v4 fallback for v6 raw socket */
1146 		if (connp->conn_family != tconn->conn_family)
1147 			continue;
1148 		/* If neither is exempt, then there's no conflict */
1149 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1150 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1151 			continue;
1152 		/* We are only concerned about sockets for a different zone */
1153 		if (connp->conn_zoneid == tconn->conn_zoneid)
1154 			continue;
1155 		/* If both are bound to different addrs, ok */
1156 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1157 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1158 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1159 		    &tconn->conn_laddr_v6))
1160 			continue;
1161 		/* These two conflict; fail */
1162 		break;
1163 	}
1164 	mutex_exit(&connfp->connf_lock);
1165 	return (tconn != NULL);
1166 }
1167 
1168 /*
1169  * (v4, v6) bind hash insertion routines
1170  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1171  */
1172 
1173 int
1174 ipcl_bind_insert(conn_t *connp)
1175 {
1176 	if (connp->conn_ipversion == IPV6_VERSION)
1177 		return (ipcl_bind_insert_v6(connp));
1178 	else
1179 		return (ipcl_bind_insert_v4(connp));
1180 }
1181 
1182 int
1183 ipcl_bind_insert_v4(conn_t *connp)
1184 {
1185 	connf_t	*connfp;
1186 	int	ret = 0;
1187 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1188 	uint16_t	lport = connp->conn_lport;
1189 	uint8_t		protocol = connp->conn_proto;
1190 
1191 	if (IPCL_IS_IPTUN(connp))
1192 		return (ipcl_iptun_hash_insert(connp, ipst));
1193 
1194 	switch (protocol) {
1195 	default:
1196 		if (is_system_labeled() &&
1197 		    check_exempt_conflict_v4(connp, ipst))
1198 			return (EADDRINUSE);
1199 		/* FALLTHROUGH */
1200 	case IPPROTO_UDP:
1201 		if (protocol == IPPROTO_UDP) {
1202 			connfp = &ipst->ips_ipcl_udp_fanout[
1203 			    IPCL_UDP_HASH(lport, ipst)];
1204 		} else {
1205 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1206 		}
1207 
1208 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1209 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1210 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1211 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1212 		} else {
1213 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1214 		}
1215 		if (protocol == IPPROTO_RSVP)
1216 			ill_set_inputfn_all(ipst);
1217 		break;
1218 
1219 	case IPPROTO_TCP:
1220 		/* Insert it in the Bind Hash */
1221 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1222 		connfp = &ipst->ips_ipcl_bind_fanout[
1223 		    IPCL_BIND_HASH(lport, ipst)];
1224 		if (connp->conn_laddr_v4 != INADDR_ANY) {
1225 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1226 		} else {
1227 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1228 		}
1229 		if (cl_inet_listen != NULL) {
1230 			ASSERT(connp->conn_ipversion == IPV4_VERSION);
1231 			connp->conn_flags |= IPCL_CL_LISTENER;
1232 			(*cl_inet_listen)(
1233 			    connp->conn_netstack->netstack_stackid,
1234 			    IPPROTO_TCP, AF_INET,
1235 			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1236 		}
1237 		break;
1238 
1239 	case IPPROTO_SCTP:
1240 		ret = ipcl_sctp_hash_insert(connp, lport);
1241 		break;
1242 	}
1243 
1244 	return (ret);
1245 }
1246 
1247 int
1248 ipcl_bind_insert_v6(conn_t *connp)
1249 {
1250 	connf_t		*connfp;
1251 	int		ret = 0;
1252 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1253 	uint16_t	lport = connp->conn_lport;
1254 	uint8_t		protocol = connp->conn_proto;
1255 
1256 	if (IPCL_IS_IPTUN(connp)) {
1257 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1258 	}
1259 
1260 	switch (protocol) {
1261 	default:
1262 		if (is_system_labeled() &&
1263 		    check_exempt_conflict_v6(connp, ipst))
1264 			return (EADDRINUSE);
1265 		/* FALLTHROUGH */
1266 	case IPPROTO_UDP:
1267 		if (protocol == IPPROTO_UDP) {
1268 			connfp = &ipst->ips_ipcl_udp_fanout[
1269 			    IPCL_UDP_HASH(lport, ipst)];
1270 		} else {
1271 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1272 		}
1273 
1274 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1275 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1276 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1277 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1278 		} else {
1279 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1280 		}
1281 		break;
1282 
1283 	case IPPROTO_TCP:
1284 		/* Insert it in the Bind Hash */
1285 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1286 		connfp = &ipst->ips_ipcl_bind_fanout[
1287 		    IPCL_BIND_HASH(lport, ipst)];
1288 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1289 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1290 		} else {
1291 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1292 		}
1293 		if (cl_inet_listen != NULL) {
1294 			sa_family_t	addr_family;
1295 			uint8_t		*laddrp;
1296 
1297 			if (connp->conn_ipversion == IPV6_VERSION) {
1298 				addr_family = AF_INET6;
1299 				laddrp =
1300 				    (uint8_t *)&connp->conn_bound_addr_v6;
1301 			} else {
1302 				addr_family = AF_INET;
1303 				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1304 			}
1305 			connp->conn_flags |= IPCL_CL_LISTENER;
1306 			(*cl_inet_listen)(
1307 			    connp->conn_netstack->netstack_stackid,
1308 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1309 		}
1310 		break;
1311 
1312 	case IPPROTO_SCTP:
1313 		ret = ipcl_sctp_hash_insert(connp, lport);
1314 		break;
1315 	}
1316 
1317 	return (ret);
1318 }
1319 
1320 /*
1321  * ipcl_conn_hash insertion routines.
1322  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1323  */
1324 
1325 int
1326 ipcl_conn_insert(conn_t *connp)
1327 {
1328 	if (connp->conn_ipversion == IPV6_VERSION)
1329 		return (ipcl_conn_insert_v6(connp));
1330 	else
1331 		return (ipcl_conn_insert_v4(connp));
1332 }
1333 
1334 int
1335 ipcl_conn_insert_v4(conn_t *connp)
1336 {
1337 	connf_t		*connfp;
1338 	conn_t		*tconnp;
1339 	int		ret = 0;
1340 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1341 	uint16_t	lport = connp->conn_lport;
1342 	uint8_t		protocol = connp->conn_proto;
1343 
1344 	if (IPCL_IS_IPTUN(connp))
1345 		return (ipcl_iptun_hash_insert(connp, ipst));
1346 
1347 	switch (protocol) {
1348 	case IPPROTO_TCP:
1349 		/*
1350 		 * For TCP, we check whether the connection tuple already
1351 		 * exists before allowing the connection to proceed.  We
1352 		 * also allow indexing on the zoneid. This is to allow
1353 		 * multiple shared stack zones to have the same tcp
1354 		 * connection tuple. In practice this only happens for
1355 		 * INADDR_LOOPBACK as it's the only local address which
1356 		 * doesn't have to be unique.
1357 		 */
1358 		connfp = &ipst->ips_ipcl_conn_fanout[
1359 		    IPCL_CONN_HASH(connp->conn_faddr_v4,
1360 		    connp->conn_ports, ipst)];
1361 		mutex_enter(&connfp->connf_lock);
1362 		for (tconnp = connfp->connf_head; tconnp != NULL;
1363 		    tconnp = tconnp->conn_next) {
1364 			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1365 			    connp->conn_faddr_v4, connp->conn_laddr_v4,
1366 			    connp->conn_ports) &&
1367 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1368 				/* Already have a conn. bail out */
1369 				mutex_exit(&connfp->connf_lock);
1370 				return (EADDRINUSE);
1371 			}
1372 		}
1373 		if (connp->conn_fanout != NULL) {
1374 			/*
1375 			 * Probably a XTI/TLI application trying to do a
1376 			 * rebind. Let it happen.
1377 			 */
1378 			mutex_exit(&connfp->connf_lock);
1379 			IPCL_HASH_REMOVE(connp);
1380 			mutex_enter(&connfp->connf_lock);
1381 		}
1382 
1383 		ASSERT(connp->conn_recv != NULL);
1384 		ASSERT(connp->conn_recvicmp != NULL);
1385 
1386 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1387 		mutex_exit(&connfp->connf_lock);
1388 		break;
1389 
1390 	case IPPROTO_SCTP:
1391 		/*
1392 		 * The raw socket may have already been bound, remove it
1393 		 * from the hash first.
1394 		 */
1395 		IPCL_HASH_REMOVE(connp);
1396 		ret = ipcl_sctp_hash_insert(connp, lport);
1397 		break;
1398 
1399 	default:
1400 		/*
1401 		 * Check for conflicts among MAC exempt bindings.  For
1402 		 * transports with port numbers, this is done by the upper
1403 		 * level per-transport binding logic.  For all others, it's
1404 		 * done here.
1405 		 */
1406 		if (is_system_labeled() &&
1407 		    check_exempt_conflict_v4(connp, ipst))
1408 			return (EADDRINUSE);
1409 		/* FALLTHROUGH */
1410 
1411 	case IPPROTO_UDP:
1412 		if (protocol == IPPROTO_UDP) {
1413 			connfp = &ipst->ips_ipcl_udp_fanout[
1414 			    IPCL_UDP_HASH(lport, ipst)];
1415 		} else {
1416 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1417 		}
1418 
1419 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1420 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1421 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1422 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1423 		} else {
1424 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1425 		}
1426 		break;
1427 	}
1428 
1429 	return (ret);
1430 }
1431 
1432 int
1433 ipcl_conn_insert_v6(conn_t *connp)
1434 {
1435 	connf_t		*connfp;
1436 	conn_t		*tconnp;
1437 	int		ret = 0;
1438 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1439 	uint16_t	lport = connp->conn_lport;
1440 	uint8_t		protocol = connp->conn_proto;
1441 	uint_t		ifindex = connp->conn_bound_if;
1442 
1443 	if (IPCL_IS_IPTUN(connp))
1444 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1445 
1446 	switch (protocol) {
1447 	case IPPROTO_TCP:
1448 
1449 		/*
1450 		 * For tcp, we check whether the connection tuple already
1451 		 * exists before allowing the connection to proceed.  We
1452 		 * also allow indexing on the zoneid. This is to allow
1453 		 * multiple shared stack zones to have the same tcp
1454 		 * connection tuple. In practice this only happens for
1455 		 * ipv6_loopback as it's the only local address which
1456 		 * doesn't have to be unique.
1457 		 */
1458 		connfp = &ipst->ips_ipcl_conn_fanout[
1459 		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1460 		    ipst)];
1461 		mutex_enter(&connfp->connf_lock);
1462 		for (tconnp = connfp->connf_head; tconnp != NULL;
1463 		    tconnp = tconnp->conn_next) {
1464 			/* NOTE: need to match zoneid. Bug in onnv-gate */
1465 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1466 			    connp->conn_faddr_v6, connp->conn_laddr_v6,
1467 			    connp->conn_ports) &&
1468 			    (tconnp->conn_bound_if == 0 ||
1469 			    tconnp->conn_bound_if == ifindex) &&
1470 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1471 				/* Already have a conn. bail out */
1472 				mutex_exit(&connfp->connf_lock);
1473 				return (EADDRINUSE);
1474 			}
1475 		}
1476 		if (connp->conn_fanout != NULL) {
1477 			/*
1478 			 * Probably a XTI/TLI application trying to do a
1479 			 * rebind. Let it happen.
1480 			 */
1481 			mutex_exit(&connfp->connf_lock);
1482 			IPCL_HASH_REMOVE(connp);
1483 			mutex_enter(&connfp->connf_lock);
1484 		}
1485 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1486 		mutex_exit(&connfp->connf_lock);
1487 		break;
1488 
1489 	case IPPROTO_SCTP:
1490 		IPCL_HASH_REMOVE(connp);
1491 		ret = ipcl_sctp_hash_insert(connp, lport);
1492 		break;
1493 
1494 	default:
1495 		if (is_system_labeled() &&
1496 		    check_exempt_conflict_v6(connp, ipst))
1497 			return (EADDRINUSE);
1498 		/* FALLTHROUGH */
1499 	case IPPROTO_UDP:
1500 		if (protocol == IPPROTO_UDP) {
1501 			connfp = &ipst->ips_ipcl_udp_fanout[
1502 			    IPCL_UDP_HASH(lport, ipst)];
1503 		} else {
1504 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1505 		}
1506 
1507 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1508 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1509 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1510 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1511 		} else {
1512 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1513 		}
1514 		break;
1515 	}
1516 
1517 	return (ret);
1518 }
1519 
1520 /*
1521  * v4 packet classifying function. looks up the fanout table to
1522  * find the conn, the packet belongs to. returns the conn with
1523  * the reference held, null otherwise.
1524  *
1525  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1526  * Lookup" comment block are applied.  Labels are also checked as described
1527  * above.  If the packet is from the inside (looped back), and is from the same
1528  * zone, then label checks are omitted.
1529  */
1530 conn_t *
1531 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1532     ip_recv_attr_t *ira, ip_stack_t *ipst)
1533 {
1534 	ipha_t	*ipha;
1535 	connf_t	*connfp, *bind_connfp;
1536 	uint16_t lport;
1537 	uint16_t fport;
1538 	uint32_t ports;
1539 	conn_t	*connp;
1540 	uint16_t  *up;
1541 	zoneid_t	zoneid = ira->ira_zoneid;
1542 
1543 	ipha = (ipha_t *)mp->b_rptr;
1544 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1545 
1546 	switch (protocol) {
1547 	case IPPROTO_TCP:
1548 		ports = *(uint32_t *)up;
1549 		connfp =
1550 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1551 		    ports, ipst)];
1552 		mutex_enter(&connfp->connf_lock);
1553 		for (connp = connfp->connf_head; connp != NULL;
1554 		    connp = connp->conn_next) {
1555 			if (IPCL_CONN_MATCH(connp, protocol,
1556 			    ipha->ipha_src, ipha->ipha_dst, ports) &&
1557 			    (connp->conn_zoneid == zoneid ||
1558 			    connp->conn_allzones ||
1559 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1560 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1561 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1562 				break;
1563 		}
1564 
1565 		if (connp != NULL) {
1566 			/*
1567 			 * We have a fully-bound TCP connection.
1568 			 *
1569 			 * For labeled systems, there's no need to check the
1570 			 * label here.  It's known to be good as we checked
1571 			 * before allowing the connection to become bound.
1572 			 */
1573 			CONN_INC_REF(connp);
1574 			mutex_exit(&connfp->connf_lock);
1575 			return (connp);
1576 		}
1577 
1578 		mutex_exit(&connfp->connf_lock);
1579 		lport = up[1];
1580 		bind_connfp =
1581 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1582 		mutex_enter(&bind_connfp->connf_lock);
1583 		for (connp = bind_connfp->connf_head; connp != NULL;
1584 		    connp = connp->conn_next) {
1585 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1586 			    lport) &&
1587 			    (connp->conn_zoneid == zoneid ||
1588 			    connp->conn_allzones ||
1589 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1590 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1591 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1592 				break;
1593 		}
1594 
1595 		/*
1596 		 * If the matching connection is SLP on a private address, then
1597 		 * the label on the packet must match the local zone's label.
1598 		 * Otherwise, it must be in the label range defined by tnrh.
1599 		 * This is ensured by tsol_receive_local.
1600 		 *
1601 		 * Note that we don't check tsol_receive_local for
1602 		 * the connected case.
1603 		 */
1604 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1605 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1606 		    ira, connp)) {
1607 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1608 			    char *, "connp(1) could not receive mp(2)",
1609 			    conn_t *, connp, mblk_t *, mp);
1610 			connp = NULL;
1611 		}
1612 
1613 		if (connp != NULL) {
1614 			/* Have a listener at least */
1615 			CONN_INC_REF(connp);
1616 			mutex_exit(&bind_connfp->connf_lock);
1617 			return (connp);
1618 		}
1619 
1620 		mutex_exit(&bind_connfp->connf_lock);
1621 		break;
1622 
1623 	case IPPROTO_UDP:
1624 		lport = up[1];
1625 		fport = up[0];
1626 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1627 		mutex_enter(&connfp->connf_lock);
1628 		for (connp = connfp->connf_head; connp != NULL;
1629 		    connp = connp->conn_next) {
1630 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1631 			    fport, ipha->ipha_src) &&
1632 			    (connp->conn_zoneid == zoneid ||
1633 			    connp->conn_allzones ||
1634 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1635 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1636 				break;
1637 		}
1638 
1639 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1640 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1641 		    ira, connp)) {
1642 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1643 			    char *, "connp(1) could not receive mp(2)",
1644 			    conn_t *, connp, mblk_t *, mp);
1645 			connp = NULL;
1646 		}
1647 
1648 		if (connp != NULL) {
1649 			CONN_INC_REF(connp);
1650 			mutex_exit(&connfp->connf_lock);
1651 			return (connp);
1652 		}
1653 
1654 		/*
1655 		 * We shouldn't come here for multicast/broadcast packets
1656 		 */
1657 		mutex_exit(&connfp->connf_lock);
1658 
1659 		break;
1660 
1661 	case IPPROTO_ENCAP:
1662 	case IPPROTO_IPV6:
1663 		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1664 		    &ipha->ipha_dst, ipst));
1665 	}
1666 
1667 	return (NULL);
1668 }
1669 
1670 conn_t *
1671 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1672     ip_recv_attr_t *ira, ip_stack_t *ipst)
1673 {
1674 	ip6_t		*ip6h;
1675 	connf_t		*connfp, *bind_connfp;
1676 	uint16_t	lport;
1677 	uint16_t	fport;
1678 	tcpha_t		*tcpha;
1679 	uint32_t	ports;
1680 	conn_t		*connp;
1681 	uint16_t	*up;
1682 	zoneid_t	zoneid = ira->ira_zoneid;
1683 
1684 	ip6h = (ip6_t *)mp->b_rptr;
1685 
1686 	switch (protocol) {
1687 	case IPPROTO_TCP:
1688 		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1689 		up = &tcpha->tha_lport;
1690 		ports = *(uint32_t *)up;
1691 
1692 		connfp =
1693 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1694 		    ports, ipst)];
1695 		mutex_enter(&connfp->connf_lock);
1696 		for (connp = connfp->connf_head; connp != NULL;
1697 		    connp = connp->conn_next) {
1698 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1699 			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1700 			    (connp->conn_zoneid == zoneid ||
1701 			    connp->conn_allzones ||
1702 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1703 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1704 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1705 				break;
1706 		}
1707 
1708 		if (connp != NULL) {
1709 			/*
1710 			 * We have a fully-bound TCP connection.
1711 			 *
1712 			 * For labeled systems, there's no need to check the
1713 			 * label here.  It's known to be good as we checked
1714 			 * before allowing the connection to become bound.
1715 			 */
1716 			CONN_INC_REF(connp);
1717 			mutex_exit(&connfp->connf_lock);
1718 			return (connp);
1719 		}
1720 
1721 		mutex_exit(&connfp->connf_lock);
1722 
1723 		lport = up[1];
1724 		bind_connfp =
1725 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1726 		mutex_enter(&bind_connfp->connf_lock);
1727 		for (connp = bind_connfp->connf_head; connp != NULL;
1728 		    connp = connp->conn_next) {
1729 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1730 			    ip6h->ip6_dst, lport) &&
1731 			    (connp->conn_zoneid == zoneid ||
1732 			    connp->conn_allzones ||
1733 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1734 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1735 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1736 				break;
1737 		}
1738 
1739 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1740 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1741 		    ira, connp)) {
1742 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1743 			    char *, "connp(1) could not receive mp(2)",
1744 			    conn_t *, connp, mblk_t *, mp);
1745 			connp = NULL;
1746 		}
1747 
1748 		if (connp != NULL) {
1749 			/* Have a listner at least */
1750 			CONN_INC_REF(connp);
1751 			mutex_exit(&bind_connfp->connf_lock);
1752 			return (connp);
1753 		}
1754 
1755 		mutex_exit(&bind_connfp->connf_lock);
1756 		break;
1757 
1758 	case IPPROTO_UDP:
1759 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1760 		lport = up[1];
1761 		fport = up[0];
1762 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1763 		mutex_enter(&connfp->connf_lock);
1764 		for (connp = connfp->connf_head; connp != NULL;
1765 		    connp = connp->conn_next) {
1766 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1767 			    fport, ip6h->ip6_src) &&
1768 			    (connp->conn_zoneid == zoneid ||
1769 			    connp->conn_allzones ||
1770 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1771 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1772 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1773 				break;
1774 		}
1775 
1776 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1777 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1778 		    ira, connp)) {
1779 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1780 			    char *, "connp(1) could not receive mp(2)",
1781 			    conn_t *, connp, mblk_t *, mp);
1782 			connp = NULL;
1783 		}
1784 
1785 		if (connp != NULL) {
1786 			CONN_INC_REF(connp);
1787 			mutex_exit(&connfp->connf_lock);
1788 			return (connp);
1789 		}
1790 
1791 		/*
1792 		 * We shouldn't come here for multicast/broadcast packets
1793 		 */
1794 		mutex_exit(&connfp->connf_lock);
1795 		break;
1796 	case IPPROTO_ENCAP:
1797 	case IPPROTO_IPV6:
1798 		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1799 		    &ip6h->ip6_dst, ipst));
1800 	}
1801 
1802 	return (NULL);
1803 }
1804 
1805 /*
1806  * wrapper around ipcl_classify_(v4,v6) routines.
1807  */
1808 conn_t *
1809 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1810 {
1811 	if (ira->ira_flags & IRAF_IS_IPV4) {
1812 		return (ipcl_classify_v4(mp, ira->ira_protocol,
1813 		    ira->ira_ip_hdr_length, ira, ipst));
1814 	} else {
1815 		return (ipcl_classify_v6(mp, ira->ira_protocol,
1816 		    ira->ira_ip_hdr_length, ira, ipst));
1817 	}
1818 }
1819 
1820 /*
1821  * Only used to classify SCTP RAW sockets
1822  */
1823 conn_t *
1824 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1825     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1826 {
1827 	connf_t		*connfp;
1828 	conn_t		*connp;
1829 	in_port_t	lport;
1830 	int		ipversion;
1831 	const void	*dst;
1832 	zoneid_t	zoneid = ira->ira_zoneid;
1833 
1834 	lport = ((uint16_t *)&ports)[1];
1835 	if (ira->ira_flags & IRAF_IS_IPV4) {
1836 		dst = (const void *)&ipha->ipha_dst;
1837 		ipversion = IPV4_VERSION;
1838 	} else {
1839 		dst = (const void *)&ip6h->ip6_dst;
1840 		ipversion = IPV6_VERSION;
1841 	}
1842 
1843 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1844 	mutex_enter(&connfp->connf_lock);
1845 	for (connp = connfp->connf_head; connp != NULL;
1846 	    connp = connp->conn_next) {
1847 		/* We don't allow v4 fallback for v6 raw socket. */
1848 		if (ipversion != connp->conn_ipversion)
1849 			continue;
1850 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1851 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1852 			if (ipversion == IPV4_VERSION) {
1853 				if (!IPCL_CONN_MATCH(connp, protocol,
1854 				    ipha->ipha_src, ipha->ipha_dst, ports))
1855 					continue;
1856 			} else {
1857 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1858 				    ip6h->ip6_src, ip6h->ip6_dst, ports))
1859 					continue;
1860 			}
1861 		} else {
1862 			if (ipversion == IPV4_VERSION) {
1863 				if (!IPCL_BIND_MATCH(connp, protocol,
1864 				    ipha->ipha_dst, lport))
1865 					continue;
1866 			} else {
1867 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1868 				    ip6h->ip6_dst, lport))
1869 					continue;
1870 			}
1871 		}
1872 
1873 		if (connp->conn_zoneid == zoneid ||
1874 		    connp->conn_allzones ||
1875 		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1876 		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1877 		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1878 			break;
1879 	}
1880 
1881 	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1882 	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1883 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1884 		    char *, "connp(1) could not receive mp(2)",
1885 		    conn_t *, connp, mblk_t *, mp);
1886 		connp = NULL;
1887 	}
1888 
1889 	if (connp != NULL)
1890 		goto found;
1891 	mutex_exit(&connfp->connf_lock);
1892 
1893 	/* Try to look for a wildcard SCTP RAW socket match. */
1894 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1895 	mutex_enter(&connfp->connf_lock);
1896 	for (connp = connfp->connf_head; connp != NULL;
1897 	    connp = connp->conn_next) {
1898 		/* We don't allow v4 fallback for v6 raw socket. */
1899 		if (ipversion != connp->conn_ipversion)
1900 			continue;
1901 		if (!IPCL_ZONE_MATCH(connp, zoneid))
1902 			continue;
1903 
1904 		if (ipversion == IPV4_VERSION) {
1905 			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1906 				break;
1907 		} else {
1908 			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1909 				break;
1910 			}
1911 		}
1912 	}
1913 
1914 	if (connp != NULL)
1915 		goto found;
1916 
1917 	mutex_exit(&connfp->connf_lock);
1918 	return (NULL);
1919 
1920 found:
1921 	ASSERT(connp != NULL);
1922 	CONN_INC_REF(connp);
1923 	mutex_exit(&connfp->connf_lock);
1924 	return (connp);
1925 }
1926 
1927 /* ARGSUSED */
1928 static int
1929 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1930 {
1931 	itc_t	*itc = (itc_t *)buf;
1932 	conn_t	*connp = &itc->itc_conn;
1933 	tcp_t	*tcp = (tcp_t *)&itc[1];
1934 
1935 	bzero(connp, sizeof (conn_t));
1936 	bzero(tcp, sizeof (tcp_t));
1937 
1938 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1939 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1940 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1941 	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1942 	if (tcp->tcp_timercache == NULL)
1943 		return (ENOMEM);
1944 	connp->conn_tcp = tcp;
1945 	connp->conn_flags = IPCL_TCPCONN;
1946 	connp->conn_proto = IPPROTO_TCP;
1947 	tcp->tcp_connp = connp;
1948 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1949 
1950 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1951 	if (connp->conn_ixa == NULL) {
1952 		tcp_timermp_free(tcp);
1953 		return (ENOMEM);
1954 	}
1955 	connp->conn_ixa->ixa_refcnt = 1;
1956 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
1957 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1958 	return (0);
1959 }
1960 
1961 /* ARGSUSED */
1962 static void
1963 tcp_conn_destructor(void *buf, void *cdrarg)
1964 {
1965 	itc_t	*itc = (itc_t *)buf;
1966 	conn_t	*connp = &itc->itc_conn;
1967 	tcp_t	*tcp = (tcp_t *)&itc[1];
1968 
1969 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
1970 	ASSERT(tcp->tcp_connp == connp);
1971 	ASSERT(connp->conn_tcp == tcp);
1972 	tcp_timermp_free(tcp);
1973 	mutex_destroy(&connp->conn_lock);
1974 	cv_destroy(&connp->conn_cv);
1975 	cv_destroy(&connp->conn_sq_cv);
1976 	rw_destroy(&connp->conn_ilg_lock);
1977 
1978 	/* Can be NULL if constructor failed */
1979 	if (connp->conn_ixa != NULL) {
1980 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1981 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
1982 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
1983 		ixa_refrele(connp->conn_ixa);
1984 	}
1985 }
1986 
1987 /* ARGSUSED */
1988 static int
1989 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1990 {
1991 	itc_t	*itc = (itc_t *)buf;
1992 	conn_t	*connp = &itc->itc_conn;
1993 
1994 	bzero(connp, sizeof (conn_t));
1995 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1996 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1997 	connp->conn_flags = IPCL_IPCCONN;
1998 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1999 
2000 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2001 	if (connp->conn_ixa == NULL)
2002 		return (ENOMEM);
2003 	connp->conn_ixa->ixa_refcnt = 1;
2004 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2005 	return (0);
2006 }
2007 
2008 /* ARGSUSED */
2009 static void
2010 ip_conn_destructor(void *buf, void *cdrarg)
2011 {
2012 	itc_t	*itc = (itc_t *)buf;
2013 	conn_t	*connp = &itc->itc_conn;
2014 
2015 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2016 	ASSERT(connp->conn_priv == NULL);
2017 	mutex_destroy(&connp->conn_lock);
2018 	cv_destroy(&connp->conn_cv);
2019 	rw_destroy(&connp->conn_ilg_lock);
2020 
2021 	/* Can be NULL if constructor failed */
2022 	if (connp->conn_ixa != NULL) {
2023 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2024 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2025 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2026 		ixa_refrele(connp->conn_ixa);
2027 	}
2028 }
2029 
2030 /* ARGSUSED */
2031 static int
2032 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2033 {
2034 	itc_t	*itc = (itc_t *)buf;
2035 	conn_t	*connp = &itc->itc_conn;
2036 	udp_t	*udp = (udp_t *)&itc[1];
2037 
2038 	bzero(connp, sizeof (conn_t));
2039 	bzero(udp, sizeof (udp_t));
2040 
2041 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2042 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2043 	connp->conn_udp = udp;
2044 	connp->conn_flags = IPCL_UDPCONN;
2045 	connp->conn_proto = IPPROTO_UDP;
2046 	udp->udp_connp = connp;
2047 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2048 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2049 	if (connp->conn_ixa == NULL)
2050 		return (ENOMEM);
2051 	connp->conn_ixa->ixa_refcnt = 1;
2052 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2053 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2054 	return (0);
2055 }
2056 
2057 /* ARGSUSED */
2058 static void
2059 udp_conn_destructor(void *buf, void *cdrarg)
2060 {
2061 	itc_t	*itc = (itc_t *)buf;
2062 	conn_t	*connp = &itc->itc_conn;
2063 	udp_t	*udp = (udp_t *)&itc[1];
2064 
2065 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2066 	ASSERT(udp->udp_connp == connp);
2067 	ASSERT(connp->conn_udp == udp);
2068 	mutex_destroy(&connp->conn_lock);
2069 	cv_destroy(&connp->conn_cv);
2070 	rw_destroy(&connp->conn_ilg_lock);
2071 
2072 	/* Can be NULL if constructor failed */
2073 	if (connp->conn_ixa != NULL) {
2074 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2075 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2076 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2077 		ixa_refrele(connp->conn_ixa);
2078 	}
2079 }
2080 
2081 /* ARGSUSED */
2082 static int
2083 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2084 {
2085 	itc_t	*itc = (itc_t *)buf;
2086 	conn_t	*connp = &itc->itc_conn;
2087 	icmp_t	*icmp = (icmp_t *)&itc[1];
2088 
2089 	bzero(connp, sizeof (conn_t));
2090 	bzero(icmp, sizeof (icmp_t));
2091 
2092 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2093 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2094 	connp->conn_icmp = icmp;
2095 	connp->conn_flags = IPCL_RAWIPCONN;
2096 	connp->conn_proto = IPPROTO_ICMP;
2097 	icmp->icmp_connp = connp;
2098 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2099 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2100 	if (connp->conn_ixa == NULL)
2101 		return (ENOMEM);
2102 	connp->conn_ixa->ixa_refcnt = 1;
2103 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2104 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2105 	return (0);
2106 }
2107 
2108 /* ARGSUSED */
2109 static void
2110 rawip_conn_destructor(void *buf, void *cdrarg)
2111 {
2112 	itc_t	*itc = (itc_t *)buf;
2113 	conn_t	*connp = &itc->itc_conn;
2114 	icmp_t	*icmp = (icmp_t *)&itc[1];
2115 
2116 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2117 	ASSERT(icmp->icmp_connp == connp);
2118 	ASSERT(connp->conn_icmp == icmp);
2119 	mutex_destroy(&connp->conn_lock);
2120 	cv_destroy(&connp->conn_cv);
2121 	rw_destroy(&connp->conn_ilg_lock);
2122 
2123 	/* Can be NULL if constructor failed */
2124 	if (connp->conn_ixa != NULL) {
2125 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2126 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2127 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2128 		ixa_refrele(connp->conn_ixa);
2129 	}
2130 }
2131 
2132 /* ARGSUSED */
2133 static int
2134 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2135 {
2136 	itc_t	*itc = (itc_t *)buf;
2137 	conn_t	*connp = &itc->itc_conn;
2138 	rts_t	*rts = (rts_t *)&itc[1];
2139 
2140 	bzero(connp, sizeof (conn_t));
2141 	bzero(rts, sizeof (rts_t));
2142 
2143 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2144 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2145 	connp->conn_rts = rts;
2146 	connp->conn_flags = IPCL_RTSCONN;
2147 	rts->rts_connp = connp;
2148 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2149 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2150 	if (connp->conn_ixa == NULL)
2151 		return (ENOMEM);
2152 	connp->conn_ixa->ixa_refcnt = 1;
2153 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2154 	return (0);
2155 }
2156 
2157 /* ARGSUSED */
2158 static void
2159 rts_conn_destructor(void *buf, void *cdrarg)
2160 {
2161 	itc_t	*itc = (itc_t *)buf;
2162 	conn_t	*connp = &itc->itc_conn;
2163 	rts_t	*rts = (rts_t *)&itc[1];
2164 
2165 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2166 	ASSERT(rts->rts_connp == connp);
2167 	ASSERT(connp->conn_rts == rts);
2168 	mutex_destroy(&connp->conn_lock);
2169 	cv_destroy(&connp->conn_cv);
2170 	rw_destroy(&connp->conn_ilg_lock);
2171 
2172 	/* Can be NULL if constructor failed */
2173 	if (connp->conn_ixa != NULL) {
2174 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2175 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2176 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2177 		ixa_refrele(connp->conn_ixa);
2178 	}
2179 }
2180 
2181 /*
2182  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2183  * in the conn_t.
2184  *
2185  * Below we list all the pointers in the conn_t as a documentation aid.
2186  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2187  * If you add any pointers to the conn_t please add an ASSERT here
2188  * and #ifdef it out if it can't be actually asserted to be NULL.
2189  * In any case, we bzero most of the conn_t at the end of the function.
2190  */
2191 void
2192 ipcl_conn_cleanup(conn_t *connp)
2193 {
2194 	ip_xmit_attr_t	*ixa;
2195 
2196 	ASSERT(connp->conn_latch == NULL);
2197 	ASSERT(connp->conn_latch_in_policy == NULL);
2198 	ASSERT(connp->conn_latch_in_action == NULL);
2199 #ifdef notdef
2200 	ASSERT(connp->conn_rq == NULL);
2201 	ASSERT(connp->conn_wq == NULL);
2202 #endif
2203 	ASSERT(connp->conn_cred == NULL);
2204 	ASSERT(connp->conn_g_fanout == NULL);
2205 	ASSERT(connp->conn_g_next == NULL);
2206 	ASSERT(connp->conn_g_prev == NULL);
2207 	ASSERT(connp->conn_policy == NULL);
2208 	ASSERT(connp->conn_fanout == NULL);
2209 	ASSERT(connp->conn_next == NULL);
2210 	ASSERT(connp->conn_prev == NULL);
2211 	ASSERT(connp->conn_oper_pending_ill == NULL);
2212 	ASSERT(connp->conn_ilg == NULL);
2213 	ASSERT(connp->conn_drain_next == NULL);
2214 	ASSERT(connp->conn_drain_prev == NULL);
2215 #ifdef notdef
2216 	/* conn_idl is not cleared when removed from idl list */
2217 	ASSERT(connp->conn_idl == NULL);
2218 #endif
2219 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2220 #ifdef notdef
2221 	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2222 	ASSERT(connp->conn_netstack == NULL);
2223 #endif
2224 
2225 	ASSERT(connp->conn_helper_info == NULL);
2226 	ASSERT(connp->conn_ixa != NULL);
2227 	ixa = connp->conn_ixa;
2228 	ASSERT(ixa->ixa_refcnt == 1);
2229 	/* Need to preserve ixa_protocol */
2230 	ixa_cleanup(ixa);
2231 	ixa->ixa_flags = 0;
2232 
2233 	/* Clear out the conn_t fields that are not preserved */
2234 	bzero(&connp->conn_start_clr,
2235 	    sizeof (conn_t) -
2236 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2237 }
2238 
2239 /*
2240  * All conns are inserted in a global multi-list for the benefit of
2241  * walkers. The walk is guaranteed to walk all open conns at the time
2242  * of the start of the walk exactly once. This property is needed to
2243  * achieve some cleanups during unplumb of interfaces. This is achieved
2244  * as follows.
2245  *
2246  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2247  * call the insert and delete functions below at creation and deletion
2248  * time respectively. The conn never moves or changes its position in this
2249  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2250  * won't increase due to walkers, once the conn deletion has started. Note
2251  * that we can't remove the conn from the global list and then wait for
2252  * the refcnt to drop to zero, since walkers would then see a truncated
2253  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2254  * conns until ip_open is ready to make them globally visible.
2255  * The global round robin multi-list locks are held only to get the
2256  * next member/insertion/deletion and contention should be negligible
2257  * if the multi-list is much greater than the number of cpus.
2258  */
2259 void
2260 ipcl_globalhash_insert(conn_t *connp)
2261 {
2262 	int	index;
2263 	struct connf_s	*connfp;
2264 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2265 
2266 	/*
2267 	 * No need for atomic here. Approximate even distribution
2268 	 * in the global lists is sufficient.
2269 	 */
2270 	ipst->ips_conn_g_index++;
2271 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2272 
2273 	connp->conn_g_prev = NULL;
2274 	/*
2275 	 * Mark as INCIPIENT, so that walkers will ignore this
2276 	 * for now, till ip_open is ready to make it visible globally.
2277 	 */
2278 	connp->conn_state_flags |= CONN_INCIPIENT;
2279 
2280 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2281 	/* Insert at the head of the list */
2282 	mutex_enter(&connfp->connf_lock);
2283 	connp->conn_g_next = connfp->connf_head;
2284 	if (connp->conn_g_next != NULL)
2285 		connp->conn_g_next->conn_g_prev = connp;
2286 	connfp->connf_head = connp;
2287 
2288 	/* The fanout bucket this conn points to */
2289 	connp->conn_g_fanout = connfp;
2290 
2291 	mutex_exit(&connfp->connf_lock);
2292 }
2293 
2294 void
2295 ipcl_globalhash_remove(conn_t *connp)
2296 {
2297 	struct connf_s	*connfp;
2298 
2299 	/*
2300 	 * We were never inserted in the global multi list.
2301 	 * IPCL_NONE variety is never inserted in the global multilist
2302 	 * since it is presumed to not need any cleanup and is transient.
2303 	 */
2304 	if (connp->conn_g_fanout == NULL)
2305 		return;
2306 
2307 	connfp = connp->conn_g_fanout;
2308 	mutex_enter(&connfp->connf_lock);
2309 	if (connp->conn_g_prev != NULL)
2310 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2311 	else
2312 		connfp->connf_head = connp->conn_g_next;
2313 	if (connp->conn_g_next != NULL)
2314 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2315 	mutex_exit(&connfp->connf_lock);
2316 
2317 	/* Better to stumble on a null pointer than to corrupt memory */
2318 	connp->conn_g_next = NULL;
2319 	connp->conn_g_prev = NULL;
2320 	connp->conn_g_fanout = NULL;
2321 }
2322 
2323 /*
2324  * Walk the list of all conn_t's in the system, calling the function provided
2325  * With the specified argument for each.
2326  * Applies to both IPv4 and IPv6.
2327  *
2328  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2329  * conn_oper_pending_ill). To guard against stale pointers
2330  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2331  * unplumbed or removed. New conn_t's that are created while we are walking
2332  * may be missed by this walk, because they are not necessarily inserted
2333  * at the tail of the list. They are new conn_t's and thus don't have any
2334  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2335  * is created to the struct that is going away.
2336  */
2337 void
2338 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2339 {
2340 	int	i;
2341 	conn_t	*connp;
2342 	conn_t	*prev_connp;
2343 
2344 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2345 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2346 		prev_connp = NULL;
2347 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2348 		while (connp != NULL) {
2349 			mutex_enter(&connp->conn_lock);
2350 			if (connp->conn_state_flags &
2351 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2352 				mutex_exit(&connp->conn_lock);
2353 				connp = connp->conn_g_next;
2354 				continue;
2355 			}
2356 			CONN_INC_REF_LOCKED(connp);
2357 			mutex_exit(&connp->conn_lock);
2358 			mutex_exit(
2359 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2360 			(*func)(connp, arg);
2361 			if (prev_connp != NULL)
2362 				CONN_DEC_REF(prev_connp);
2363 			mutex_enter(
2364 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2365 			prev_connp = connp;
2366 			connp = connp->conn_g_next;
2367 		}
2368 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2369 		if (prev_connp != NULL)
2370 			CONN_DEC_REF(prev_connp);
2371 	}
2372 }
2373 
2374 /*
2375  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2376  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2377  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2378  * (peer tcp in ESTABLISHED state).
2379  */
2380 conn_t *
2381 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2382     ip_stack_t *ipst)
2383 {
2384 	uint32_t ports;
2385 	uint16_t *pports = (uint16_t *)&ports;
2386 	connf_t	*connfp;
2387 	conn_t	*tconnp;
2388 	boolean_t zone_chk;
2389 
2390 	/*
2391 	 * If either the source of destination address is loopback, then
2392 	 * both endpoints must be in the same Zone.  Otherwise, both of
2393 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2394 	 * state) and the endpoints may reside in different Zones.
2395 	 */
2396 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2397 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2398 
2399 	pports[0] = tcpha->tha_fport;
2400 	pports[1] = tcpha->tha_lport;
2401 
2402 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2403 	    ports, ipst)];
2404 
2405 	mutex_enter(&connfp->connf_lock);
2406 	for (tconnp = connfp->connf_head; tconnp != NULL;
2407 	    tconnp = tconnp->conn_next) {
2408 
2409 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2410 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2411 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2412 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2413 
2414 			ASSERT(tconnp != connp);
2415 			CONN_INC_REF(tconnp);
2416 			mutex_exit(&connfp->connf_lock);
2417 			return (tconnp);
2418 		}
2419 	}
2420 	mutex_exit(&connfp->connf_lock);
2421 	return (NULL);
2422 }
2423 
2424 /*
2425  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2426  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2427  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2428  * (peer tcp in ESTABLISHED state).
2429  */
2430 conn_t *
2431 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2432     ip_stack_t *ipst)
2433 {
2434 	uint32_t ports;
2435 	uint16_t *pports = (uint16_t *)&ports;
2436 	connf_t	*connfp;
2437 	conn_t	*tconnp;
2438 	boolean_t zone_chk;
2439 
2440 	/*
2441 	 * If either the source of destination address is loopback, then
2442 	 * both endpoints must be in the same Zone.  Otherwise, both of
2443 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2444 	 * state) and the endpoints may reside in different Zones.  We
2445 	 * don't do Zone check for link local address(es) because the
2446 	 * current Zone implementation treats each link local address as
2447 	 * being unique per system node, i.e. they belong to global Zone.
2448 	 */
2449 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2450 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2451 
2452 	pports[0] = tcpha->tha_fport;
2453 	pports[1] = tcpha->tha_lport;
2454 
2455 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2456 	    ports, ipst)];
2457 
2458 	mutex_enter(&connfp->connf_lock);
2459 	for (tconnp = connfp->connf_head; tconnp != NULL;
2460 	    tconnp = tconnp->conn_next) {
2461 
2462 		/* We skip conn_bound_if check here as this is loopback tcp */
2463 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2464 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2465 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2466 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2467 
2468 			ASSERT(tconnp != connp);
2469 			CONN_INC_REF(tconnp);
2470 			mutex_exit(&connfp->connf_lock);
2471 			return (tconnp);
2472 		}
2473 	}
2474 	mutex_exit(&connfp->connf_lock);
2475 	return (NULL);
2476 }
2477 
2478 /*
2479  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2480  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2481  * Only checks for connected entries i.e. no INADDR_ANY checks.
2482  */
2483 conn_t *
2484 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2485     ip_stack_t *ipst)
2486 {
2487 	uint32_t ports;
2488 	uint16_t *pports;
2489 	connf_t	*connfp;
2490 	conn_t	*tconnp;
2491 
2492 	pports = (uint16_t *)&ports;
2493 	pports[0] = tcpha->tha_fport;
2494 	pports[1] = tcpha->tha_lport;
2495 
2496 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2497 	    ports, ipst)];
2498 
2499 	mutex_enter(&connfp->connf_lock);
2500 	for (tconnp = connfp->connf_head; tconnp != NULL;
2501 	    tconnp = tconnp->conn_next) {
2502 
2503 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2504 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2505 		    tconnp->conn_tcp->tcp_state >= min_state) {
2506 
2507 			CONN_INC_REF(tconnp);
2508 			mutex_exit(&connfp->connf_lock);
2509 			return (tconnp);
2510 		}
2511 	}
2512 	mutex_exit(&connfp->connf_lock);
2513 	return (NULL);
2514 }
2515 
2516 /*
2517  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2518  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2519  * Only checks for connected entries i.e. no INADDR_ANY checks.
2520  * Match on ifindex in addition to addresses.
2521  */
2522 conn_t *
2523 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2524     uint_t ifindex, ip_stack_t *ipst)
2525 {
2526 	tcp_t	*tcp;
2527 	uint32_t ports;
2528 	uint16_t *pports;
2529 	connf_t	*connfp;
2530 	conn_t	*tconnp;
2531 
2532 	pports = (uint16_t *)&ports;
2533 	pports[0] = tcpha->tha_fport;
2534 	pports[1] = tcpha->tha_lport;
2535 
2536 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2537 	    ports, ipst)];
2538 
2539 	mutex_enter(&connfp->connf_lock);
2540 	for (tconnp = connfp->connf_head; tconnp != NULL;
2541 	    tconnp = tconnp->conn_next) {
2542 
2543 		tcp = tconnp->conn_tcp;
2544 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2545 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2546 		    tcp->tcp_state >= min_state &&
2547 		    (tconnp->conn_bound_if == 0 ||
2548 		    tconnp->conn_bound_if == ifindex)) {
2549 
2550 			CONN_INC_REF(tconnp);
2551 			mutex_exit(&connfp->connf_lock);
2552 			return (tconnp);
2553 		}
2554 	}
2555 	mutex_exit(&connfp->connf_lock);
2556 	return (NULL);
2557 }
2558 
2559 /*
2560  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2561  * a listener when changing state.
2562  */
2563 conn_t *
2564 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2565     ip_stack_t *ipst)
2566 {
2567 	connf_t		*bind_connfp;
2568 	conn_t		*connp;
2569 	tcp_t		*tcp;
2570 
2571 	/*
2572 	 * Avoid false matches for packets sent to an IP destination of
2573 	 * all zeros.
2574 	 */
2575 	if (laddr == 0)
2576 		return (NULL);
2577 
2578 	ASSERT(zoneid != ALL_ZONES);
2579 
2580 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2581 	mutex_enter(&bind_connfp->connf_lock);
2582 	for (connp = bind_connfp->connf_head; connp != NULL;
2583 	    connp = connp->conn_next) {
2584 		tcp = connp->conn_tcp;
2585 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2586 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2587 		    (tcp->tcp_listener == NULL)) {
2588 			CONN_INC_REF(connp);
2589 			mutex_exit(&bind_connfp->connf_lock);
2590 			return (connp);
2591 		}
2592 	}
2593 	mutex_exit(&bind_connfp->connf_lock);
2594 	return (NULL);
2595 }
2596 
2597 /*
2598  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2599  * a listener when changing state.
2600  */
2601 conn_t *
2602 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2603     zoneid_t zoneid, ip_stack_t *ipst)
2604 {
2605 	connf_t		*bind_connfp;
2606 	conn_t		*connp = NULL;
2607 	tcp_t		*tcp;
2608 
2609 	/*
2610 	 * Avoid false matches for packets sent to an IP destination of
2611 	 * all zeros.
2612 	 */
2613 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2614 		return (NULL);
2615 
2616 	ASSERT(zoneid != ALL_ZONES);
2617 
2618 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2619 	mutex_enter(&bind_connfp->connf_lock);
2620 	for (connp = bind_connfp->connf_head; connp != NULL;
2621 	    connp = connp->conn_next) {
2622 		tcp = connp->conn_tcp;
2623 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2624 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2625 		    (connp->conn_bound_if == 0 ||
2626 		    connp->conn_bound_if == ifindex) &&
2627 		    tcp->tcp_listener == NULL) {
2628 			CONN_INC_REF(connp);
2629 			mutex_exit(&bind_connfp->connf_lock);
2630 			return (connp);
2631 		}
2632 	}
2633 	mutex_exit(&bind_connfp->connf_lock);
2634 	return (NULL);
2635 }
2636 
2637 /*
2638  * ipcl_get_next_conn
2639  *	get the next entry in the conn global list
2640  *	and put a reference on the next_conn.
2641  *	decrement the reference on the current conn.
2642  *
2643  * This is an iterator based walker function that also provides for
2644  * some selection by the caller. It walks through the conn_hash bucket
2645  * searching for the next valid connp in the list, and selects connections
2646  * that are neither closed nor condemned. It also REFHOLDS the conn
2647  * thus ensuring that the conn exists when the caller uses the conn.
2648  */
2649 conn_t *
2650 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2651 {
2652 	conn_t	*next_connp;
2653 
2654 	if (connfp == NULL)
2655 		return (NULL);
2656 
2657 	mutex_enter(&connfp->connf_lock);
2658 
2659 	next_connp = (connp == NULL) ?
2660 	    connfp->connf_head : connp->conn_g_next;
2661 
2662 	while (next_connp != NULL) {
2663 		mutex_enter(&next_connp->conn_lock);
2664 		if (!(next_connp->conn_flags & conn_flags) ||
2665 		    (next_connp->conn_state_flags &
2666 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2667 			/*
2668 			 * This conn has been condemned or
2669 			 * is closing, or the flags don't match
2670 			 */
2671 			mutex_exit(&next_connp->conn_lock);
2672 			next_connp = next_connp->conn_g_next;
2673 			continue;
2674 		}
2675 		CONN_INC_REF_LOCKED(next_connp);
2676 		mutex_exit(&next_connp->conn_lock);
2677 		break;
2678 	}
2679 
2680 	mutex_exit(&connfp->connf_lock);
2681 
2682 	if (connp != NULL)
2683 		CONN_DEC_REF(connp);
2684 
2685 	return (next_connp);
2686 }
2687 
2688 #ifdef CONN_DEBUG
2689 /*
2690  * Trace of the last NBUF refhold/refrele
2691  */
2692 int
2693 conn_trace_ref(conn_t *connp)
2694 {
2695 	int	last;
2696 	conn_trace_t	*ctb;
2697 
2698 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2699 	last = connp->conn_trace_last;
2700 	last++;
2701 	if (last == CONN_TRACE_MAX)
2702 		last = 0;
2703 
2704 	ctb = &connp->conn_trace_buf[last];
2705 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2706 	connp->conn_trace_last = last;
2707 	return (1);
2708 }
2709 
2710 int
2711 conn_untrace_ref(conn_t *connp)
2712 {
2713 	int	last;
2714 	conn_trace_t	*ctb;
2715 
2716 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2717 	last = connp->conn_trace_last;
2718 	last++;
2719 	if (last == CONN_TRACE_MAX)
2720 		last = 0;
2721 
2722 	ctb = &connp->conn_trace_buf[last];
2723 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2724 	connp->conn_trace_last = last;
2725 	return (1);
2726 }
2727 #endif
2728 
2729 mib2_socketInfoEntry_t *
2730 conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
2731 {
2732 	vnode_t *vn = NULL;
2733 	vattr_t attr;
2734 	uint64_t flags = 0;
2735 	sock_upcalls_t *upcalls;
2736 	sock_upper_handle_t upper_handle;
2737 
2738 	/*
2739 	 * If the connection is closing, it is not safe to make an upcall or
2740 	 * access the stream associated with the connection.
2741 	 * The callers of this function have a reference on connp itself
2742 	 * so, as long as it is not closing, it's safe to continue.
2743 	 */
2744 	mutex_enter(&connp->conn_lock);
2745 
2746 	if ((connp->conn_state_flags & CONN_CLOSING)) {
2747 		mutex_exit(&connp->conn_lock);
2748 		return (NULL);
2749 	}
2750 
2751 	/*
2752 	 * Continue to hold conn_lock because we don't want to race with an
2753 	 * in-progress close, which will have set-to-NULL (and destroyed
2754 	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
2755 	 *
2756 	 * There is still a race with an in-progress OPEN, however, where
2757 	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
2758 	 * codepaths) WITHOUT conn_lock being held.  We address that race
2759 	 * HERE, however, given that both are going from NULL to non-NULL,
2760 	 * if we lose the race, we don't get any data for the in-progress-OPEN
2761 	 * socket.
2762 	 */
2763 
2764 	upcalls = connp->conn_upcalls;
2765 	upper_handle = connp->conn_upper_handle;
2766 	/* Check BOTH for non-NULL before attempting an upcall. */
2767 	if (upper_handle != NULL && upcalls != NULL) {
2768 		/* su_get_vnode() returns one with VN_HOLD() already done. */
2769 		vn = upcalls->su_get_vnode(upper_handle);
2770 	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
2771 		vn = STREAM(connp->conn_rq)->sd_pvnode;
2772 		if (vn != NULL)
2773 			VN_HOLD(vn);
2774 		flags |= MIB2_SOCKINFO_STREAM;
2775 	}
2776 
2777 	mutex_exit(&connp->conn_lock);
2778 
2779 	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
2780 		if (vn != NULL)
2781 			VN_RELE(vn);
2782 		return (NULL);
2783 	}
2784 
2785 	VN_RELE(vn);
2786 
2787 	bzero(sie, sizeof (*sie));
2788 
2789 	sie->sie_flags = flags;
2790 	sie->sie_inode = attr.va_nodeid;
2791 	sie->sie_dev = attr.va_rdev;
2792 
2793 	return (sie);
2794 }
2795