xref: /illumos-gate/usr/src/uts/common/inet/ip/ipclassifier.c (revision 07a48826732249fcd3aa8dd53c8389595e9f1fbc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP PACKET CLASSIFIER
28  *
29  * The IP packet classifier provides mapping between IP packets and persistent
 * connection state for connection-oriented protocols. It also provides
 * an interface for managing connection states.
32  *
33  * The connection state is kept in conn_t data structure and contains, among
34  * other things:
35  *
36  *	o local/remote address and ports
37  *	o Transport protocol
38  *	o squeue for the connection (for TCP only)
39  *	o reference counter
40  *	o Connection state
41  *	o hash table linkage
42  *	o interface/ire information
43  *	o credentials
44  *	o ipsec policy
45  *	o send and receive functions.
46  *	o mutex lock.
47  *
48  * Connections use a reference counting scheme. They are freed when the
49  * reference counter drops to zero. A reference is incremented when connection
50  * is placed in a list or table, when incoming packet for the connection arrives
51  * and when connection is processed via squeue (squeue processing may be
52  * asynchronous and the reference protects the connection from being destroyed
53  * before its processing is finished).
54  *
55  * send and receive functions are currently used for TCP only. The send function
56  * determines the IP entry point for the packet once it leaves TCP to be sent to
57  * the destination address. The receive function is used by IP when the packet
58  * should be passed for TCP processing. When a new connection is created these
59  * are set to ip_output() and tcp_input() respectively. During the lifetime of
60  * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
63  * tcp_conn_request().  This allows incoming SYNs to go directly into the
64  * listener SYN processing function without going to tcp_input() first.
65  *
66  * Classifier uses several hash tables:
67  *
68  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
69  *	ipcl_bind_fanout:	contains all connections in BOUND state
70  *	ipcl_proto_fanout:	IPv4 protocol fanout
71  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
72  *	ipcl_udp_fanout:	contains all UDP connections
73  *	ipcl_globalhash_fanout:	contains all connections
74  *
75  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
76  * which need to view all existing connections.
77  *
78  * All tables are protected by per-bucket locks. When both per-bucket lock and
79  * connection lock need to be held, the per-bucket lock should be acquired
80  * first, followed by the connection lock.
81  *
82  * All functions doing search in one of these tables increment a reference
83  * counter on the connection found (if any). This reference should be dropped
84  * when the caller has finished processing the connection.
85  *
86  *
87  * INTERFACES:
88  * ===========
89  *
90  * Connection Lookup:
91  * ------------------
92  *
93  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
94  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
95  *
96  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
97  * it can't find any associated connection. If the connection is found, its
98  * reference counter is incremented.
99  *
100  *	mp:	mblock, containing packet header. The full header should fit
101  *		into a single mblock. It should also contain at least full IP
102  *		and TCP or UDP header.
103  *
104  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
105  *
106  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
107  *		 the packet.
108  *
109  * 	zoneid: The zone in which the returned connection must be; the zoneid
110  *		corresponding to the ire_zoneid on the IRE located for the
111  *		packet's destination address.
112  *
113  *	For TCP connections, the lookup order is as follows:
114  *		5-tuple {src, dst, protocol, local port, remote port}
115  *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, local port, protocol} lookup in
117  *			ipcl_bind_fanout table.
118  *
119  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
120  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
122  *	to multiple UDP clients, which is handled in IP itself.
123  *
124  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125  * determine which actual zone gets the segment.  This is used only in a
126  * labeled environment.  The matching rules are:
127  *
128  *	- If it's not a multilevel port, then the label on the packet selects
129  *	  the zone.  Unlabeled packets are delivered to the global zone.
130  *
131  *	- If it's a multilevel port, then only the zone registered to receive
132  *	  packets on that port matches.
133  *
134  * Also, in a labeled environment, packet labels need to be checked.  For fully
135  * bound TCP connections, we can assume that the packet label was checked
136  * during connection establishment, and doesn't need to be checked on each
137  * packet.  For others, though, we need to check for strict equality or, for
138  * multilevel ports, membership in the range or set.  This part currently does
139  * a tnrh lookup on each packet, but could be optimized to use cached results
140  * if that were necessary.  (SCTP doesn't come through here, but if it did,
141  * we would apply the same rules as TCP.)
142  *
143  * An implication of the above is that fully-bound TCP sockets must always use
144  * distinct 4-tuples; they can't be discriminated by label alone.
145  *
146  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147  * as there's no connection set-up handshake and no shared state.
148  *
149  * Labels on looped-back packets within a single zone do not need to be
150  * checked, as all processes in the same zone have the same label.
151  *
152  * Finally, for unlabeled packets received by a labeled system, special rules
153  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
154  * socket in the zone whose label matches the default label of the sender, if
155  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156  * receiver's label must dominate the sender's default label.
157  *
158  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
159  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160  *					 ip_stack);
161  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164  *	ports are read from the IP and TCP header respectively.
165  *
166  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
167  *					 zoneid, ip_stack);
168  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169  *					 zoneid, ip_stack);
170  *
171  * 	Lookup routine to find a listener with the tuple {lport, laddr,
172  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173  * 	parameter interface index is also compared.
174  *
175  * void ipcl_walk(func, arg, ip_stack)
176  *
177  * 	Apply 'func' to every connection available. The 'func' is called as
178  *	(*func)(connp, arg). The walk is non-atomic so connections may be
179  *	created and destroyed during the walk. The CONN_CONDEMNED and
180  *	CONN_INCIPIENT flags ensure that connections which are newly created
181  *	or being destroyed are not selected by the walker.
182  *
183  * Table Updates
184  * -------------
185  *
186  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
187  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
188  *
189  *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
191  *		connp		conn_t to be inserted
192  *		protocol	connection protocol
193  *		src		source address
194  *		dst		destination address
195  *		ports		local and remote port
196  *		ifindex		interface index for IPv6 connections
197  *
198  *	Return value :
199  *		0		if connp was inserted
200  *		EADDRINUSE	if the connection with the same tuple
201  *				already exists.
202  *
203  * int ipcl_bind_insert(connp, protocol, src, lport);
204  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
205  *
206  * 	Insert 'connp' in ipcl_bind_fanout.
 * 	Arguments :
208  * 		connp		conn_t to be inserted
209  * 		protocol	connection protocol
210  * 		src		source address connection wants
211  * 				to bind to
212  * 		lport		local port connection wants to
213  * 				bind to
214  *
215  *
216  * void ipcl_hash_remove(connp);
217  *
218  * 	Removes the 'connp' from the connection fanout table.
219  *
220  * Connection Creation/Destruction
221  * -------------------------------
222  *
223  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
224  *
225  * 	Creates a new conn based on the type flag, inserts it into
226  * 	globalhash table.
227  *
228  *	type:	This flag determines the type of conn_t which needs to be
229  *		created i.e., which kmem_cache it comes from.
230  *		IPCL_TCPCONN	indicates a TCP connection
231  *		IPCL_SCTPCONN	indicates a SCTP connection
232  *		IPCL_UDPCONN	indicates a UDP conn_t.
233  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
234  *		IPCL_RTSCONN	indicates a RTS conn_t.
235  *		IPCL_IPCCONN	indicates all other connections.
236  *
237  * void ipcl_conn_destroy(connp)
238  *
239  * 	Destroys the connection state, removes it from the global
240  * 	connection hash table and frees its memory.
241  */
242 
243 #include <sys/types.h>
244 #include <sys/stream.h>
245 #include <sys/stropts.h>
246 #include <sys/sysmacros.h>
247 #include <sys/strsubr.h>
248 #include <sys/strsun.h>
249 #define	_SUN_TPI_VERSION 2
250 #include <sys/ddi.h>
251 #include <sys/cmn_err.h>
252 #include <sys/debug.h>
253 
254 #include <sys/systm.h>
255 #include <sys/param.h>
256 #include <sys/kmem.h>
257 #include <sys/isa_defs.h>
258 #include <inet/common.h>
259 #include <netinet/ip6.h>
260 #include <netinet/icmp6.h>
261 
262 #include <inet/ip.h>
263 #include <inet/ip6.h>
264 #include <inet/ip_ndp.h>
265 #include <inet/ip_impl.h>
266 #include <inet/udp_impl.h>
267 #include <inet/sctp_ip.h>
268 #include <inet/sctp/sctp_impl.h>
269 #include <inet/rawip_impl.h>
270 #include <inet/rts_impl.h>
271 
272 #include <sys/cpuvar.h>
273 
274 #include <inet/ipclassifier.h>
275 #include <inet/tcp.h>
276 #include <inet/ipsec_impl.h>
277 
278 #include <sys/tsol/tnet.h>
279 #include <sys/sockio.h>
280 
/*
 * Classifier tracing is compiled in only when the kernel is built with DEBUG.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

#ifdef	IPCL_DEBUG
/*
 * Bitmask selecting which trace classes are printed: IPCL_DEBUG_LVL() prints
 * only when (ipcl_debug_level & level) is non-zero.  The default of 0
 * silences all tracing.
 */
int	ipcl_debug_level = 0;
/*
 * 'args' is a complete, parenthesized printf() argument list, e.g.
 *	IPCL_DEBUG_LVL(4, ("connp %p", (void *)connp));
 * NOTE(review): the DEBUG expansion is a bare 'if' statement, so using it as
 * the unbraced body of an if/else would capture the following 'else'.
 */
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level  & level) { printf args; }
#else
/* Non-DEBUG kernels: expands to an empty block. */
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/*
 * Old value for compatibility. Settable in /etc/system.  Only consulted by
 * ipcl_init() when ipcl_conn_hash_size is zero.
 */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/*
 * Used by ipcl_init() when the size is chosen automatically: the requested
 * bucket count is (freemem * PAGESIZE) / ipcl_conn_hash_memfactor, capped at
 * ipcl_conn_hash_maxsize, before being mapped onto an entry of P2Ps().
 */
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;
308 
/*
 * Table of primes useful for sizing hash tables, indexed by N for N of 0-28:
 * entry N is the nearest prime <= 2^N - 2^(N-2).  Entries for N of 0-2 are 0
 * and a final 0 entry follows N = 28; ipcl_init() treats a 0 entry as
 * "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
318 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * The union is as large as its filler member, CACHE_ALIGN(conn_s) bytes.
 * ipcl_g_init() sizes the per-protocol caches as sizeof (itc_t) plus the
 * size of the protocol structure (tcp_t, udp_t, icmp_t, rts_t).
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
327 
/*
 * kmem caches used for conn_t allocation.  The non-extern ones are created in
 * ipcl_g_init(); the extern ones are defined outside this file.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
struct kmem_cache  *ip_helper_stream_cache;
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/*
 * Constructor/destructor pairs registered with kmem_cache_create() in
 * ipcl_g_init(), one pair per cache.
 */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);

static int	ip_helper_stream_constructor(void *, void *, int);
static void	ip_helper_stream_destructor(void *, void *);

/*
 * When B_TRUE, ipcl_g_init() creates ip_helper_stream_cache; otherwise that
 * cache pointer is left NULL.
 */
boolean_t	ip_use_helper_cache = B_TRUE;

/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
 */
extern void	(*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
		    uint8_t *, in_port_t, void *);
extern void	(*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
		    uint8_t *, in_port_t, void *);
369 
370 #ifdef	IPCL_DEBUG
#define	INET_NTOA_BUFSIZE	18

/*
 * Debug helper: write the IPv4 address 'in' into 'b' as dotted decimal and
 * return 'b'.  The four octets are printed in the order they sit in memory,
 * so an address held in network byte order prints most-significant octet
 * first.  'b' must be able to hold the longest result, "255.255.255.255"
 * plus the terminating NUL.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);

	return (b);
}
382 #endif
383 
384 /*
385  * Global (for all stack instances) init routine
386  */
387 void
388 ipcl_g_init(void)
389 {
390 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
391 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
392 	    ip_conn_constructor, ip_conn_destructor,
393 	    NULL, NULL, NULL, 0);
394 
395 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
396 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
397 	    tcp_conn_constructor, tcp_conn_destructor,
398 	    NULL, NULL, NULL, 0);
399 
400 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
401 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
402 	    udp_conn_constructor, udp_conn_destructor,
403 	    NULL, NULL, NULL, 0);
404 
405 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
406 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
407 	    rawip_conn_constructor, rawip_conn_destructor,
408 	    NULL, NULL, NULL, 0);
409 
410 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
411 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
412 	    rts_conn_constructor, rts_conn_destructor,
413 	    NULL, NULL, NULL, 0);
414 
415 	if (ip_use_helper_cache) {
416 		ip_helper_stream_cache = kmem_cache_create
417 		    ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
418 		    CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
419 		    ip_helper_stream_destructor, NULL, NULL, NULL, 0);
420 	} else {
421 		ip_helper_stream_cache = NULL;
422 	}
423 }
424 
425 /*
426  * ipclassifier intialization routine, sets up hash tables.
427  */
428 void
429 ipcl_init(ip_stack_t *ipst)
430 {
431 	int i;
432 	int sizes[] = P2Ps();
433 
434 	/*
435 	 * Calculate size of conn fanout table from /etc/system settings
436 	 */
437 	if (ipcl_conn_hash_size != 0) {
438 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
439 	} else if (tcp_conn_hash_size != 0) {
440 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
441 	} else {
442 		extern pgcnt_t freemem;
443 
444 		ipst->ips_ipcl_conn_fanout_size =
445 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
446 
447 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
448 			ipst->ips_ipcl_conn_fanout_size =
449 			    ipcl_conn_hash_maxsize;
450 		}
451 	}
452 
453 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
454 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
455 			break;
456 		}
457 	}
458 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
459 		/* Out of range, use the 2^16 value */
460 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
461 	}
462 
463 	/* Take values from /etc/system */
464 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
465 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
466 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
467 
468 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
469 
470 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
471 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
472 
473 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
474 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
475 		    MUTEX_DEFAULT, NULL);
476 	}
477 
478 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
479 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
480 
481 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
482 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
483 		    MUTEX_DEFAULT, NULL);
484 	}
485 
486 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
487 	    sizeof (connf_t), KM_SLEEP);
488 	for (i = 0; i < IPPROTO_MAX; i++) {
489 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
490 		    MUTEX_DEFAULT, NULL);
491 	}
492 
493 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
494 	    sizeof (connf_t), KM_SLEEP);
495 	for (i = 0; i < IPPROTO_MAX; i++) {
496 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
497 		    MUTEX_DEFAULT, NULL);
498 	}
499 
500 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
501 	mutex_init(&ipst->ips_rts_clients->connf_lock,
502 	    NULL, MUTEX_DEFAULT, NULL);
503 
504 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
505 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
506 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
507 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
508 		    MUTEX_DEFAULT, NULL);
509 	}
510 
511 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
512 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
513 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
514 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
515 		    MUTEX_DEFAULT, NULL);
516 	}
517 
518 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
519 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
520 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
521 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
522 		    NULL, MUTEX_DEFAULT, NULL);
523 	}
524 }
525 
526 void
527 ipcl_g_destroy(void)
528 {
529 	kmem_cache_destroy(ip_conn_cache);
530 	kmem_cache_destroy(tcp_conn_cache);
531 	kmem_cache_destroy(udp_conn_cache);
532 	kmem_cache_destroy(rawip_conn_cache);
533 	kmem_cache_destroy(rts_conn_cache);
534 }
535 
536 /*
537  * All user-level and kernel use of the stack must be gone
538  * by now.
539  */
540 void
541 ipcl_destroy(ip_stack_t *ipst)
542 {
543 	int i;
544 
545 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
546 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
547 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
548 	}
549 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
550 	    sizeof (connf_t));
551 	ipst->ips_ipcl_conn_fanout = NULL;
552 
553 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
554 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
555 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
556 	}
557 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
558 	    sizeof (connf_t));
559 	ipst->ips_ipcl_bind_fanout = NULL;
560 
561 	for (i = 0; i < IPPROTO_MAX; i++) {
562 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
563 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
564 	}
565 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
566 	ipst->ips_ipcl_proto_fanout = NULL;
567 
568 	for (i = 0; i < IPPROTO_MAX; i++) {
569 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
570 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
571 	}
572 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
573 	    IPPROTO_MAX * sizeof (connf_t));
574 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
575 
576 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
577 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
578 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
579 	}
580 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
581 	    sizeof (connf_t));
582 	ipst->ips_ipcl_udp_fanout = NULL;
583 
584 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
585 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
586 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
587 	}
588 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
589 	    sizeof (connf_t));
590 	ipst->ips_ipcl_raw_fanout = NULL;
591 
592 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
593 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
594 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
595 	}
596 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
597 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
598 	ipst->ips_ipcl_globalhash_fanout = NULL;
599 
600 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
601 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
602 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
603 	ipst->ips_rts_clients = NULL;
604 }
605 
606 /*
607  * conn creation routine. initialize the conn, sets the reference
608  * and inserts it in the global hash table.
609  */
610 conn_t *
611 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
612 {
613 	conn_t	*connp;
614 	sctp_stack_t *sctps;
615 	struct kmem_cache *conn_cache;
616 
617 	switch (type) {
618 	case IPCL_SCTPCONN:
619 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
620 			return (NULL);
621 		sctp_conn_init(connp);
622 		sctps = ns->netstack_sctp;
623 		SCTP_G_Q_REFHOLD(sctps);
624 		netstack_hold(ns);
625 		connp->conn_netstack = ns;
626 		return (connp);
627 
628 	case IPCL_TCPCONN:
629 		conn_cache = tcp_conn_cache;
630 		break;
631 
632 	case IPCL_UDPCONN:
633 		conn_cache = udp_conn_cache;
634 		break;
635 
636 	case IPCL_RAWIPCONN:
637 		conn_cache = rawip_conn_cache;
638 		break;
639 
640 	case IPCL_RTSCONN:
641 		conn_cache = rts_conn_cache;
642 		break;
643 
644 	case IPCL_IPCCONN:
645 		conn_cache = ip_conn_cache;
646 		break;
647 
648 	default:
649 		connp = NULL;
650 		ASSERT(0);
651 	}
652 
653 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
654 		return (NULL);
655 
656 	connp->conn_ref = 1;
657 	netstack_hold(ns);
658 	connp->conn_netstack = ns;
659 	ipcl_globalhash_insert(connp);
660 	return (connp);
661 }
662 
/*
 * Tear down 'connp' and return it to the kmem cache it came from.
 *
 * On entry the conn must have no references left (conn_ref == 0), no cached
 * IRE, and conn_lock must not be held by the caller.  The routine frees the
 * credentials, removes the conn from the global hash, releases the IPsec
 * latch/policy references and the netstack hold, and then resets conn_flags
 * to the bare type flag before freeing the conn to its cache.
 *
 * TCP conns get their own path, which also releases the tcp_t's resources
 * while preserving tcp_timercache and the tcp_connp back pointer across the
 * bzero() of the tcp_t.  SCTP conns are handed to sctp_free().
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_effective_cred != NULL) {
		crfree(connp->conn_effective_cred);
		connp->conn_effective_cred = NULL;
	}

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		/*
		 * The latch/policy references are only released here when the
		 * tcp_t still points at its stack; the ASSERTs further down
		 * check that both are NULL either way.
		 */
		if (tcps != NULL) {
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		tcp_free(tcp);
		/* Remember the timer mblk; it is restored after the bzero. */
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		if (tcp->tcp_iphc != NULL) {
			/*
			 * A header that was grown is freed with kmem_free();
			 * otherwise it is zeroed and returned to
			 * tcp_iphc_cache.
			 */
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		/* Put back the two fields that must survive the bzero. */
		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}
	/* Everything below handles the non-TCP conn types. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	/*
	 * SCTP conns are finished by sctp_free(); the netstack hold is not
	 * released in this function on that path.
	 */
	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}

	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {

		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		/*
		 * NOTE(review): the first ASSERT below follows the assignment
		 * of the very flag it tests, so it can never fail.
		 */
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
800 
801 /*
802  * Running in cluster mode - deregister listener information
803  */
804 
805 static void
806 ipcl_conn_unlisten(conn_t *connp)
807 {
808 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
809 	ASSERT(connp->conn_lport != 0);
810 
811 	if (cl_inet_unlisten != NULL) {
812 		sa_family_t	addr_family;
813 		uint8_t		*laddrp;
814 
815 		if (connp->conn_pkt_isv6) {
816 			addr_family = AF_INET6;
817 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
818 		} else {
819 			addr_family = AF_INET;
820 			laddrp = (uint8_t *)&connp->conn_bound_source;
821 		}
822 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
823 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
824 	}
825 	connp->conn_flags &= ~IPCL_CL_LISTENER;
826 }
827 
/*
 * Unlink 'connp' from the fanout bucket recorded in its conn_fanout, if any.
 * When conn_fanout is NULL the macro does nothing.  Otherwise, under the
 * bucket's connf_lock, it unlinks the conn from the doubly linked list,
 * clears its linkage, runs ipcl_conn_unlisten() for cluster listeners and
 * drops the reference with CONN_DEC_REF().
 *
 * The caller must not hold connp->conn_lock (asserted).  conn_fanout is read
 * before the bucket lock is taken.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
858 
/*
 * Function form of IPCL_HASH_REMOVE(): removes 'connp' from the fanout
 * bucket it is currently linked on, if any.  The macro asserts that
 * connp->conn_lock is not held and takes the bucket lock itself.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
864 
865 /*
866  * The whole purpose of this function is allow removal of
867  * a conn_t from the connected hash for timewait reclaim.
868  * This is essentially a TW reclaim fastpath where timewait
869  * collector checks under fanout lock (so no one else can
870  * get access to the conn_t) that refcnt is 2 i.e. one for
871  * TCP and one for the classifier hash list. If ref count
872  * is indeed 2, we can just remove the conn under lock and
873  * avoid cleaning up the conn under squeue. This gives us
874  * improved performance.
875  */
876 void
877 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
878 {
879 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
880 	ASSERT(MUTEX_HELD(&connp->conn_lock));
881 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
882 
883 	if ((connp)->conn_next != NULL) {
884 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
885 	}
886 	if ((connp)->conn_prev != NULL) {
887 		(connp)->conn_prev->conn_next = (connp)->conn_next;
888 	} else {
889 		connfp->connf_head = (connp)->conn_next;
890 	}
891 	(connp)->conn_fanout = NULL;
892 	(connp)->conn_next = NULL;
893 	(connp)->conn_prev = NULL;
894 	(connp)->conn_flags |= IPCL_REMOVED;
895 	ASSERT((connp)->conn_ref == 2);
896 	(connp)->conn_ref--;
897 }
898 
/*
 * Link 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference for the list with
 * CONN_INC_REF().  The conn must not currently be on any fanout list
 * (asserted).  This macro does no locking of the bucket itself;
 * IPCL_HASH_INSERT_CONNECTED() wraps it in connfp->connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
913 
/*
 * Unlocked entry point for a connected insertion: first runs
 * IPCL_HASH_REMOVE() on connp, then takes connfp->connf_lock, performs
 * IPCL_HASH_INSERT_CONNECTED_LOCKED() and releases the lock again.
 * Must not be used with connfp->connf_lock already held.
 */
914 #define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
915 	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
916 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
917 	IPCL_HASH_REMOVE((connp));					\
918 	mutex_enter(&(connfp)->connf_lock);				\
919 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
920 	mutex_exit(&(connfp)->connf_lock);				\
921 }
922 
/*
 * Insert connp into connfp as a "bound" entry (the callers in this file
 * use it for conns that have a specific local address but no remote one).
 *
 * connp is first taken through IPCL_HASH_REMOVE().  Then, under
 * connfp->connf_lock, the list is walked up to the first entry whose
 * conn_srcv6 satisfies _IPCL_V4_MATCH_ANY() and connp is linked in
 * immediately before that entry, or at the tail when there is no such
 * entry.  The conn is flagged IPCL_BOUND (clearing IPCL_REMOVED) and a
 * reference is taken for the list.
 *
 * The macro declares the locals pconnp and nconnp, so it must be expanded
 * in a scope where those names are free.  Every use of the connfp argument
 * is parenthesized so that an expression argument expands safely.
 */
923 #define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
924 	conn_t *pconnp = NULL, *nconnp;					\
925 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
926 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
927 	IPCL_HASH_REMOVE((connp));					\
928 	mutex_enter(&(connfp)->connf_lock);				\
929 	nconnp = (connfp)->connf_head;					\
930 	while (nconnp != NULL &&					\
931 	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
932 		pconnp = nconnp;					\
933 		nconnp = nconnp->conn_next;				\
934 	}								\
935 	if (pconnp != NULL) {						\
936 		pconnp->conn_next = (connp);				\
937 		(connp)->conn_prev = pconnp;				\
938 	} else {							\
939 		(connfp)->connf_head = (connp);				\
940 	}								\
941 	if (nconnp != NULL) {						\
942 		(connp)->conn_next = nconnp;				\
943 		nconnp->conn_prev = (connp);				\
944 	}								\
945 	(connp)->conn_fanout = (connfp);				\
946 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
947 	    IPCL_BOUND;							\
948 	CONN_INC_REF(connp);						\
949 	mutex_exit(&(connfp)->connf_lock);				\
950 }
951 
/*
 * Insert connp into connfp as a wildcard (no specific local address)
 * entry.
 *
 * connp is first taken through IPCL_HASH_REMOVE().  Then, under
 * connfp->connf_lock, the list is walked from the head:
 *   - if connp's conn_srcv6 is a v4-mapped address and an entry is found
 *     whose conn_srcv6 is the unspecified IPv6 address and whose zoneid
 *     equals connp's, connp is linked immediately before that entry;
 *   - otherwise connp is appended at the tail.
 * The conn is flagged IPCL_BOUND (clearing IPCL_REMOVED) and a reference
 * is taken for the list.
 *
 * On a well-formed list the "prev = next->conn_prev" assignment yields the
 * predecessor already tracked in prev; it is kept as in the original.
 *
 * The macro declares the locals list, prev, next and isv4mapped, so it
 * must be expanded in a scope where those names are free.  Every use of
 * the connp argument is parenthesized so that an expression argument
 * expands safely.
 */
952 #define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
953 	conn_t **list, *prev, *next;					\
954 	boolean_t isv4mapped =						\
955 	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
956 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
957 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
958 	IPCL_HASH_REMOVE((connp));					\
959 	mutex_enter(&(connfp)->connf_lock);				\
960 	list = &(connfp)->connf_head;					\
961 	prev = NULL;							\
962 	while ((next = *list) != NULL) {				\
963 		if (isv4mapped &&					\
964 		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
965 		    (connp)->conn_zoneid == next->conn_zoneid) {	\
966 			(connp)->conn_next = next;			\
967 			if (prev != NULL)				\
968 				prev = next->conn_prev;			\
969 			next->conn_prev = (connp);			\
970 			break;						\
971 		}							\
972 		list = &next->conn_next;				\
973 		prev = next;						\
974 	}								\
975 	(connp)->conn_prev = prev;					\
976 	*list = (connp);						\
977 	(connp)->conn_fanout = (connfp);				\
978 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
979 	    IPCL_BOUND;							\
980 	CONN_INC_REF((connp));						\
981 	mutex_exit(&(connfp)->connf_lock);				\
982 }
983 
/*
 * Function form of the IPCL_HASH_INSERT_WILDCARD() macro: inserts connp
 * into the fanout bucket connfp using the wildcard ordering rules of that
 * macro.  The macro acquires and releases connfp->connf_lock itself, so
 * the caller must not hold it.
 */
984 void
985 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
986 {
987 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
988 }
989 
/*
 * Record protocol as connp's ULP (conn_ulp) and insert connp into the
 * IPv4 per-protocol fanout bucket ips_ipcl_proto_fanout[protocol] using
 * wildcard insertion.
 *
 * connp must be non-NULL, and a MAC-exempt conn is only accepted for
 * IPPROTO_AH or IPPROTO_ESP; both conditions are ASSERTed.
 */
990 void
991 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
992 {
993 	connf_t	*connfp;
994 	ip_stack_t	*ipst;
995 
996 	ASSERT(connp != NULL);
997 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
998 	    protocol == IPPROTO_ESP);
999 
	/*
	 * Look up the IP stack only after the NULL check above, so the
	 * ASSERT can actually catch a bad connp before it is dereferenced.
	 */
	ipst = connp->conn_netstack->netstack_ip;
1000 	connp->conn_ulp = protocol;
1001 
1002 	/* Insert it in the protocol hash */
1003 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1004 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1005 }
1006 
/*
 * IPv6 counterpart of ipcl_proto_insert(): record protocol as connp's ULP
 * (conn_ulp) and insert connp into the IPv6 per-protocol fanout bucket
 * ips_ipcl_proto_fanout_v6[protocol] using wildcard insertion.
 *
 * connp must be non-NULL, and a MAC-exempt conn is only accepted for
 * IPPROTO_AH or IPPROTO_ESP; both conditions are ASSERTed.
 */
1007 void
1008 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
1009 {
1010 	connf_t	*connfp;
1011 	ip_stack_t	*ipst;
1012 
1013 	ASSERT(connp != NULL);
1014 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
1015 	    protocol == IPPROTO_ESP);
1016 
	/*
	 * Look up the IP stack only after the NULL check above, so the
	 * ASSERT can actually catch a bad connp before it is dereferenced.
	 */
	ipst = connp->conn_netstack->netstack_ip;
1017 	connp->conn_ulp = protocol;
1018 
1019 	/* Insert it in the IPv6 protocol hash */
1020 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1021 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1022 }
1023 
1024 /*
1025  * This function is used only for inserting SCTP raw socket now.
1026  * This may change later.
1027  *
1028  * Note that only one raw socket can be bound to a port.  The param
1029  * lport is in network byte order.
 *
 * The bucket is chosen from ips_ipcl_raw_fanout by hashing ntohs(lport).
 * Returns EADDRNOTAVAIL if the bucket already holds a conn with the same
 * lport, zone and address family whose source address either equals
 * connp's or where one of the two source addresses is a wildcard
 * (unspecified or v4-mapped any).  Otherwise connp is inserted and 0 is
 * returned.
 *
 * NOTE(review): connf_lock is released between the duplicate scan and the
 * insertion (the insert macros take the lock themselves), so two racing
 * callers could presumably both pass the scan -- confirm callers
 * serialize.
1030  */
1031 static int
1032 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1033 {
1034 	connf_t	*connfp;
1035 	conn_t	*oconnp;
1036 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1037 
1038 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1039 
1040 	/* Check for existing raw socket already bound to the port. */
1041 	mutex_enter(&connfp->connf_lock);
1042 	for (oconnp = connfp->connf_head; oconnp != NULL;
1043 	    oconnp = oconnp->conn_next) {
1044 		if (oconnp->conn_lport == lport &&
1045 		    oconnp->conn_zoneid == connp->conn_zoneid &&
1046 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
1047 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
1048 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
1049 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
1050 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
1051 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
1052 		    &connp->conn_srcv6))) {
1053 			break;
1054 		}
1055 	}
1056 	mutex_exit(&connfp->connf_lock);
	/* oconnp is non-NULL only if the scan stopped on a conflicting conn. */
1057 	if (oconnp != NULL)
1058 		return (EADDRNOTAVAIL);
1059 
	/*
	 * Pick the insertion flavour: no remote address means wildcard or
	 * bound (depending on whether a local address is set); a remote
	 * address means connected.
	 */
1060 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
1061 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
1062 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
1063 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
1064 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1065 		} else {
1066 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1067 		}
1068 	} else {
1069 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1070 	}
1071 	return (0);
1072 }
1073 
1074 /*
1075  * Check for a MAC exemption conflict on a labeled system.  Note that for
1076  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1077  * transport layer.  This check is for binding all other protocols.
1078  *
1079  * Returns true if there's a conflict.
 *
 * The scan covers the IPv4 per-protocol bucket
 * ips_ipcl_proto_fanout[connp->conn_ulp] and is done under that bucket's
 * connf_lock, which is released before returning.  connp's conn_ulp,
 * conn_af_isv6, conn_mac_exempt, conn_zoneid and conn_src must already be
 * set to the values being bound, since they are what is compared.
1080  */
1081 static boolean_t
1082 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1083 {
1084 	connf_t	*connfp;
1085 	conn_t *tconn;
1086 
1087 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1088 	mutex_enter(&connfp->connf_lock);
1089 	for (tconn = connfp->connf_head; tconn != NULL;
1090 	    tconn = tconn->conn_next) {
1091 		/* We don't allow v4 fallback for v6 raw socket */
1092 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1093 			continue;
1094 		/* If neither is exempt, then there's no conflict */
1095 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1096 			continue;
1097 		/* We are only concerned about sockets for a different zone */
1098 		if (connp->conn_zoneid == tconn->conn_zoneid)
1099 			continue;
1100 		/* If both are bound to different specific addrs, ok */
1101 		if (connp->conn_src != INADDR_ANY &&
1102 		    tconn->conn_src != INADDR_ANY &&
1103 		    connp->conn_src != tconn->conn_src)
1104 			continue;
1105 		/* These two conflict; fail */
1106 		break;
1107 	}
1108 	mutex_exit(&connfp->connf_lock);
	/* tconn is non-NULL only if the loop broke out on a conflict. */
1109 	return (tconn != NULL);
1110 }
1111 
/*
 * IPv6 counterpart of check_exempt_conflict_v4(): look for a MAC exemption
 * conflict between connp and the conns already present in the IPv6
 * per-protocol bucket for connp->conn_ulp.
 *
 * The IPv6 insertion paths in this file place such conns in
 * ips_ipcl_proto_fanout_v6, so that is the table that has to be scanned
 * here; scanning the IPv4 table would miss existing IPv6 bindings.
 *
 * Two conns conflict when they have the same address family, at least one
 * of them is MAC-exempt, they belong to different zones, and their source
 * addresses overlap (either is unspecified, or both are equal).
 *
 * The scan is done under the bucket's connf_lock, which is released
 * before returning.  Returns true if there's a conflict.
 */
1112 static boolean_t
1113 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1114 {
1115 	connf_t	*connfp;
1116 	conn_t *tconn;
1117 
1118 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_ulp];
1119 	mutex_enter(&connfp->connf_lock);
1120 	for (tconn = connfp->connf_head; tconn != NULL;
1121 	    tconn = tconn->conn_next) {
1122 		/* We don't allow v4 fallback for v6 raw socket */
1123 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1124 			continue;
1125 		/* If neither is exempt, then there's no conflict */
1126 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1127 			continue;
1128 		/* We are only concerned about sockets for a different zone */
1129 		if (connp->conn_zoneid == tconn->conn_zoneid)
1130 			continue;
1131 		/* If both are bound to different addrs, ok */
1132 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1133 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1134 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1135 			continue;
1136 		/* These two conflict; fail */
1137 		break;
1138 	}
1139 	mutex_exit(&connfp->connf_lock);
	/* tconn is non-NULL only if the loop broke out on a conflict. */
1140 	return (tconn != NULL);
1141 }
1142 
1143 /*
1144  * (v4, v6) bind hash insertion routines
 *
 * ipcl_bind_insert() stores protocol, the v4-mapped form of src, and lport
 * into connp and then inserts connp into the fanout table appropriate for
 * the protocol:
 *   - IPPROTO_TCP:  the bind fanout, hashed on lport.  If the cl_inet_listen
 *     hook is set, the conn is also flagged IPCL_CL_LISTENER and the hook
 *     is invoked.
 *   - IPPROTO_SCTP: handed to ipcl_sctp_hash_insert().
 *   - IPPROTO_UDP:  the UDP fanout, hashed on lport.
 *   - anything else: the IPv4 per-protocol fanout, after a MAC exemption
 *     conflict check on labeled systems.
 * For UDP and other protocols the insertion flavour (connected, bound or
 * wildcard) is chosen from conn_rem and conn_src.
 *
 * Returns 0 on success, EADDRINUSE on a MAC exemption conflict, or the
 * value returned by ipcl_sctp_hash_insert() for SCTP.
1145  */
1146 int
1147 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
1148 {
1149 	connf_t	*connfp;
1150 #ifdef	IPCL_DEBUG
1151 	char	buf[INET_NTOA_BUFSIZE];
1152 #endif
1153 	int	ret = 0;
1154 	ip_stack_t	*ipst;
1155 
1156 	ASSERT(connp);
	/*
	 * Look up the IP stack only after the check above, so the ASSERT
	 * can actually catch a NULL connp before it is dereferenced.
	 */
	ipst = connp->conn_netstack->netstack_ip;
1157 
1158 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
1159 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
1160 
1161 	connp->conn_ulp = protocol;
1162 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
1163 	connp->conn_lport = lport;
1164 
1165 	switch (protocol) {
1166 	default:
1167 		if (is_system_labeled() &&
1168 		    check_exempt_conflict_v4(connp, ipst))
1169 			return (EADDRINUSE);
1170 		/* FALLTHROUGH */
1171 	case IPPROTO_UDP:
1172 		if (protocol == IPPROTO_UDP) {
1173 			IPCL_DEBUG_LVL(64,
1174 			    ("ipcl_bind_insert: connp %p - udp\n",
1175 			    (void *)connp));
1176 			connfp = &ipst->ips_ipcl_udp_fanout[
1177 			    IPCL_UDP_HASH(lport, ipst)];
1178 		} else {
1179 			IPCL_DEBUG_LVL(64,
1180 			    ("ipcl_bind_insert: connp %p - protocol\n",
1181 			    (void *)connp));
1182 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1183 		}
1184 
1185 		if (connp->conn_rem != INADDR_ANY) {
1186 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1187 		} else if (connp->conn_src != INADDR_ANY) {
1188 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1189 		} else {
1190 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1191 		}
1192 		break;
1193 
1194 	case IPPROTO_TCP:
1195 
1196 		/* Insert it in the Bind Hash */
1197 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1198 		connfp = &ipst->ips_ipcl_bind_fanout[
1199 		    IPCL_BIND_HASH(lport, ipst)];
1200 		if (connp->conn_src != INADDR_ANY) {
1201 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1202 		} else {
1203 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1204 		}
1205 		if (cl_inet_listen != NULL) {
1206 			ASSERT(!connp->conn_pkt_isv6);
1207 			connp->conn_flags |= IPCL_CL_LISTENER;
1208 			(*cl_inet_listen)(
1209 			    connp->conn_netstack->netstack_stackid,
1210 			    IPPROTO_TCP, AF_INET,
1211 			    (uint8_t *)&connp->conn_bound_source, lport, NULL);
1212 		}
1213 		break;
1214 
1215 	case IPPROTO_SCTP:
1216 		ret = ipcl_sctp_hash_insert(connp, lport);
1217 		break;
1218 	}
1219 
1220 	return (ret);
1221 }
1222 
/*
 * IPv6 counterpart of ipcl_bind_insert(): stores protocol, *src and lport
 * into connp and inserts connp into the fanout table appropriate for the
 * protocol:
 *   - IPPROTO_TCP:  the bind fanout, hashed on lport.  If the cl_inet_listen
 *     hook is set, the conn is flagged IPCL_CL_LISTENER and the hook is
 *     invoked with AF_INET6 or AF_INET depending on conn_pkt_isv6.
 *   - IPPROTO_SCTP: handed to ipcl_sctp_hash_insert().
 *   - IPPROTO_UDP:  the UDP fanout, hashed on lport.
 *   - anything else: the IPv6 per-protocol fanout, after a MAC exemption
 *     conflict check on labeled systems.
 * For UDP and other protocols the insertion flavour (connected, bound or
 * wildcard) is chosen from conn_remv6 and conn_srcv6.
 *
 * Returns 0 on success, EADDRINUSE on a MAC exemption conflict, or the
 * value returned by ipcl_sctp_hash_insert() for SCTP.
 */
1223 int
1224 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1225     uint16_t lport)
1226 {
1227 	connf_t	*connfp;
1228 	int	ret = 0;
1229 	ip_stack_t	*ipst;
1230 
1231 	ASSERT(connp);
	/*
	 * Look up the IP stack only after the check above, so the ASSERT
	 * can actually catch a NULL connp before it is dereferenced.
	 */
	ipst = connp->conn_netstack->netstack_ip;
1232 
1233 	connp->conn_ulp = protocol;
1234 	connp->conn_srcv6 = *src;
1235 	connp->conn_lport = lport;
1236 
1237 	switch (protocol) {
1238 	default:
1239 		if (is_system_labeled() &&
1240 		    check_exempt_conflict_v6(connp, ipst))
1241 			return (EADDRINUSE);
1242 		/* FALLTHROUGH */
1243 	case IPPROTO_UDP:
1244 		if (protocol == IPPROTO_UDP) {
1245 			IPCL_DEBUG_LVL(128,
1246 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1247 			    (void *)connp));
1248 			connfp = &ipst->ips_ipcl_udp_fanout[
1249 			    IPCL_UDP_HASH(lport, ipst)];
1250 		} else {
1251 			IPCL_DEBUG_LVL(128,
1252 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1253 			    (void *)connp));
1254 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1255 		}
1256 
1257 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1258 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1259 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1260 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1261 		} else {
1262 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1263 		}
1264 		break;
1265 
1266 	case IPPROTO_TCP:
1267 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1268 
1269 		/* Insert it in the Bind Hash */
1270 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1271 		connfp = &ipst->ips_ipcl_bind_fanout[
1272 		    IPCL_BIND_HASH(lport, ipst)];
1273 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1274 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1275 		} else {
1276 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1277 		}
1278 		if (cl_inet_listen != NULL) {
1279 			sa_family_t	addr_family;
1280 			uint8_t		*laddrp;
1281 
1282 			if (connp->conn_pkt_isv6) {
1283 				addr_family = AF_INET6;
1284 				laddrp =
1285 				    (uint8_t *)&connp->conn_bound_source_v6;
1286 			} else {
1287 				addr_family = AF_INET;
1288 				laddrp = (uint8_t *)&connp->conn_bound_source;
1289 			}
1290 			connp->conn_flags |= IPCL_CL_LISTENER;
1291 			(*cl_inet_listen)(
1292 			    connp->conn_netstack->netstack_stackid,
1293 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1294 		}
1295 		break;
1296 
1297 	case IPPROTO_SCTP:
1298 		ret = ipcl_sctp_hash_insert(connp, lport);
1299 		break;
1300 	}
1301 
1302 	return (ret);
1303 }
1304 
1305 /*
1306  * ipcl_conn_hash insertion routines.
 *
 * ipcl_conn_insert() places connp into the fanout table appropriate for
 * protocol, using the IPv4 addresses src/rem and the 32-bit port pair
 * ports:
 *   - IPPROTO_TCP:  the conn fanout, hashed on conn_rem and conn_ports.
 *     The bucket is first searched for an existing conn with the same
 *     tuple and a matching zone; if one exists EADDRINUSE is returned.
 *   - IPPROTO_SCTP: removed from any current hash, then handed to
 *     ipcl_sctp_hash_insert() with the local port derived from the low 16
 *     bits of ntohl(ports).
 *   - IPPROTO_UDP:  the UDP fanout, hashed on the second 16-bit half of
 *     ports.
 *   - anything else: the IPv4 per-protocol fanout, after a MAC exemption
 *     conflict check on labeled systems (EADDRINUSE on conflict).
 *
 * Returns 0 on success, EADDRINUSE as described above, or the value
 * returned by ipcl_sctp_hash_insert() for SCTP.
1307  */
1308 int
1309 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
1310     ipaddr_t rem, uint32_t ports)
1311 {
1312 	connf_t		*connfp;
1313 	uint16_t	*up;
1314 	conn_t		*tconnp;
1315 #ifdef	IPCL_DEBUG
1316 	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1317 #endif
1318 	in_port_t	lport;
1319 	int		ret = 0;
1320 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1321 
1322 	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1323 	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1324 	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1325 	    ports, protocol));
1326 
1327 	switch (protocol) {
1328 	case IPPROTO_TCP:
1329 		if (!(connp->conn_flags & IPCL_EAGER)) {
1330 			/*
1331 			 * for a eager connection, i.e connections which
1332 			 * have just been created, the initialization is
1333 			 * already done in ip at conn_creation time, so
1334 			 * we can skip the checks here.
1335 			 */
1336 			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1337 		}
1338 
1339 		/*
1340 		 * For tcp, we check whether the connection tuple already
1341 		 * exists before allowing the connection to proceed.  We
1342 		 * also allow indexing on the zoneid. This is to allow
1343 		 * multiple shared stack zones to have the same tcp
1344 		 * connection tuple. In practice this only happens for
1345 		 * INADDR_LOOPBACK as it's the only local address which
1346 		 * doesn't have to be unique.
1347 		 */
1348 		connfp = &ipst->ips_ipcl_conn_fanout[
1349 		    IPCL_CONN_HASH(connp->conn_rem,
1350 		    connp->conn_ports, ipst)];
1351 		mutex_enter(&connfp->connf_lock);
1352 		for (tconnp = connfp->connf_head; tconnp != NULL;
1353 		    tconnp = tconnp->conn_next) {
1354 			if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1355 			    connp->conn_rem, connp->conn_src,
1356 			    connp->conn_ports)) &&
1357 			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
1358 
1359 				/* Already have a conn. bail out */
1360 				mutex_exit(&connfp->connf_lock);
1361 				return (EADDRINUSE);
1362 			}
1363 		}
1364 		if (connp->conn_fanout != NULL) {
1365 			/*
1366 			 * Probably a XTI/TLI application trying to do a
1367 			 * rebind. Let it happen.
			 *
			 * connfp's lock is dropped around the removal and
			 * then re-taken.
			 * NOTE(review): the duplicate scan above is not
			 * repeated after the lock is re-acquired -- confirm
			 * that callers serialize against concurrent inserts
			 * of the same tuple.
1368 			 */
1369 			mutex_exit(&connfp->connf_lock);
1370 			IPCL_HASH_REMOVE(connp);
1371 			mutex_enter(&connfp->connf_lock);
1372 		}
1373 
1374 		ASSERT(connp->conn_recv != NULL);
1375 
1376 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1377 		mutex_exit(&connfp->connf_lock);
1378 		break;
1379 
1380 	case IPPROTO_SCTP:
1381 		/*
1382 		 * The raw socket may have already been bound, remove it
1383 		 * from the hash first.
1384 		 */
1385 		IPCL_HASH_REMOVE(connp);
1386 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1387 		ret = ipcl_sctp_hash_insert(connp, lport);
1388 		break;
1389 
1390 	default:
1391 		/*
1392 		 * Check for conflicts among MAC exempt bindings.  For
1393 		 * transports with port numbers, this is done by the upper
1394 		 * level per-transport binding logic.  For all others, it's
1395 		 * done here.
		 *
		 * NOTE(review): this check reads connp->conn_ulp and
		 * connp->conn_src before IPCL_CONN_INIT() below is applied
		 * with the new protocol/src; presumably the conn already
		 * carries the right values from an earlier bind -- confirm.
1396 		 */
1397 		if (is_system_labeled() &&
1398 		    check_exempt_conflict_v4(connp, ipst))
1399 			return (EADDRINUSE);
1400 		/* FALLTHROUGH */
1401 
1402 	case IPPROTO_UDP:
		/* View ports as two 16-bit halves; up[1] feeds the UDP hash. */
1403 		up = (uint16_t *)&ports;
1404 		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1405 		if (protocol == IPPROTO_UDP) {
1406 			connfp = &ipst->ips_ipcl_udp_fanout[
1407 			    IPCL_UDP_HASH(up[1], ipst)];
1408 		} else {
1409 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1410 		}
1411 
1412 		if (connp->conn_rem != INADDR_ANY) {
1413 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1414 		} else if (connp->conn_src != INADDR_ANY) {
1415 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1416 		} else {
1417 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1418 		}
1419 		break;
1420 	}
1421 
1422 	return (ret);
1423 }
1424 
/*
 * IPv6 counterpart of ipcl_conn_insert().  Places connp into the fanout
 * table appropriate for protocol, using the addresses *src / *rem and the
 * 32-bit port pair ports:
 *   - IPPROTO_TCP:  the conn fanout, hashed on conn_remv6 and conn_ports.
 *     The bucket is first searched for an existing conn with the same
 *     tuple, a matching zone, and a tcp_bound_if that is either 0 or equal
 *     to ifindex; if one exists EADDRINUSE is returned.
 *   - IPPROTO_SCTP: removed from any current hash, then handed to
 *     ipcl_sctp_hash_insert() with the local port derived from the low 16
 *     bits of ntohl(ports).
 *   - IPPROTO_UDP:  the UDP fanout, hashed on the second 16-bit half of
 *     ports.
 *   - anything else: the IPv6 per-protocol fanout, after a MAC exemption
 *     conflict check on labeled systems (EADDRINUSE on conflict).
 * ifindex is only consulted in the TCP duplicate check.
 *
 * Returns 0 on success, EADDRINUSE as described above, or the value
 * returned by ipcl_sctp_hash_insert() for SCTP.
 */
1425 int
1426 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1427     const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1428 {
1429 	connf_t		*connfp;
1430 	uint16_t	*up;
1431 	conn_t		*tconnp;
1432 	in_port_t	lport;
1433 	int		ret = 0;
1434 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1435 
1436 	switch (protocol) {
1437 	case IPPROTO_TCP:
1438 		/* Just need to insert a conn struct */
1439 		if (!(connp->conn_flags & IPCL_EAGER)) {
1440 			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1441 		}
1442 
1443 		/*
1444 		 * For tcp, we check whether the connection tuple already
1445 		 * exists before allowing the connection to proceed.  We
1446 		 * also allow indexing on the zoneid. This is to allow
1447 		 * multiple shared stack zones to have the same tcp
1448 		 * connection tuple. In practice this only happens for
1449 		 * ipv6_loopback as it's the only local address which
1450 		 * doesn't have to be unique.
1451 		 */
1452 		connfp = &ipst->ips_ipcl_conn_fanout[
1453 		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
1454 		    ipst)];
1455 		mutex_enter(&connfp->connf_lock);
1456 		for (tconnp = connfp->connf_head; tconnp != NULL;
1457 		    tconnp = tconnp->conn_next) {
1458 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1459 			    connp->conn_remv6, connp->conn_srcv6,
1460 			    connp->conn_ports) &&
1461 			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1462 			    tconnp->conn_tcp->tcp_bound_if == ifindex) &&
1463 			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
1464 				/* Already have a conn. bail out */
1465 				mutex_exit(&connfp->connf_lock);
1466 				return (EADDRINUSE);
1467 			}
1468 		}
1469 		if (connp->conn_fanout != NULL) {
1470 			/*
1471 			 * Probably a XTI/TLI application trying to do a
1472 			 * rebind. Let it happen.
			 *
			 * connfp's lock is dropped around the removal and
			 * then re-taken.
			 * NOTE(review): the duplicate scan above is not
			 * repeated after the lock is re-acquired -- confirm
			 * that callers serialize against concurrent inserts
			 * of the same tuple.
1473 			 */
1474 			mutex_exit(&connfp->connf_lock);
1475 			IPCL_HASH_REMOVE(connp);
1476 			mutex_enter(&connfp->connf_lock);
1477 		}
1478 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1479 		mutex_exit(&connfp->connf_lock);
1480 		break;
1481 
1482 	case IPPROTO_SCTP:
		/* Take the conn off any hash it is on before re-inserting. */
1483 		IPCL_HASH_REMOVE(connp);
1484 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1485 		ret = ipcl_sctp_hash_insert(connp, lport);
1486 		break;
1487 
1488 	default:
		/*
		 * MAC exemption conflict check for protocols without ports.
		 * NOTE(review): this reads connp's current conn_ulp and
		 * conn_srcv6 before IPCL_CONN_INIT_V6() below is applied
		 * with the new values -- confirm that is intended.
		 */
1489 		if (is_system_labeled() &&
1490 		    check_exempt_conflict_v6(connp, ipst))
1491 			return (EADDRINUSE);
1492 		/* FALLTHROUGH */
1493 	case IPPROTO_UDP:
		/* View ports as two 16-bit halves; up[1] feeds the UDP hash. */
1494 		up = (uint16_t *)&ports;
1495 		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1496 		if (protocol == IPPROTO_UDP) {
1497 			connfp = &ipst->ips_ipcl_udp_fanout[
1498 			    IPCL_UDP_HASH(up[1], ipst)];
1499 		} else {
1500 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1501 		}
1502 
1503 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1504 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1505 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1506 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1507 		} else {
1508 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1509 		}
1510 		break;
1511 	}
1512 
1513 	return (ret);
1514 }
1515 
1516 /*
1517  * v4 packet classifying function. looks up the fanout table to
1518  * find the conn, the packet belongs to. returns the conn with
1519  * the reference held, null otherwise.
1520  *
1521  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1522  * Lookup" comment block are applied.  Labels are also checked as described
1523  * above.  If the packet is from the inside (looped back), and is from the same
1524  * zone, then label checks are omitted.
 *
 * mp->b_rptr must point at the IPv4 header and hdr_len is the offset from
 * there to the transport header; the ports are read at
 * hdr_len + TCP_PORTS_OFFSET, with the first 16-bit word used as the
 * foreign port and the second as the local port.  Only IPPROTO_TCP and
 * IPPROTO_UDP are handled; any other protocol yields NULL.  The caller
 * is responsible for releasing the reference on a returned conn.
1525  */
1526 conn_t *
1527 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
1528     ip_stack_t *ipst)
1529 {
1530 	ipha_t	*ipha;
1531 	connf_t	*connfp, *bind_connfp;
1532 	uint16_t lport;
1533 	uint16_t fport;
1534 	uint32_t ports;
1535 	conn_t	*connp;
1536 	uint16_t  *up;
1537 	boolean_t shared_addr;
1538 	boolean_t unlabeled;
1539 
1540 	ipha = (ipha_t *)mp->b_rptr;
1541 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1542 
1543 	switch (protocol) {
1544 	case IPPROTO_TCP:
		/*
		 * Step 1: look for a fully specified connection in the conn
		 * fanout, hashed on the packet's source address and the
		 * combined 32-bit port pair.
		 */
1545 		ports = *(uint32_t *)up;
1546 		connfp =
1547 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1548 		    ports, ipst)];
1549 		mutex_enter(&connfp->connf_lock);
1550 		for (connp = connfp->connf_head; connp != NULL;
1551 		    connp = connp->conn_next) {
1552 			if ((IPCL_CONN_MATCH(connp, protocol,
1553 			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
1554 			    (IPCL_ZONE_MATCH(connp, zoneid))) {
1555 				break;
1556 			}
1557 		}
1558 
1559 		if (connp != NULL) {
1560 			/*
1561 			 * We have a fully-bound TCP connection.
1562 			 *
1563 			 * For labeled systems, there's no need to check the
1564 			 * label here.  It's known to be good as we checked
1565 			 * before allowing the connection to become bound.
1566 			 */
1567 			CONN_INC_REF(connp);
1568 			mutex_exit(&connfp->connf_lock);
1569 			return (connp);
1570 		}
1571 
1572 		mutex_exit(&connfp->connf_lock);
1573 
		/*
		 * Step 2: no connected match; fall back to the bind fanout,
		 * which is hashed on the local port only.
		 */
1574 		lport = up[1];
1575 		unlabeled = B_FALSE;
1576 		/* Cred cannot be null on IPv4 */
1577 		if (is_system_labeled()) {
1578 			cred_t *cr = msg_getcred(mp, NULL);
1579 			ASSERT(cr != NULL);
1580 			unlabeled = (crgetlabel(cr)->tsl_flags &
1581 			    TSLF_UNLABELED) != 0;
1582 		}
1583 		shared_addr = (zoneid == ALL_ZONES);
1584 		if (shared_addr) {
1585 			/*
1586 			 * No need to handle exclusive-stack zones since
1587 			 * ALL_ZONES only applies to the shared stack.
1588 			 */
1589 			zoneid = tsol_mlp_findzone(protocol, lport);
1590 			/*
1591 			 * If no shared MLP is found, tsol_mlp_findzone returns
1592 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1593 			 * search for the zone based on the packet label.
1594 			 *
1595 			 * If there is such a zone, we prefer to find a
1596 			 * connection in it.  Otherwise, we look for a
1597 			 * MAC-exempt connection in any zone whose label
1598 			 * dominates the default label on the packet.
1599 			 */
1600 			if (zoneid == ALL_ZONES)
1601 				zoneid = tsol_packet_to_zoneid(mp);
1602 			else
1603 				unlabeled = B_FALSE;
1604 		}
1605 
1606 		bind_connfp =
1607 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1608 		mutex_enter(&bind_connfp->connf_lock);
1609 		for (connp = bind_connfp->connf_head; connp != NULL;
1610 		    connp = connp->conn_next) {
			/*
			 * Accept a conn in the resolved zone, or a MAC-exempt
			 * conn when the packet is unlabeled and arrived on a
			 * shared address.
			 */
1611 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1612 			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
1613 			    (unlabeled && connp->conn_mac_exempt &&
1614 			    shared_addr)))
1615 				break;
1616 		}
1617 
1618 		/*
1619 		 * If the matching connection is SLP on a private address, then
1620 		 * the label on the packet must match the local zone's label.
1621 		 * Otherwise, it must be in the label range defined by tnrh.
1622 		 * This is ensured by tsol_receive_label.
1623 		 */
1624 		if (connp != NULL && is_system_labeled() &&
1625 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1626 		    shared_addr, connp)) {
1627 				DTRACE_PROBE3(
1628 				    tx__ip__log__info__classify__tcp,
1629 				    char *,
1630 				    "connp(1) could not receive mp(2)",
1631 				    conn_t *, connp, mblk_t *, mp);
1632 			connp = NULL;
1633 		}
1634 
1635 		if (connp != NULL) {
1636 			/* Have a listener at least */
1637 			CONN_INC_REF(connp);
1638 			mutex_exit(&bind_connfp->connf_lock);
1639 			return (connp);
1640 		}
1641 
1642 		mutex_exit(&bind_connfp->connf_lock);
1643 
1644 		IPCL_DEBUG_LVL(512,
1645 		    ("ipcl_classify: couldn't classify mp = %p\n",
1646 		    (void *)mp));
1647 		break;
1648 
1649 	case IPPROTO_UDP:
		/*
		 * UDP uses a single fanout hashed on the local port; the
		 * zone/label resolution mirrors the TCP listener path above.
		 */
1650 		lport = up[1];
1651 		unlabeled = B_FALSE;
1652 		/* Cred cannot be null on IPv4 */
1653 		if (is_system_labeled()) {
1654 			cred_t *cr = msg_getcred(mp, NULL);
1655 			ASSERT(cr != NULL);
1656 			unlabeled = (crgetlabel(cr)->tsl_flags &
1657 			    TSLF_UNLABELED) != 0;
1658 		}
1659 		shared_addr = (zoneid == ALL_ZONES);
1660 		if (shared_addr) {
1661 			/*
1662 			 * No need to handle exclusive-stack zones since
1663 			 * ALL_ZONES only applies to the shared stack.
1664 			 */
1665 			zoneid = tsol_mlp_findzone(protocol, lport);
1666 			/*
1667 			 * If no shared MLP is found, tsol_mlp_findzone returns
1668 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1669 			 * search for the zone based on the packet label.
1670 			 *
1671 			 * If there is such a zone, we prefer to find a
1672 			 * connection in it.  Otherwise, we look for a
1673 			 * MAC-exempt connection in any zone whose label
1674 			 * dominates the default label on the packet.
1675 			 */
1676 			if (zoneid == ALL_ZONES)
1677 				zoneid = tsol_packet_to_zoneid(mp);
1678 			else
1679 				unlabeled = B_FALSE;
1680 		}
1681 		fport = up[0];
1682 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1683 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1684 		mutex_enter(&connfp->connf_lock);
1685 		for (connp = connfp->connf_head; connp != NULL;
1686 		    connp = connp->conn_next) {
1687 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1688 			    fport, ipha->ipha_src) &&
1689 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1690 			    (unlabeled && connp->conn_mac_exempt &&
1691 			    shared_addr)))
1692 				break;
1693 		}
1694 
		/* On labeled systems the candidate must pass the label check. */
1695 		if (connp != NULL && is_system_labeled() &&
1696 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1697 		    shared_addr, connp)) {
1698 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1699 			    char *, "connp(1) could not receive mp(2)",
1700 			    conn_t *, connp, mblk_t *, mp);
1701 			connp = NULL;
1702 		}
1703 
1704 		if (connp != NULL) {
1705 			CONN_INC_REF(connp);
1706 			mutex_exit(&connfp->connf_lock);
1707 			return (connp);
1708 		}
1709 
1710 		/*
1711 		 * We shouldn't come here for multicast/broadcast packets
1712 		 */
1713 		mutex_exit(&connfp->connf_lock);
1714 		IPCL_DEBUG_LVL(512,
1715 		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1716 		    lport, fport));
1717 		break;
1718 	}
1719 
	/* No match, or a protocol other than TCP/UDP. */
1720 	return (NULL);
1721 }
1722 
/*
 * v6 packet classifying function; the IPv6 counterpart of
 * ipcl_classify_v4().  Looks up the fanout tables to find the conn the
 * packet belongs to and returns it with a reference held, or NULL if no
 * conn matches.
 *
 * mp->b_rptr must point at the IPv6 header and hdr_len is the offset from
 * there to the transport header.  Only IPPROTO_TCP and IPPROTO_UDP are
 * handled; any other protocol yields NULL.  If zoneid is ALL_ZONES the
 * zone is resolved from the local port (shared MLP) or from the packet
 * label before the bind/UDP lookup.  The caller is responsible for
 * releasing the reference on a returned conn.
 */
1723 conn_t *
1724 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
1725     ip_stack_t *ipst)
1726 {
1727 	ip6_t		*ip6h;
1728 	connf_t		*connfp, *bind_connfp;
1729 	uint16_t	lport;
1730 	uint16_t	fport;
1731 	tcph_t		*tcph;
1732 	uint32_t	ports;
1733 	conn_t		*connp;
1734 	uint16_t	*up;
1735 	boolean_t	shared_addr;
1736 	boolean_t	unlabeled;
1737 
1738 	ip6h = (ip6_t *)mp->b_rptr;
1739 
1740 	switch (protocol) {
1741 	case IPPROTO_TCP:
		/*
		 * Step 1: look for a fully specified connection in the conn
		 * fanout, hashed on the packet's source address and the
		 * combined 32-bit port pair read from the TCP header.
		 */
1742 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1743 		up = (uint16_t *)tcph->th_lport;
1744 		ports = *(uint32_t *)up;
1745 
1746 		connfp =
1747 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1748 		    ports, ipst)];
1749 		mutex_enter(&connfp->connf_lock);
1750 		for (connp = connfp->connf_head; connp != NULL;
1751 		    connp = connp->conn_next) {
1752 			if ((IPCL_CONN_MATCH_V6(connp, protocol,
1753 			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
1754 			    (IPCL_ZONE_MATCH(connp, zoneid))) {
1755 				break;
1756 			}
1757 		}
1758 
1759 		if (connp != NULL) {
1760 			/*
1761 			 * We have a fully-bound TCP connection.
1762 			 *
1763 			 * For labeled systems, there's no need to check the
1764 			 * label here.  It's known to be good as we checked
1765 			 * before allowing the connection to become bound.
1766 			 */
1767 			CONN_INC_REF(connp);
1768 			mutex_exit(&connfp->connf_lock);
1769 			return (connp);
1770 		}
1771 
1772 		mutex_exit(&connfp->connf_lock);
1773 
		/*
		 * Step 2: no connected match; fall back to the bind fanout,
		 * which is hashed on the local port only.
		 */
1774 		lport = up[1];
1775 		unlabeled = B_FALSE;
1776 		/* Cred can be null on IPv6 */
1777 		if (is_system_labeled()) {
1778 			cred_t *cr = msg_getcred(mp, NULL);
1779 
1780 			unlabeled = (cr != NULL &&
1781 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1782 		}
1783 		shared_addr = (zoneid == ALL_ZONES);
1784 		if (shared_addr) {
1785 			/*
1786 			 * No need to handle exclusive-stack zones since
1787 			 * ALL_ZONES only applies to the shared stack.
1788 			 */
1789 			zoneid = tsol_mlp_findzone(protocol, lport);
1790 			/*
1791 			 * If no shared MLP is found, tsol_mlp_findzone returns
1792 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1793 			 * search for the zone based on the packet label.
1794 			 *
1795 			 * If there is such a zone, we prefer to find a
1796 			 * connection in it.  Otherwise, we look for a
1797 			 * MAC-exempt connection in any zone whose label
1798 			 * dominates the default label on the packet.
1799 			 */
1800 			if (zoneid == ALL_ZONES)
1801 				zoneid = tsol_packet_to_zoneid(mp);
1802 			else
1803 				unlabeled = B_FALSE;
1804 		}
1805 
1806 		bind_connfp =
1807 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1808 		mutex_enter(&bind_connfp->connf_lock);
1809 		for (connp = bind_connfp->connf_head; connp != NULL;
1810 		    connp = connp->conn_next) {
			/*
			 * Accept a conn in the resolved zone, or a MAC-exempt
			 * conn when the packet is unlabeled and arrived on a
			 * shared address.
			 */
1811 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1812 			    ip6h->ip6_dst, lport) &&
1813 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1814 			    (unlabeled && connp->conn_mac_exempt &&
1815 			    shared_addr)))
1816 				break;
1817 		}
1818 
		/* On labeled systems the candidate must pass the label check. */
1819 		if (connp != NULL && is_system_labeled() &&
1820 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1821 		    shared_addr, connp)) {
1822 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1823 			    char *, "connp(1) could not receive mp(2)",
1824 			    conn_t *, connp, mblk_t *, mp);
1825 			connp = NULL;
1826 		}
1827 
1828 		if (connp != NULL) {
1829 			/* Have a listener at least */
1830 			CONN_INC_REF(connp);
1831 			mutex_exit(&bind_connfp->connf_lock);
1832 			IPCL_DEBUG_LVL(512,
1833 			    ("ipcl_classify_v6: found listner "
1834 			    "connp = %p\n", (void *)connp));
1835 
1836 			return (connp);
1837 		}
1838 
1839 		mutex_exit(&bind_connfp->connf_lock);
1840 
1841 		IPCL_DEBUG_LVL(512,
1842 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1843 		    (void *)mp));
1844 		break;
1845 
1846 	case IPPROTO_UDP:
		/*
		 * UDP uses a single fanout hashed on the local port.  The
		 * ports are read directly at hdr_len: first word foreign,
		 * second word local.
		 */
1847 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1848 		lport = up[1];
1849 		unlabeled = B_FALSE;
1850 		/* Cred can be null on IPv6 */
1851 		if (is_system_labeled()) {
1852 			cred_t *cr = msg_getcred(mp, NULL);
1853 
1854 			unlabeled = (cr != NULL &&
1855 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1856 		}
1857 		shared_addr = (zoneid == ALL_ZONES);
1858 		if (shared_addr) {
1859 			/*
1860 			 * No need to handle exclusive-stack zones since
1861 			 * ALL_ZONES only applies to the shared stack.
1862 			 */
1863 			zoneid = tsol_mlp_findzone(protocol, lport);
1864 			/*
1865 			 * If no shared MLP is found, tsol_mlp_findzone returns
1866 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1867 			 * search for the zone based on the packet label.
1868 			 *
1869 			 * If there is such a zone, we prefer to find a
1870 			 * connection in it.  Otherwise, we look for a
1871 			 * MAC-exempt connection in any zone whose label
1872 			 * dominates the default label on the packet.
1873 			 */
1874 			if (zoneid == ALL_ZONES)
1875 				zoneid = tsol_packet_to_zoneid(mp);
1876 			else
1877 				unlabeled = B_FALSE;
1878 		}
1879 
1880 		fport = up[0];
1881 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1882 		    fport));
1883 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1884 		mutex_enter(&connfp->connf_lock);
1885 		for (connp = connfp->connf_head; connp != NULL;
1886 		    connp = connp->conn_next) {
1887 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1888 			    fport, ip6h->ip6_src) &&
1889 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1890 			    (unlabeled && connp->conn_mac_exempt &&
1891 			    shared_addr)))
1892 				break;
1893 		}
1894 
		/* On labeled systems the candidate must pass the label check. */
1895 		if (connp != NULL && is_system_labeled() &&
1896 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1897 		    shared_addr, connp)) {
1898 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1899 			    char *, "connp(1) could not receive mp(2)",
1900 			    conn_t *, connp, mblk_t *, mp);
1901 			connp = NULL;
1902 		}
1903 
1904 		if (connp != NULL) {
1905 			CONN_INC_REF(connp);
1906 			mutex_exit(&connfp->connf_lock);
1907 			return (connp);
1908 		}
1909 
1910 		/*
1911 		 * We shouldn't come here for multicast/broadcast packets
1912 		 */
1913 		mutex_exit(&connfp->connf_lock);
1914 		IPCL_DEBUG_LVL(512,
1915 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1916 		    lport, fport));
1917 		break;
1918 	}
1919 
	/* No match, or a protocol other than TCP/UDP. */
1920 	return (NULL);
1921 }
1922 
1923 /*
1924  * wrapper around ipcl_classify_(v4,v6) routines.
1925  */
1926 conn_t *
1927 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1928 {
1929 	uint16_t	hdr_len;
1930 	ipha_t		*ipha;
1931 	uint8_t		*nexthdrp;
1932 
1933 	if (MBLKL(mp) < sizeof (ipha_t))
1934 		return (NULL);
1935 
1936 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1937 	case IPV4_VERSION:
1938 		ipha = (ipha_t *)mp->b_rptr;
1939 		hdr_len = IPH_HDR_LENGTH(ipha);
1940 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1941 		    zoneid, ipst));
1942 	case IPV6_VERSION:
1943 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1944 		    &hdr_len, &nexthdrp))
1945 			return (NULL);
1946 
1947 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
1948 	}
1949 
1950 	return (NULL);
1951 }
1952 
1953 conn_t *
1954 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
1955     uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
1956 {
1957 	connf_t		*connfp;
1958 	conn_t		*connp;
1959 	in_port_t	lport;
1960 	int		af;
1961 	boolean_t	shared_addr;
1962 	boolean_t	unlabeled;
1963 	const void	*dst;
1964 
1965 	lport = ((uint16_t *)&ports)[1];
1966 
1967 	unlabeled = B_FALSE;
1968 	/* Cred can be null on IPv6 */
1969 	if (is_system_labeled()) {
1970 		cred_t *cr = msg_getcred(mp, NULL);
1971 
1972 		unlabeled = (cr != NULL &&
1973 		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1974 	}
1975 	shared_addr = (zoneid == ALL_ZONES);
1976 	if (shared_addr) {
1977 		/*
1978 		 * No need to handle exclusive-stack zones since ALL_ZONES
1979 		 * only applies to the shared stack.
1980 		 */
1981 		zoneid = tsol_mlp_findzone(protocol, lport);
1982 		/*
1983 		 * If no shared MLP is found, tsol_mlp_findzone returns
1984 		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
1985 		 * the zone based on the packet label.
1986 		 *
1987 		 * If there is such a zone, we prefer to find a connection in
1988 		 * it.  Otherwise, we look for a MAC-exempt connection in any
1989 		 * zone whose label dominates the default label on the packet.
1990 		 */
1991 		if (zoneid == ALL_ZONES)
1992 			zoneid = tsol_packet_to_zoneid(mp);
1993 		else
1994 			unlabeled = B_FALSE;
1995 	}
1996 
1997 	af = IPH_HDR_VERSION(hdr);
1998 	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
1999 	    (const void *)&((ip6_t *)hdr)->ip6_dst;
2000 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
2001 
2002 	mutex_enter(&connfp->connf_lock);
2003 	for (connp = connfp->connf_head; connp != NULL;
2004 	    connp = connp->conn_next) {
2005 		/* We don't allow v4 fallback for v6 raw socket. */
2006 		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
2007 		    IPV6_VERSION))
2008 			continue;
2009 		if (connp->conn_fully_bound) {
2010 			if (af == IPV4_VERSION) {
2011 				if (!IPCL_CONN_MATCH(connp, protocol,
2012 				    hdr->ipha_src, hdr->ipha_dst, ports))
2013 					continue;
2014 			} else {
2015 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
2016 				    ((ip6_t *)hdr)->ip6_src,
2017 				    ((ip6_t *)hdr)->ip6_dst, ports))
2018 					continue;
2019 			}
2020 		} else {
2021 			if (af == IPV4_VERSION) {
2022 				if (!IPCL_BIND_MATCH(connp, protocol,
2023 				    hdr->ipha_dst, lport))
2024 					continue;
2025 			} else {
2026 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
2027 				    ((ip6_t *)hdr)->ip6_dst, lport))
2028 					continue;
2029 			}
2030 		}
2031 
2032 		if (IPCL_ZONE_MATCH(connp, zoneid) ||
2033 		    (unlabeled && connp->conn_mac_exempt && shared_addr))
2034 			break;
2035 	}
2036 	/*
2037 	 * If the connection is fully-bound and connection-oriented (TCP or
2038 	 * SCTP), then we've already validated the remote system's label.
2039 	 * There's no need to do it again for every packet.
2040 	 */
2041 	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
2042 	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
2043 	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
2044 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
2045 		    char *, "connp(1) could not receive mp(2)",
2046 		    conn_t *, connp, mblk_t *, mp);
2047 		connp = NULL;
2048 	}
2049 
2050 	if (connp != NULL)
2051 		goto found;
2052 	mutex_exit(&connfp->connf_lock);
2053 
2054 	/* Try to look for a wildcard match. */
2055 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
2056 	mutex_enter(&connfp->connf_lock);
2057 	for (connp = connfp->connf_head; connp != NULL;
2058 	    connp = connp->conn_next) {
2059 		/* We don't allow v4 fallback for v6 raw socket. */
2060 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
2061 		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
2062 			continue;
2063 		}
2064 		if (af == IPV4_VERSION) {
2065 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
2066 				break;
2067 		} else {
2068 			if (IPCL_RAW_MATCH_V6(connp, protocol,
2069 			    ((ip6_t *)hdr)->ip6_dst)) {
2070 				break;
2071 			}
2072 		}
2073 	}
2074 
2075 	if (connp != NULL)
2076 		goto found;
2077 
2078 	mutex_exit(&connfp->connf_lock);
2079 	return (NULL);
2080 
2081 found:
2082 	ASSERT(connp != NULL);
2083 	CONN_INC_REF(connp);
2084 	mutex_exit(&connfp->connf_lock);
2085 	return (connp);
2086 }
2087 
2088 /* ARGSUSED */
2089 static int
2090 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2091 {
2092 	itc_t	*itc = (itc_t *)buf;
2093 	conn_t 	*connp = &itc->itc_conn;
2094 	tcp_t	*tcp = (tcp_t *)&itc[1];
2095 
2096 	bzero(connp, sizeof (conn_t));
2097 	bzero(tcp, sizeof (tcp_t));
2098 
2099 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2100 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2101 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
2102 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2103 	connp->conn_tcp = tcp;
2104 	connp->conn_flags = IPCL_TCPCONN;
2105 	connp->conn_ulp = IPPROTO_TCP;
2106 	tcp->tcp_connp = connp;
2107 	return (0);
2108 }
2109 
2110 /* ARGSUSED */
2111 static void
2112 tcp_conn_destructor(void *buf, void *cdrarg)
2113 {
2114 	itc_t	*itc = (itc_t *)buf;
2115 	conn_t 	*connp = &itc->itc_conn;
2116 	tcp_t	*tcp = (tcp_t *)&itc[1];
2117 
2118 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2119 	ASSERT(tcp->tcp_connp == connp);
2120 	ASSERT(connp->conn_tcp == tcp);
2121 	tcp_timermp_free(tcp);
2122 	mutex_destroy(&connp->conn_lock);
2123 	cv_destroy(&connp->conn_cv);
2124 	cv_destroy(&connp->conn_sq_cv);
2125 }
2126 
2127 /* ARGSUSED */
2128 static int
2129 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2130 {
2131 	itc_t	*itc = (itc_t *)buf;
2132 	conn_t 	*connp = &itc->itc_conn;
2133 
2134 	bzero(connp, sizeof (conn_t));
2135 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2136 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2137 	connp->conn_flags = IPCL_IPCCONN;
2138 
2139 	return (0);
2140 }
2141 
2142 /* ARGSUSED */
2143 static void
2144 ip_conn_destructor(void *buf, void *cdrarg)
2145 {
2146 	itc_t	*itc = (itc_t *)buf;
2147 	conn_t 	*connp = &itc->itc_conn;
2148 
2149 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2150 	ASSERT(connp->conn_priv == NULL);
2151 	mutex_destroy(&connp->conn_lock);
2152 	cv_destroy(&connp->conn_cv);
2153 }
2154 
2155 /* ARGSUSED */
2156 static int
2157 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2158 {
2159 	itc_t	*itc = (itc_t *)buf;
2160 	conn_t 	*connp = &itc->itc_conn;
2161 	udp_t	*udp = (udp_t *)&itc[1];
2162 
2163 	bzero(connp, sizeof (conn_t));
2164 	bzero(udp, sizeof (udp_t));
2165 
2166 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2167 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2168 	connp->conn_udp = udp;
2169 	connp->conn_flags = IPCL_UDPCONN;
2170 	connp->conn_ulp = IPPROTO_UDP;
2171 	udp->udp_connp = connp;
2172 	return (0);
2173 }
2174 
2175 /* ARGSUSED */
2176 static void
2177 udp_conn_destructor(void *buf, void *cdrarg)
2178 {
2179 	itc_t	*itc = (itc_t *)buf;
2180 	conn_t 	*connp = &itc->itc_conn;
2181 	udp_t	*udp = (udp_t *)&itc[1];
2182 
2183 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2184 	ASSERT(udp->udp_connp == connp);
2185 	ASSERT(connp->conn_udp == udp);
2186 	mutex_destroy(&connp->conn_lock);
2187 	cv_destroy(&connp->conn_cv);
2188 }
2189 
2190 /* ARGSUSED */
2191 static int
2192 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2193 {
2194 	itc_t	*itc = (itc_t *)buf;
2195 	conn_t 	*connp = &itc->itc_conn;
2196 	icmp_t	*icmp = (icmp_t *)&itc[1];
2197 
2198 	bzero(connp, sizeof (conn_t));
2199 	bzero(icmp, sizeof (icmp_t));
2200 
2201 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2202 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2203 	connp->conn_icmp = icmp;
2204 	connp->conn_flags = IPCL_RAWIPCONN;
2205 	connp->conn_ulp = IPPROTO_ICMP;
2206 	icmp->icmp_connp = connp;
2207 	return (0);
2208 }
2209 
2210 /* ARGSUSED */
2211 static void
2212 rawip_conn_destructor(void *buf, void *cdrarg)
2213 {
2214 	itc_t	*itc = (itc_t *)buf;
2215 	conn_t 	*connp = &itc->itc_conn;
2216 	icmp_t	*icmp = (icmp_t *)&itc[1];
2217 
2218 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2219 	ASSERT(icmp->icmp_connp == connp);
2220 	ASSERT(connp->conn_icmp == icmp);
2221 	mutex_destroy(&connp->conn_lock);
2222 	cv_destroy(&connp->conn_cv);
2223 }
2224 
2225 /* ARGSUSED */
2226 static int
2227 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2228 {
2229 	itc_t	*itc = (itc_t *)buf;
2230 	conn_t 	*connp = &itc->itc_conn;
2231 	rts_t	*rts = (rts_t *)&itc[1];
2232 
2233 	bzero(connp, sizeof (conn_t));
2234 	bzero(rts, sizeof (rts_t));
2235 
2236 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2237 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2238 	connp->conn_rts = rts;
2239 	connp->conn_flags = IPCL_RTSCONN;
2240 	rts->rts_connp = connp;
2241 	return (0);
2242 }
2243 
2244 /* ARGSUSED */
2245 static void
2246 rts_conn_destructor(void *buf, void *cdrarg)
2247 {
2248 	itc_t	*itc = (itc_t *)buf;
2249 	conn_t 	*connp = &itc->itc_conn;
2250 	rts_t	*rts = (rts_t *)&itc[1];
2251 
2252 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2253 	ASSERT(rts->rts_connp == connp);
2254 	ASSERT(connp->conn_rts == rts);
2255 	mutex_destroy(&connp->conn_lock);
2256 	cv_destroy(&connp->conn_cv);
2257 }
2258 
2259 /* ARGSUSED */
2260 int
2261 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
2262 {
2263 	int error;
2264 	netstack_t	*ns;
2265 	int		ret;
2266 	tcp_stack_t	*tcps;
2267 	ip_helper_stream_info_t	*ip_helper_str;
2268 	ip_stack_t	*ipst;
2269 
2270 	ns = netstack_find_by_cred(kcred);
2271 	ASSERT(ns != NULL);
2272 	tcps = ns->netstack_tcp;
2273 	ipst = ns->netstack_ip;
2274 	ASSERT(tcps != NULL);
2275 	ip_helper_str = (ip_helper_stream_info_t *)buf;
2276 
2277 	do {
2278 		error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
2279 		    &ip_helper_str->iphs_handle, ipst->ips_ldi_ident);
2280 	} while (error == EINTR);
2281 
2282 	if (error == 0) {
2283 		do {
2284 			error = ldi_ioctl(
2285 			    ip_helper_str->iphs_handle, SIOCSQPTR,
2286 			    (intptr_t)buf, FKIOCTL, kcred, &ret);
2287 		} while (error == EINTR);
2288 
2289 		if (error != 0) {
2290 			(void) ldi_close(
2291 			    ip_helper_str->iphs_handle, 0, kcred);
2292 		}
2293 	}
2294 
2295 	netstack_rele(ipst->ips_netstack);
2296 
2297 	return (error);
2298 }
2299 
2300 /* ARGSUSED */
2301 static void
2302 ip_helper_stream_destructor(void *buf, void *cdrarg)
2303 {
2304 	ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;
2305 
2306 	ip_helper_str->iphs_rq->q_ptr =
2307 	    ip_helper_str->iphs_wq->q_ptr =
2308 	    ip_helper_str->iphs_minfo;
2309 	(void) ldi_close(ip_helper_str->iphs_handle, 0, kcred);
2310 }
2311 
2312 
2313 /*
2314  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2315  * in the conn_t.
2316  */
2317 void
2318 ipcl_conn_cleanup(conn_t *connp)
2319 {
2320 	ASSERT(connp->conn_ire_cache == NULL);
2321 	ASSERT(connp->conn_latch == NULL);
2322 #ifdef notdef
2323 	ASSERT(connp->conn_rq == NULL);
2324 	ASSERT(connp->conn_wq == NULL);
2325 #endif
2326 	ASSERT(connp->conn_cred == NULL);
2327 	ASSERT(connp->conn_g_fanout == NULL);
2328 	ASSERT(connp->conn_g_next == NULL);
2329 	ASSERT(connp->conn_g_prev == NULL);
2330 	ASSERT(connp->conn_policy == NULL);
2331 	ASSERT(connp->conn_fanout == NULL);
2332 	ASSERT(connp->conn_next == NULL);
2333 	ASSERT(connp->conn_prev == NULL);
2334 #ifdef notdef
2335 	/*
2336 	 * The ill and ipif pointers are not cleared before the conn_t
2337 	 * goes away since they do not hold a reference on the ill/ipif.
2338 	 * We should replace these pointers with ifindex/ipaddr_t to
2339 	 * make the code less complex.
2340 	 */
2341 	ASSERT(connp->conn_outgoing_ill == NULL);
2342 	ASSERT(connp->conn_incoming_ill == NULL);
2343 	ASSERT(connp->conn_multicast_ipif == NULL);
2344 	ASSERT(connp->conn_multicast_ill == NULL);
2345 #endif
2346 	ASSERT(connp->conn_oper_pending_ill == NULL);
2347 	ASSERT(connp->conn_ilg == NULL);
2348 	ASSERT(connp->conn_drain_next == NULL);
2349 	ASSERT(connp->conn_drain_prev == NULL);
2350 #ifdef notdef
2351 	/* conn_idl is not cleared when removed from idl list */
2352 	ASSERT(connp->conn_idl == NULL);
2353 #endif
2354 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2355 	ASSERT(connp->conn_effective_cred == NULL);
2356 	ASSERT(connp->conn_netstack == NULL);
2357 
2358 	ASSERT(connp->conn_helper_info == NULL);
2359 	/* Clear out the conn_t fields that are not preserved */
2360 	bzero(&connp->conn_start_clr,
2361 	    sizeof (conn_t) -
2362 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2363 }
2364 
2365 /*
2366  * All conns are inserted in a global multi-list for the benefit of
2367  * walkers. The walk is guaranteed to walk all open conns at the time
2368  * of the start of the walk exactly once. This property is needed to
2369  * achieve some cleanups during unplumb of interfaces. This is achieved
2370  * as follows.
2371  *
2372  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2373  * call the insert and delete functions below at creation and deletion
2374  * time respectively. The conn never moves or changes its position in this
2375  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2376  * won't increase due to walkers, once the conn deletion has started. Note
2377  * that we can't remove the conn from the global list and then wait for
2378  * the refcnt to drop to zero, since walkers would then see a truncated
2379  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2380  * conns until ip_open is ready to make them globally visible.
2381  * The global round robin multi-list locks are held only to get the
2382  * next member/insertion/deletion and contention should be negligible
2383  * if the multi-list is much greater than the number of cpus.
2384  */
2385 void
2386 ipcl_globalhash_insert(conn_t *connp)
2387 {
2388 	int	index;
2389 	struct connf_s	*connfp;
2390 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2391 
2392 	/*
2393 	 * No need for atomic here. Approximate even distribution
2394 	 * in the global lists is sufficient.
2395 	 */
2396 	ipst->ips_conn_g_index++;
2397 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2398 
2399 	connp->conn_g_prev = NULL;
2400 	/*
2401 	 * Mark as INCIPIENT, so that walkers will ignore this
2402 	 * for now, till ip_open is ready to make it visible globally.
2403 	 */
2404 	connp->conn_state_flags |= CONN_INCIPIENT;
2405 
2406 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2407 	/* Insert at the head of the list */
2408 	mutex_enter(&connfp->connf_lock);
2409 	connp->conn_g_next = connfp->connf_head;
2410 	if (connp->conn_g_next != NULL)
2411 		connp->conn_g_next->conn_g_prev = connp;
2412 	connfp->connf_head = connp;
2413 
2414 	/* The fanout bucket this conn points to */
2415 	connp->conn_g_fanout = connfp;
2416 
2417 	mutex_exit(&connfp->connf_lock);
2418 }
2419 
2420 void
2421 ipcl_globalhash_remove(conn_t *connp)
2422 {
2423 	struct connf_s	*connfp;
2424 
2425 	/*
2426 	 * We were never inserted in the global multi list.
2427 	 * IPCL_NONE variety is never inserted in the global multilist
2428 	 * since it is presumed to not need any cleanup and is transient.
2429 	 */
2430 	if (connp->conn_g_fanout == NULL)
2431 		return;
2432 
2433 	connfp = connp->conn_g_fanout;
2434 	mutex_enter(&connfp->connf_lock);
2435 	if (connp->conn_g_prev != NULL)
2436 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2437 	else
2438 		connfp->connf_head = connp->conn_g_next;
2439 	if (connp->conn_g_next != NULL)
2440 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2441 	mutex_exit(&connfp->connf_lock);
2442 
2443 	/* Better to stumble on a null pointer than to corrupt memory */
2444 	connp->conn_g_next = NULL;
2445 	connp->conn_g_prev = NULL;
2446 	connp->conn_g_fanout = NULL;
2447 }
2448 
2449 /*
2450  * Walk the list of all conn_t's in the system, calling the function provided
2451  * with the specified argument for each.
2452  * Applies to both IPv4 and IPv6.
2453  *
2454  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2455  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2456  * unplumbed or removed. New conn_t's that are created while we are walking
2457  * may be missed by this walk, because they are not necessarily inserted
2458  * at the tail of the list. They are new conn_t's and thus don't have any
2459  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2460  * is created to the struct that is going away.
2461  */
2462 void
2463 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2464 {
2465 	int	i;
2466 	conn_t	*connp;
2467 	conn_t	*prev_connp;
2468 
2469 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2470 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2471 		prev_connp = NULL;
2472 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2473 		while (connp != NULL) {
2474 			mutex_enter(&connp->conn_lock);
2475 			if (connp->conn_state_flags &
2476 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2477 				mutex_exit(&connp->conn_lock);
2478 				connp = connp->conn_g_next;
2479 				continue;
2480 			}
2481 			CONN_INC_REF_LOCKED(connp);
2482 			mutex_exit(&connp->conn_lock);
2483 			mutex_exit(
2484 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2485 			(*func)(connp, arg);
2486 			if (prev_connp != NULL)
2487 				CONN_DEC_REF(prev_connp);
2488 			mutex_enter(
2489 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2490 			prev_connp = connp;
2491 			connp = connp->conn_g_next;
2492 		}
2493 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2494 		if (prev_connp != NULL)
2495 			CONN_DEC_REF(prev_connp);
2496 	}
2497 }
2498 
2499 /*
2500  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2501  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2502  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2503  * (peer tcp in ESTABLISHED state).
2504  */
2505 conn_t *
2506 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2507     ip_stack_t *ipst)
2508 {
2509 	uint32_t ports;
2510 	uint16_t *pports = (uint16_t *)&ports;
2511 	connf_t	*connfp;
2512 	conn_t	*tconnp;
2513 	boolean_t zone_chk;
2514 
2515 	/*
2516 	 * If either the source of destination address is loopback, then
2517 	 * both endpoints must be in the same Zone.  Otherwise, both of
2518 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2519 	 * state) and the endpoints may reside in different Zones.
2520 	 */
2521 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2522 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2523 
2524 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2525 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2526 
2527 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2528 	    ports, ipst)];
2529 
2530 	mutex_enter(&connfp->connf_lock);
2531 	for (tconnp = connfp->connf_head; tconnp != NULL;
2532 	    tconnp = tconnp->conn_next) {
2533 
2534 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2535 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2536 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2537 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2538 
2539 			ASSERT(tconnp != connp);
2540 			CONN_INC_REF(tconnp);
2541 			mutex_exit(&connfp->connf_lock);
2542 			return (tconnp);
2543 		}
2544 	}
2545 	mutex_exit(&connfp->connf_lock);
2546 	return (NULL);
2547 }
2548 
2549 /*
2550  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2551  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2552  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2553  * (peer tcp in ESTABLISHED state).
2554  */
2555 conn_t *
2556 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2557     ip_stack_t *ipst)
2558 {
2559 	uint32_t ports;
2560 	uint16_t *pports = (uint16_t *)&ports;
2561 	connf_t	*connfp;
2562 	conn_t	*tconnp;
2563 	boolean_t zone_chk;
2564 
2565 	/*
2566 	 * If either the source of destination address is loopback, then
2567 	 * both endpoints must be in the same Zone.  Otherwise, both of
2568 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2569 	 * state) and the endpoints may reside in different Zones.  We
2570 	 * don't do Zone check for link local address(es) because the
2571 	 * current Zone implementation treats each link local address as
2572 	 * being unique per system node, i.e. they belong to global Zone.
2573 	 */
2574 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2575 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2576 
2577 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2578 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2579 
2580 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2581 	    ports, ipst)];
2582 
2583 	mutex_enter(&connfp->connf_lock);
2584 	for (tconnp = connfp->connf_head; tconnp != NULL;
2585 	    tconnp = tconnp->conn_next) {
2586 
2587 		/* We skip tcp_bound_if check here as this is loopback tcp */
2588 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2589 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2590 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2591 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2592 
2593 			ASSERT(tconnp != connp);
2594 			CONN_INC_REF(tconnp);
2595 			mutex_exit(&connfp->connf_lock);
2596 			return (tconnp);
2597 		}
2598 	}
2599 	mutex_exit(&connfp->connf_lock);
2600 	return (NULL);
2601 }
2602 
2603 /*
2604  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2605  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2606  * Only checks for connected entries i.e. no INADDR_ANY checks.
2607  */
2608 conn_t *
2609 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2610     ip_stack_t *ipst)
2611 {
2612 	uint32_t ports;
2613 	uint16_t *pports;
2614 	connf_t	*connfp;
2615 	conn_t	*tconnp;
2616 
2617 	pports = (uint16_t *)&ports;
2618 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2619 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2620 
2621 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2622 	    ports, ipst)];
2623 
2624 	mutex_enter(&connfp->connf_lock);
2625 	for (tconnp = connfp->connf_head; tconnp != NULL;
2626 	    tconnp = tconnp->conn_next) {
2627 
2628 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2629 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2630 		    tconnp->conn_tcp->tcp_state >= min_state) {
2631 
2632 			CONN_INC_REF(tconnp);
2633 			mutex_exit(&connfp->connf_lock);
2634 			return (tconnp);
2635 		}
2636 	}
2637 	mutex_exit(&connfp->connf_lock);
2638 	return (NULL);
2639 }
2640 
2641 /*
2642  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2643  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2644  * Only checks for connected entries i.e. no INADDR_ANY checks.
2645  * Match on ifindex in addition to addresses.
2646  */
2647 conn_t *
2648 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2649     uint_t ifindex, ip_stack_t *ipst)
2650 {
2651 	tcp_t	*tcp;
2652 	uint32_t ports;
2653 	uint16_t *pports;
2654 	connf_t	*connfp;
2655 	conn_t	*tconnp;
2656 
2657 	pports = (uint16_t *)&ports;
2658 	pports[0] = tcpha->tha_fport;
2659 	pports[1] = tcpha->tha_lport;
2660 
2661 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2662 	    ports, ipst)];
2663 
2664 	mutex_enter(&connfp->connf_lock);
2665 	for (tconnp = connfp->connf_head; tconnp != NULL;
2666 	    tconnp = tconnp->conn_next) {
2667 
2668 		tcp = tconnp->conn_tcp;
2669 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2670 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2671 		    tcp->tcp_state >= min_state &&
2672 		    (tcp->tcp_bound_if == 0 ||
2673 		    tcp->tcp_bound_if == ifindex)) {
2674 
2675 			CONN_INC_REF(tconnp);
2676 			mutex_exit(&connfp->connf_lock);
2677 			return (tconnp);
2678 		}
2679 	}
2680 	mutex_exit(&connfp->connf_lock);
2681 	return (NULL);
2682 }
2683 
2684 /*
2685  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2686  * a listener when changing state.
2687  */
2688 conn_t *
2689 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2690     ip_stack_t *ipst)
2691 {
2692 	connf_t		*bind_connfp;
2693 	conn_t		*connp;
2694 	tcp_t		*tcp;
2695 
2696 	/*
2697 	 * Avoid false matches for packets sent to an IP destination of
2698 	 * all zeros.
2699 	 */
2700 	if (laddr == 0)
2701 		return (NULL);
2702 
2703 	ASSERT(zoneid != ALL_ZONES);
2704 
2705 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2706 	mutex_enter(&bind_connfp->connf_lock);
2707 	for (connp = bind_connfp->connf_head; connp != NULL;
2708 	    connp = connp->conn_next) {
2709 		tcp = connp->conn_tcp;
2710 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2711 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2712 		    (tcp->tcp_listener == NULL)) {
2713 			CONN_INC_REF(connp);
2714 			mutex_exit(&bind_connfp->connf_lock);
2715 			return (connp);
2716 		}
2717 	}
2718 	mutex_exit(&bind_connfp->connf_lock);
2719 	return (NULL);
2720 }
2721 
2722 /*
2723  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2724  * a listener when changing state.
2725  */
2726 conn_t *
2727 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2728     zoneid_t zoneid, ip_stack_t *ipst)
2729 {
2730 	connf_t		*bind_connfp;
2731 	conn_t		*connp = NULL;
2732 	tcp_t		*tcp;
2733 
2734 	/*
2735 	 * Avoid false matches for packets sent to an IP destination of
2736 	 * all zeros.
2737 	 */
2738 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2739 		return (NULL);
2740 
2741 	ASSERT(zoneid != ALL_ZONES);
2742 
2743 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2744 	mutex_enter(&bind_connfp->connf_lock);
2745 	for (connp = bind_connfp->connf_head; connp != NULL;
2746 	    connp = connp->conn_next) {
2747 		tcp = connp->conn_tcp;
2748 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2749 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2750 		    (tcp->tcp_bound_if == 0 ||
2751 		    tcp->tcp_bound_if == ifindex) &&
2752 		    tcp->tcp_listener == NULL) {
2753 			CONN_INC_REF(connp);
2754 			mutex_exit(&bind_connfp->connf_lock);
2755 			return (connp);
2756 		}
2757 	}
2758 	mutex_exit(&bind_connfp->connf_lock);
2759 	return (NULL);
2760 }
2761 
2762 /*
2763  * ipcl_get_next_conn
2764  *	get the next entry in the conn global list
2765  *	and put a reference on the next_conn.
2766  *	decrement the reference on the current conn.
2767  *
2768  * This is an iterator based walker function that also provides for
2769  * some selection by the caller. It walks through the conn_hash bucket
2770  * searching for the next valid connp in the list, and selects connections
2771  * that are neither closed nor condemned. It also REFHOLDS the conn
2772  * thus ensuring that the conn exists when the caller uses the conn.
2773  */
2774 conn_t *
2775 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2776 {
2777 	conn_t	*next_connp;
2778 
2779 	if (connfp == NULL)
2780 		return (NULL);
2781 
2782 	mutex_enter(&connfp->connf_lock);
2783 
2784 	next_connp = (connp == NULL) ?
2785 	    connfp->connf_head : connp->conn_g_next;
2786 
2787 	while (next_connp != NULL) {
2788 		mutex_enter(&next_connp->conn_lock);
2789 		if (!(next_connp->conn_flags & conn_flags) ||
2790 		    (next_connp->conn_state_flags &
2791 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2792 			/*
2793 			 * This conn has been condemned or
2794 			 * is closing, or the flags don't match
2795 			 */
2796 			mutex_exit(&next_connp->conn_lock);
2797 			next_connp = next_connp->conn_g_next;
2798 			continue;
2799 		}
2800 		CONN_INC_REF_LOCKED(next_connp);
2801 		mutex_exit(&next_connp->conn_lock);
2802 		break;
2803 	}
2804 
2805 	mutex_exit(&connfp->connf_lock);
2806 
2807 	if (connp != NULL)
2808 		CONN_DEC_REF(connp);
2809 
2810 	return (next_connp);
2811 }
2812 
2813 #ifdef CONN_DEBUG
2814 /*
2815  * Trace of the last NBUF refhold/refrele
2816  */
2817 int
2818 conn_trace_ref(conn_t *connp)
2819 {
2820 	int	last;
2821 	conn_trace_t	*ctb;
2822 
2823 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2824 	last = connp->conn_trace_last;
2825 	last++;
2826 	if (last == CONN_TRACE_MAX)
2827 		last = 0;
2828 
2829 	ctb = &connp->conn_trace_buf[last];
2830 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2831 	connp->conn_trace_last = last;
2832 	return (1);
2833 }
2834 
2835 int
2836 conn_untrace_ref(conn_t *connp)
2837 {
2838 	int	last;
2839 	conn_trace_t	*ctb;
2840 
2841 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2842 	last = connp->conn_trace_last;
2843 	last++;
2844 	if (last == CONN_TRACE_MAX)
2845 		last = 0;
2846 
2847 	ctb = &connp->conn_trace_buf[last];
2848 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2849 	connp->conn_trace_last = last;
2850 	return (1);
2851 }
2852 #endif
2853