xref: /titanic_50/usr/src/uts/common/inet/ip/ipclassifier.c (revision 07070659d3b755175df60ae6860c2b082250982f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
/*
 * Version identification string embedded in the object.
 * NOTE(review): the "@(#)" prefix and the %I%/%E% keywords look like
 * SCCS what(1)-style markers expanded by the source control system --
 * confirm before relying on the expanded contents.
 */
const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
29 
30 /*
31  * IP PACKET CLASSIFIER
32  *
33  * The IP packet classifier provides mapping between IP packets and persistent
34  * connection state for connection-oriented protocols. It also provides
35  * interface for managing connection states.
36  *
37  * The connection state is kept in conn_t data structure and contains, among
38  * other things:
39  *
40  *	o local/remote address and ports
41  *	o Transport protocol
42  *	o squeue for the connection (for TCP only)
43  *	o reference counter
44  *	o Connection state
45  *	o hash table linkage
46  *	o interface/ire information
47  *	o credentials
48  *	o ipsec policy
49  *	o send and receive functions.
50  *	o mutex lock.
51  *
52  * Connections use a reference counting scheme. They are freed when the
53  * reference counter drops to zero. A reference is incremented when connection
54  * is placed in a list or table, when incoming packet for the connection arrives
55  * and when connection is processed via squeue (squeue processing may be
56  * asynchronous and the reference protects the connection from being destroyed
57  * before its processing is finished).
58  *
59  * send and receive functions are currently used for TCP only. The send function
60  * determines the IP entry point for the packet once it leaves TCP to be sent to
61  * the destination address. The receive function is used by IP when the packet
62  * should be passed for TCP processing. When a new connection is created these
63  * are set to ip_output() and tcp_input() respectively. During the lifetime of
64  * the connection the send and receive functions may change depending on the
65  * changes in the connection state. For example, once the connection is bound to
66  * an address, the receive function for this connection is set to
67  * tcp_conn_request().  This allows incoming SYNs to go directly into the
68  * listener SYN processing function without going to tcp_input() first.
69  *
70  * Classifier uses several hash tables:
71  *
72  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73  *	ipcl_bind_fanout:	contains all connections in BOUND state
74  *	ipcl_proto_fanout:	IPv4 protocol fanout
75  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76  *	ipcl_udp_fanout:	contains all UDP connections
77  *	ipcl_globalhash_fanout:	contains all connections
78  *
79  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80  * which need to view all existing connections.
81  *
82  * All tables are protected by per-bucket locks. When both per-bucket lock and
83  * connection lock need to be held, the per-bucket lock should be acquired
84  * first, followed by the connection lock.
85  *
86  * All functions doing search in one of these tables increment a reference
87  * counter on the connection found (if any). This reference should be dropped
88  * when the caller has finished processing the connection.
89  *
90  *
91  * INTERFACES:
92  * ===========
93  *
94  * Connection Lookup:
95  * ------------------
96  *
97  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid)
98  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid)
99  *
100  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101  * it can't find any associated connection. If the connection is found, its
102  * reference counter is incremented.
103  *
104  *	mp:	mblock, containing packet header. The full header should fit
105  *		into a single mblock. It should also contain at least full IP
106  *		and TCP or UDP header.
107  *
108  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109  *
110  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111  *		 the packet.
112  *
113  * 	zoneid: The zone in which the returned connection must be; the zoneid
114  *		corresponding to the ire_zoneid on the IRE located for the
115  *		packet's destination address.
116  *
117  *	For TCP connections, the lookup order is as follows:
118  *		5-tuple {src, dst, protocol, local port, remote port}
119  *			lookup in ipcl_conn_fanout table.
120  *		3-tuple {dst, remote port, protocol} lookup in
121  *			ipcl_bind_fanout table.
122  *
123  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
124  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
125  *	these interfaces do not handle cases where a packet belongs
126  *	to multiple UDP clients, which is handled in IP itself.
127  *
128  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
129  * determine which actual zone gets the segment.  This is used only in a
130  * labeled environment.  The matching rules are:
131  *
132  *	- If it's not a multilevel port, then the label on the packet selects
133  *	  the zone.  Unlabeled packets are delivered to the global zone.
134  *
135  *	- If it's a multilevel port, then only the zone registered to receive
136  *	  packets on that port matches.
137  *
138  * Also, in a labeled environment, packet labels need to be checked.  For fully
139  * bound TCP connections, we can assume that the packet label was checked
140  * during connection establishment, and doesn't need to be checked on each
141  * packet.  For others, though, we need to check for strict equality or, for
142  * multilevel ports, membership in the range or set.  This part currently does
143  * a tnrh lookup on each packet, but could be optimized to use cached results
144  * if that were necessary.  (SCTP doesn't come through here, but if it did,
145  * we would apply the same rules as TCP.)
146  *
147  * An implication of the above is that fully-bound TCP sockets must always use
148  * distinct 4-tuples; they can't be discriminated by label alone.
149  *
150  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
151  * as there's no connection set-up handshake and no shared state.
152  *
153  * Labels on looped-back packets within a single zone do not need to be
154  * checked, as all processes in the same zone have the same label.
155  *
156  * Finally, for unlabeled packets received by a labeled system, special rules
157  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
158  * socket in the zone whose label matches the default label of the sender, if
159  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
160  * receiver's label must dominate the sender's default label.
161  *
162  * conn_t	*ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int);
163  * conn_t	*ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t);
164  *
165  *	Lookup routine to find an exact match for {src, dst, local port,
166  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
167  *	ports are read from the IP and TCP header respectively.
168  *
169  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol);
170  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex);
171  *
172  * 	Lookup routine to find a listener with the tuple {lport, laddr,
173  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174  * 	parameter interface index is also compared.
175  *
176  * void ipcl_walk(func, arg)
177  *
178  * 	Apply 'func' to every connection available. The 'func' is called as
179  *	(*func)(connp, arg). The walk is non-atomic so connections may be
180  *	created and destroyed during the walk. The CONN_CONDEMNED and
181  *	CONN_INCIPIENT flags ensure that connections which are newly created
182  *	or being destroyed are not selected by the walker.
183  *
184  * Table Updates
185  * -------------
186  *
187  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
188  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
189  *
190  *	Insert 'connp' in the ipcl_conn_fanout.
191  *	Arguments :
192  *		connp		conn_t to be inserted
193  *		protocol	connection protocol
194  *		src		source address
195  *		dst		destination address
196  *		ports		local and remote port
197  *		ifindex		interface index for IPv6 connections
198  *
199  *	Return value :
200  *		0		if connp was inserted
201  *		EADDRINUSE	if the connection with the same tuple
202  *				already exists.
203  *
204  * int ipcl_bind_insert(connp, protocol, src, lport);
205  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
206  *
207  * 	Insert 'connp' in ipcl_bind_fanout.
208  * 	Arguments :
209  * 		connp		conn_t to be inserted
210  * 		protocol	connection protocol
211  * 		src		source address connection wants
212  * 				to bind to
213  * 		lport		local port connection wants to
214  * 				bind to
215  *
216  *
217  * void ipcl_hash_remove(connp);
218  *
219  * 	Removes the 'connp' from the connection fanout table.
220  *
221  * Connection Creation/Destruction
222  * -------------------------------
223  *
224  * conn_t *ipcl_conn_create(type, sleep)
225  *
226  * 	Creates a new conn based on the type flag, inserts it into
227  * 	globalhash table.
228  *
229  *	type:	This flag determines the type of conn_t which needs to be
230  *		created.
231  *		IPCL_TCPCONN	indicates a TCP connection
232  *		IPCL_IPCCONN	indicates all non-TCP connections.
233  *
234  * void ipcl_conn_destroy(connp)
235  *
236  * 	Destroys the connection state, removes it from the global
237  * 	connection hash table and frees its memory.
238  */
239 
240 #include <sys/types.h>
241 #include <sys/stream.h>
242 #include <sys/stropts.h>
243 #include <sys/sysmacros.h>
244 #include <sys/strsubr.h>
245 #include <sys/strsun.h>
246 #define	_SUN_TPI_VERSION 2
247 #include <sys/ddi.h>
248 #include <sys/cmn_err.h>
249 #include <sys/debug.h>
250 
251 #include <sys/systm.h>
252 #include <sys/param.h>
253 #include <sys/kmem.h>
254 #include <sys/isa_defs.h>
255 #include <inet/common.h>
256 #include <netinet/ip6.h>
257 #include <netinet/icmp6.h>
258 
259 #include <inet/ip.h>
260 #include <inet/ip6.h>
261 #include <inet/tcp.h>
262 #include <inet/ip_ndp.h>
263 #include <inet/udp_impl.h>
264 #include <inet/sctp_ip.h>
265 
266 #include <sys/cpuvar.h>
267 
268 #include <inet/ipclassifier.h>
269 #include <inet/ipsec_impl.h>
270 
271 #include <sys/tsol/tnet.h>
272 
/*
 * Classifier debug tracing is compiled in only when DEBUG is defined.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args)
 *
 * With IPCL_DEBUG defined, prints 'args' (a parenthesized printf argument
 * list) when any bit of 'level' is set in ipcl_debug_level.  'level' is
 * parenthesized so that an expression such as (1 | 2) is masked as a
 * whole; without the parentheses "ipcl_debug_level & 1 | 2" would parse
 * as "(ipcl_debug_level & 1) | 2" and always be true.
 *
 * Without IPCL_DEBUG the macro expands to an empty block.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/*
 * Classifier fanout tables.  The pointer tables are allocated, and every
 * bucket lock initialized, by ipcl_init(); the two protocol fanouts are
 * fixed-size arrays indexed by protocol number.
 */
connf_t	*ipcl_conn_fanout;
connf_t	*ipcl_bind_fanout;
connf_t	ipcl_proto_fanout[IPPROTO_MAX + 1];
connf_t	ipcl_proto_fanout_v6[IPPROTO_MAX + 1];
connf_t	*ipcl_udp_fanout;

/* A separate hash list for raw socket. */
connf_t *ipcl_raw_fanout;

/*
 * Single list head whose lock is set up in ipcl_init().
 * NOTE(review): presumably the list of routing socket clients -- confirm.
 */
connf_t rts_clients;

/* Old value for compatibility; consulted only if ipcl_conn_hash_size is 0 */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. */
uint_t ipcl_conn_hash_size = 0;
/* Automatic sizing: one bucket per this many bytes of free memory ... */
uint_t ipcl_conn_hash_memfactor = 8192;
/* ... but never more than this many buckets (before rounding to a prime) */
uint_t ipcl_conn_hash_maxsize = 82500;

/* Number of buckets actually used for ipcl_conn_fanout; set by ipcl_init() */
uint_t ipcl_conn_fanout_size = 0;


/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;
314 
/*
 * Table of primes useful for hash table sizing, indexed by N.  For N of
 * 3-28 the entry is the nearest prime <= 2^N - 2^(N-2).  Entries 0-2 and
 * the final entry are 0; ipcl_init() treats a 0 entry as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
324 
/*
 * Wrapper structure for objects of ipcl_tcpconn_cache: a conn_t followed
 * by its tcp_t.  The union pads the conn_t out to CACHE_ALIGN(conn_s)
 * bytes so that itc_tcp starts at that offset.
 * NOTE(review): CACHE_ALIGN is defined elsewhere; presumably it rounds the
 * structure size up to a cache-line multiple -- confirm.
 */
typedef struct itc_s {
	union {
		conn_t	itcu_conn;
		char	itcu_filler[CACHE_ALIGN(conn_s)];
	}	itc_u;
	tcp_t	itc_tcp;
} itc_t;

/* Shorthand for reaching the conn_t inside an itc_t. */
#define	itc_conn	itc_u.itcu_conn
338 
/*
 * kmem caches.  ipcl_tcpconn_cache (itc_t objects) and ipcl_conn_cache
 * (plain conn_t objects) are created in ipcl_init(); the extern caches
 * are owned by other modules and only used here when tearing down or
 * creating conns.
 */
struct kmem_cache  *ipcl_tcpconn_cache;
struct kmem_cache  *ipcl_tcp_cache;
struct kmem_cache  *ipcl_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* Constructor/destructor registered for ipcl_tcpconn_cache. */
static int	ipcl_tcpconn_constructor(void *, void *, int);
static void	ipcl_tcpconn_destructor(void *, void *);

/* NOTE(review): presumably the next global-hash bucket to use -- confirm. */
static int conn_g_index;
/* Hash of all conns, CONN_G_HASH_SIZE buckets; allocated in ipcl_init(). */
connf_t	*ipcl_globalhash_fanout;
354 
#ifdef	IPCL_DEBUG
#define	INET_NTOA_BUFSIZE	18

/*
 * Debug helper: format the four bytes of 'in', taken in memory order, as
 * a dotted-decimal string in the caller-supplied buffer 'b' and return
 * 'b'.  The buffer must hold at least 16 bytes ("255.255.255.255" plus
 * the terminating NUL); INET_NTOA_BUFSIZE is sufficient.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);
	return (b);
}
#endif
368 
369 /*
370  * ipclassifier intialization routine, sets up hash tables and
371  * conn caches.
372  */
373 void
374 ipcl_init(void)
375 {
376 	int i;
377 	int sizes[] = P2Ps();
378 
379 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
380 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
381 	    NULL, NULL, NULL, NULL, NULL, 0);
382 
383 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
384 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
385 	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
386 	    NULL, NULL, NULL, 0);
387 
388 	/*
389 	 * Calculate size of conn fanout table.
390 	 */
391 	if (ipcl_conn_hash_size != 0) {
392 		ipcl_conn_fanout_size = ipcl_conn_hash_size;
393 	} else if (tcp_conn_hash_size != 0) {
394 		ipcl_conn_fanout_size = tcp_conn_hash_size;
395 	} else {
396 		extern pgcnt_t freemem;
397 
398 		ipcl_conn_fanout_size =
399 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
400 
401 		if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize)
402 			ipcl_conn_fanout_size = ipcl_conn_hash_maxsize;
403 	}
404 
405 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
406 		if (sizes[i] >= ipcl_conn_fanout_size) {
407 			break;
408 		}
409 	}
410 	if ((ipcl_conn_fanout_size = sizes[i]) == 0) {
411 		/* Out of range, use the 2^16 value */
412 		ipcl_conn_fanout_size = sizes[16];
413 	}
414 	ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size *
415 	    sizeof (*ipcl_conn_fanout), KM_SLEEP);
416 
417 	for (i = 0; i < ipcl_conn_fanout_size; i++) {
418 		mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL,
419 		    MUTEX_DEFAULT, NULL);
420 	}
421 
422 	ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size *
423 	    sizeof (*ipcl_bind_fanout), KM_SLEEP);
424 
425 	for (i = 0; i < ipcl_bind_fanout_size; i++) {
426 		mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL,
427 		    MUTEX_DEFAULT, NULL);
428 	}
429 
430 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) {
431 		mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL,
432 		    MUTEX_DEFAULT, NULL);
433 	}
434 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) {
435 		mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL,
436 		    MUTEX_DEFAULT, NULL);
437 	}
438 
439 	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
440 
441 	ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size *
442 	    sizeof (*ipcl_udp_fanout), KM_SLEEP);
443 
444 	for (i = 0; i < ipcl_udp_fanout_size; i++) {
445 		mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL,
446 		    MUTEX_DEFAULT, NULL);
447 	}
448 
449 	ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size *
450 	    sizeof (*ipcl_raw_fanout), KM_SLEEP);
451 
452 	for (i = 0; i < ipcl_raw_fanout_size; i++) {
453 		mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL,
454 		    MUTEX_DEFAULT, NULL);
455 	}
456 
457 	ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) *
458 	    CONN_G_HASH_SIZE, KM_SLEEP);
459 
460 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
461 		mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL,
462 		    MUTEX_DEFAULT, NULL);
463 	}
464 }
465 
466 void
467 ipcl_destroy(void)
468 {
469 	int i;
470 	kmem_cache_destroy(ipcl_conn_cache);
471 	kmem_cache_destroy(ipcl_tcpconn_cache);
472 	for (i = 0; i < ipcl_conn_fanout_size; i++)
473 		mutex_destroy(&ipcl_conn_fanout[i].connf_lock);
474 	kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size *
475 	    sizeof (*ipcl_conn_fanout));
476 	for (i = 0; i < ipcl_bind_fanout_size; i++)
477 		mutex_destroy(&ipcl_bind_fanout[i].connf_lock);
478 	kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size *
479 	    sizeof (*ipcl_bind_fanout));
480 
481 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++)
482 		mutex_destroy(&ipcl_proto_fanout[i].connf_lock);
483 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++)
484 		mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock);
485 
486 	for (i = 0; i < ipcl_udp_fanout_size; i++)
487 		mutex_destroy(&ipcl_udp_fanout[i].connf_lock);
488 	kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size *
489 	    sizeof (*ipcl_udp_fanout));
490 
491 	for (i = 0; i < ipcl_raw_fanout_size; i++)
492 		mutex_destroy(&ipcl_raw_fanout[i].connf_lock);
493 	kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size *
494 	    sizeof (*ipcl_raw_fanout));
495 
496 	kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE);
497 	mutex_destroy(&rts_clients.connf_lock);
498 }
499 
500 /*
501  * conn creation routine. initialize the conn, sets the reference
502  * and inserts it in the global hash table.
503  */
504 conn_t *
505 ipcl_conn_create(uint32_t type, int sleep)
506 {
507 	itc_t	*itc;
508 	conn_t	*connp;
509 
510 	switch (type) {
511 	case IPCL_TCPCONN:
512 		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
513 		    sleep)) == NULL)
514 			return (NULL);
515 		connp = &itc->itc_conn;
516 		connp->conn_ref = 1;
517 		IPCL_DEBUG_LVL(1,
518 		    ("ipcl_conn_create: connp = %p tcp (%p)",
519 		    (void *)connp, (void *)connp->conn_tcp));
520 		ipcl_globalhash_insert(connp);
521 		break;
522 	case IPCL_SCTPCONN:
523 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
524 			return (NULL);
525 		connp->conn_flags = IPCL_SCTPCONN;
526 		break;
527 	case IPCL_IPCCONN:
528 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
529 		if (connp == NULL)
530 			return (NULL);
531 		bzero(connp, sizeof (conn_t));
532 		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
533 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
534 		connp->conn_flags = IPCL_IPCCONN;
535 		connp->conn_ref = 1;
536 		IPCL_DEBUG_LVL(1,
537 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
538 		ipcl_globalhash_insert(connp);
539 		break;
540 	default:
541 		connp = NULL;
542 		ASSERT(0);
543 	}
544 
545 	return (connp);
546 }
547 
/*
 * Tear down a conn whose reference count has reached zero: release its
 * credentials, remove it from the global hash and return the memory to
 * the cache it came from.  The caller must not hold conn_lock, and the
 * conn must no longer have a cached ire.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	/*
	 * conn_peercred may be the very same cred as conn_cred; in that
	 * case it is released only once, via conn_cred below.
	 */
	if (connp->conn_peercred != NULL &&
	    connp->conn_peercred != connp->conn_cred)
		crfree(connp->conn_peercred);
	connp->conn_peercred = NULL;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	cv_destroy(&connp->conn_cv);
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		mutex_destroy(&connp->conn_lock);
		ASSERT(connp->conn_tcp != NULL);
		tcp_free(tcp);
		/* Remember the timer mblk; it must survive the bzero below. */
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		if (tcp->tcp_iphc != NULL) {
			/*
			 * A grown header buffer came from kmem_alloc, the
			 * default-sized one from tcp_iphc_cache.
			 */
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		if (connp->conn_latch != NULL)
			IPLATCH_REFRELE(connp->conn_latch);
		if (connp->conn_policy != NULL)
			IPPH_REFRELE(connp->conn_policy);
		/* Wipe both the conn_t and the tcp_t that follows it. */
		bzero(connp, sizeof (itc_t));

		/*
		 * Re-establish the timer cache and the conn/tcp cross links
		 * before handing the object back to the cache.
		 * NOTE(review): presumably this mirrors the state left by
		 * ipcl_tcpconn_constructor() -- confirm.
		 */
		tcp->tcp_timercache = mp;
		connp->conn_tcp = tcp;
		connp->conn_flags = IPCL_TCPCONN;
		connp->conn_ulp = IPPROTO_TCP;
		tcp->tcp_connp = connp;
		kmem_cache_free(ipcl_tcpconn_cache, connp);
	} else if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP conns are released by the SCTP module. */
		sctp_free(connp);
	} else {
		ASSERT(connp->conn_udp == NULL);
		mutex_destroy(&connp->conn_lock);
		kmem_cache_free(ipcl_conn_cache, connp);
	}
}
615 
616 /*
617  * Running in cluster mode - deregister listener information
618  */
619 
620 static void
621 ipcl_conn_unlisten(conn_t *connp)
622 {
623 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
624 	ASSERT(connp->conn_lport != 0);
625 
626 	if (cl_inet_unlisten != NULL) {
627 		sa_family_t	addr_family;
628 		uint8_t		*laddrp;
629 
630 		if (connp->conn_pkt_isv6) {
631 			addr_family = AF_INET6;
632 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
633 		} else {
634 			addr_family = AF_INET;
635 			laddrp = (uint8_t *)&connp->conn_bound_source;
636 		}
637 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
638 		    connp->conn_lport);
639 	}
640 	connp->conn_flags &= ~IPCL_CL_LISTENER;
641 }
642 
/*
 * IPCL_HASH_REMOVE(connp)
 *
 * Unlink 'connp' from whatever fanout bucket it is on (conn_fanout); a
 * conn that is not on any bucket is left untouched.  The unlink is done
 * under the bucket lock, and the caller must not hold conn_lock.  After
 * unlinking, the listener registration is dropped if IPCL_CL_LISTENER is
 * set, and the reference that the hash list held is released with
 * CONN_DEC_REF.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
673 
/*
 * Function form of IPCL_HASH_REMOVE: take 'connp' off the fanout bucket
 * it is currently on, if any.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
679 
680 /*
681  * The whole purpose of this function is allow removal of
682  * a conn_t from the connected hash for timewait reclaim.
683  * This is essentially a TW reclaim fastpath where timewait
684  * collector checks under fanout lock (so no one else can
685  * get access to the conn_t) that refcnt is 2 i.e. one for
686  * TCP and one for the classifier hash list. If ref count
687  * is indeed 2, we can just remove the conn under lock and
688  * avoid cleaning up the conn under squeue. This gives us
689  * improved performance.
690  */
691 void
692 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
693 {
694 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
695 	ASSERT(MUTEX_HELD(&connp->conn_lock));
696 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
697 
698 	if ((connp)->conn_next != NULL) {
699 		(connp)->conn_next->conn_prev =
700 			(connp)->conn_prev;
701 	}
702 	if ((connp)->conn_prev != NULL) {
703 		(connp)->conn_prev->conn_next =
704 			(connp)->conn_next;
705 	} else {
706 		connfp->connf_head = (connp)->conn_next;
707 	}
708 	(connp)->conn_fanout = NULL;
709 	(connp)->conn_next = NULL;
710 	(connp)->conn_prev = NULL;
711 	(connp)->conn_flags |= IPCL_REMOVED;
712 	ASSERT((connp)->conn_ref == 2);
713 	(connp)->conn_ref--;
714 }
715 
/*
 * IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp)
 *
 * Link 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference on behalf of the list.
 * 'connp' must not be on any bucket.  This macro does not take the
 * bucket lock itself.
 * NOTE(review): the name implies the caller already holds
 * connfp->connf_lock -- confirm against callers.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
730 
/*
 * IPCL_HASH_INSERT_CONNECTED(connfp, connp)
 *
 * Move 'connp' to the head of bucket 'connfp' as a connected entry: it
 * is first removed from any bucket it is on (IPCL_HASH_REMOVE), then
 * inserted under connfp->connf_lock.  The two steps are not atomic with
 * respect to each other; the bucket lock is taken only for the insert.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
739 
/*
 * IPCL_HASH_INSERT_BOUND(connfp, connp)
 *
 * Move 'connp' onto bucket 'connfp' as a bound entry.  It is first
 * removed from any bucket it is on, then, under connfp->connf_lock,
 * linked in just before the first entry whose conn_srcv6 satisfies
 * _IPCL_V4_MATCH_ANY (or at the tail if there is none), so that entries
 * inserted this way come ahead of those match-any entries.  The conn is
 * flagged IPCL_BOUND (IPCL_REMOVED cleared) and the list takes a
 * reference.
 *
 * Every use of a macro argument is parenthesized, including the one in
 * the debug cast, so that any pointer expression may be passed.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
768 
/*
 * IPCL_HASH_INSERT_WILDCARD(connfp, connp)
 *
 * Move 'connp' onto bucket 'connfp' as a wildcard (IPCL_BOUND) entry.
 * It is first removed from any bucket it is on.  Then, under
 * connfp->connf_lock:
 *
 *	- if connp's source is a v4-mapped address, it is linked in just
 *	  before the first entry in the same zone whose source is the
 *	  unspecified address;
 *	- otherwise, or if no such entry exists, it is appended at the
 *	  tail.
 *
 * The list takes a reference on the conn.
 *
 * Every use of a macro argument is parenthesized, including the zoneid
 * comparison, so that any pointer expression may be passed as 'connp'.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
800 
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD: insert 'connp' on bucket
 * 'connfp' as a wildcard entry.  Must not be used for a conn that has
 * conn_mac_exempt set.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	ASSERT(!connp->conn_mac_exempt);
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
807 
808 void
809 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
810 {
811 	connf_t	*connfp;
812 
813 	ASSERT(connp != NULL);
814 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
815 	    protocol == IPPROTO_ESP);
816 
817 	connp->conn_ulp = protocol;
818 
819 	/* Insert it in the protocol hash */
820 	connfp = &ipcl_proto_fanout[protocol];
821 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
822 }
823 
824 void
825 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
826 {
827 	connf_t	*connfp;
828 
829 	ASSERT(connp != NULL);
830 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
831 	    protocol == IPPROTO_ESP);
832 
833 	connp->conn_ulp = protocol;
834 
835 	/* Insert it in the Bind Hash */
836 	connfp = &ipcl_proto_fanout_v6[protocol];
837 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
838 }
839 
840 /*
841  * This function is used only for inserting SCTP raw socket now.
842  * This may change later.
843  *
844  * Note that only one raw socket can be bound to a port.  The param
845  * lport is in network byte order.
846  */
847 static int
848 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
849 {
850 	connf_t	*connfp;
851 	conn_t	*oconnp;
852 
853 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
854 
855 	/* Check for existing raw socket already bound to the port. */
856 	mutex_enter(&connfp->connf_lock);
857 	for (oconnp = connfp->connf_head; oconnp != NULL;
858 	    oconnp = oconnp->conn_next) {
859 		if (oconnp->conn_lport == lport &&
860 		    oconnp->conn_zoneid == connp->conn_zoneid &&
861 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
862 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
863 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
864 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
865 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
866 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
867 		    &connp->conn_srcv6))) {
868 			break;
869 		}
870 	}
871 	mutex_exit(&connfp->connf_lock);
872 	if (oconnp != NULL)
873 		return (EADDRNOTAVAIL);
874 
875 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
876 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
877 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
878 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
879 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
880 		} else {
881 			IPCL_HASH_INSERT_BOUND(connfp, connp);
882 		}
883 	} else {
884 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
885 	}
886 	return (0);
887 }
888 
889 /*
890  * Check for a MAC exemption conflict on a labeled system.  Note that for
891  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
892  * transport layer.  This check is for binding all other protocols.
893  *
894  * Returns true if there's a conflict.
895  */
896 static boolean_t
897 check_exempt_conflict_v4(conn_t *connp)
898 {
899 	connf_t	*connfp;
900 	conn_t *tconn;
901 
902 	connfp = &ipcl_proto_fanout[connp->conn_ulp];
903 	mutex_enter(&connfp->connf_lock);
904 	for (tconn = connfp->connf_head; tconn != NULL;
905 	    tconn = tconn->conn_next) {
906 		/* We don't allow v4 fallback for v6 raw socket */
907 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
908 			continue;
909 		/* If neither is exempt, then there's no conflict */
910 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
911 			continue;
912 		/* If both are bound to different specific addrs, ok */
913 		if (connp->conn_src != INADDR_ANY &&
914 		    tconn->conn_src != INADDR_ANY &&
915 		    connp->conn_src != tconn->conn_src)
916 			continue;
917 		/* These two conflict; fail */
918 		break;
919 	}
920 	mutex_exit(&connfp->connf_lock);
921 	return (tconn != NULL);
922 }
923 
924 static boolean_t
925 check_exempt_conflict_v6(conn_t *connp)
926 {
927 	connf_t	*connfp;
928 	conn_t *tconn;
929 
930 	connfp = &ipcl_proto_fanout[connp->conn_ulp];
931 	mutex_enter(&connfp->connf_lock);
932 	for (tconn = connfp->connf_head; tconn != NULL;
933 	    tconn = tconn->conn_next) {
934 		/* We don't allow v4 fallback for v6 raw socket */
935 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
936 			continue;
937 		/* If neither is exempt, then there's no conflict */
938 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
939 			continue;
940 		/* If both are bound to different addrs, ok */
941 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
942 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
943 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
944 			continue;
945 		/* These two conflict; fail */
946 		break;
947 	}
948 	mutex_exit(&connfp->connf_lock);
949 	return (tconn != NULL);
950 }
951 
952 /*
953  * (v4, v6) bind hash insertion routines
954  */
955 int
956 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
957 {
958 	connf_t	*connfp;
959 #ifdef	IPCL_DEBUG
960 	char	buf[INET_NTOA_BUFSIZE];
961 #endif
962 	int	ret = 0;
963 
964 	ASSERT(connp);
965 
966 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
967 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
968 
969 	connp->conn_ulp = protocol;
970 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
971 	connp->conn_lport = lport;
972 
973 	switch (protocol) {
974 	default:
975 		if (is_system_labeled() && check_exempt_conflict_v4(connp))
976 			return (EADDRINUSE);
977 		/* FALLTHROUGH */
978 	case IPPROTO_UDP:
979 		if (protocol == IPPROTO_UDP) {
980 			IPCL_DEBUG_LVL(64,
981 			    ("ipcl_bind_insert: connp %p - udp\n",
982 			    (void *)connp));
983 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
984 		} else {
985 			IPCL_DEBUG_LVL(64,
986 			    ("ipcl_bind_insert: connp %p - protocol\n",
987 			    (void *)connp));
988 			connfp = &ipcl_proto_fanout[protocol];
989 		}
990 
991 		if (connp->conn_rem != INADDR_ANY) {
992 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
993 		} else if (connp->conn_src != INADDR_ANY) {
994 			IPCL_HASH_INSERT_BOUND(connfp, connp);
995 		} else {
996 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
997 		}
998 		break;
999 
1000 	case IPPROTO_TCP:
1001 
1002 		/* Insert it in the Bind Hash */
1003 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1004 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1005 		if (connp->conn_src != INADDR_ANY) {
1006 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1007 		} else {
1008 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1009 		}
1010 		if (cl_inet_listen != NULL) {
1011 			ASSERT(!connp->conn_pkt_isv6);
1012 			connp->conn_flags |= IPCL_CL_LISTENER;
1013 			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
1014 			    (uint8_t *)&connp->conn_bound_source, lport);
1015 		}
1016 		break;
1017 
1018 	case IPPROTO_SCTP:
1019 		ret = ipcl_sctp_hash_insert(connp, lport);
1020 		break;
1021 	}
1022 
1023 	return (ret);
1024 }
1025 
1026 int
1027 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1028     uint16_t lport)
1029 {
1030 	connf_t	*connfp;
1031 	int	ret = 0;
1032 
1033 	ASSERT(connp);
1034 
1035 	connp->conn_ulp = protocol;
1036 	connp->conn_srcv6 = *src;
1037 	connp->conn_lport = lport;
1038 
1039 	switch (protocol) {
1040 	default:
1041 		if (is_system_labeled() && check_exempt_conflict_v6(connp))
1042 			return (EADDRINUSE);
1043 		/* FALLTHROUGH */
1044 	case IPPROTO_UDP:
1045 		if (protocol == IPPROTO_UDP) {
1046 			IPCL_DEBUG_LVL(128,
1047 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1048 			    (void *)connp));
1049 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1050 		} else {
1051 			IPCL_DEBUG_LVL(128,
1052 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1053 			    (void *)connp));
1054 			connfp = &ipcl_proto_fanout_v6[protocol];
1055 		}
1056 
1057 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1058 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1059 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1060 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1061 		} else {
1062 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1063 		}
1064 		break;
1065 
1066 	case IPPROTO_TCP:
1067 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1068 
1069 		/* Insert it in the Bind Hash */
1070 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1071 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1072 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1073 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1074 		} else {
1075 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1076 		}
1077 		if (cl_inet_listen != NULL) {
1078 			sa_family_t	addr_family;
1079 			uint8_t		*laddrp;
1080 
1081 			if (connp->conn_pkt_isv6) {
1082 				addr_family = AF_INET6;
1083 				laddrp =
1084 				    (uint8_t *)&connp->conn_bound_source_v6;
1085 			} else {
1086 				addr_family = AF_INET;
1087 				laddrp = (uint8_t *)&connp->conn_bound_source;
1088 			}
1089 			connp->conn_flags |= IPCL_CL_LISTENER;
1090 			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
1091 			    lport);
1092 		}
1093 		break;
1094 
1095 	case IPPROTO_SCTP:
1096 		ret = ipcl_sctp_hash_insert(connp, lport);
1097 		break;
1098 	}
1099 
1100 	return (ret);
1101 }
1102 
1103 /*
1104  * ipcl_conn_hash insertion routines.
1105  */
1106 int
1107 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
1108     ipaddr_t rem, uint32_t ports)
1109 {
1110 	connf_t		*connfp;
1111 	uint16_t	*up;
1112 	conn_t		*tconnp;
1113 #ifdef	IPCL_DEBUG
1114 	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1115 #endif
1116 	in_port_t	lport;
1117 	int		ret = 0;
1118 
1119 	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1120 	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1121 	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1122 	    ports, protocol));
1123 
1124 	switch (protocol) {
1125 	case IPPROTO_TCP:
1126 		if (!(connp->conn_flags & IPCL_EAGER)) {
1127 			/*
1128 			 * for a eager connection, i.e connections which
1129 			 * have just been created, the initialization is
1130 			 * already done in ip at conn_creation time, so
1131 			 * we can skip the checks here.
1132 			 */
1133 			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1134 		}
1135 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem,
1136 		    connp->conn_ports)];
1137 		mutex_enter(&connfp->connf_lock);
1138 		for (tconnp = connfp->connf_head; tconnp != NULL;
1139 		    tconnp = tconnp->conn_next) {
1140 			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1141 			    connp->conn_rem, connp->conn_src,
1142 			    connp->conn_ports)) {
1143 
1144 				/* Already have a conn. bail out */
1145 				mutex_exit(&connfp->connf_lock);
1146 				return (EADDRINUSE);
1147 			}
1148 		}
1149 		if (connp->conn_fanout != NULL) {
1150 			/*
1151 			 * Probably a XTI/TLI application trying to do a
1152 			 * rebind. Let it happen.
1153 			 */
1154 			mutex_exit(&connfp->connf_lock);
1155 			IPCL_HASH_REMOVE(connp);
1156 			mutex_enter(&connfp->connf_lock);
1157 		}
1158 
1159 		ASSERT(connp->conn_recv != NULL);
1160 
1161 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1162 		mutex_exit(&connfp->connf_lock);
1163 		break;
1164 
1165 	case IPPROTO_SCTP:
1166 		/*
1167 		 * The raw socket may have already been bound, remove it
1168 		 * from the hash first.
1169 		 */
1170 		IPCL_HASH_REMOVE(connp);
1171 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1172 		ret = ipcl_sctp_hash_insert(connp, lport);
1173 		break;
1174 
1175 	default:
1176 		/*
1177 		 * Check for conflicts among MAC exempt bindings.  For
1178 		 * transports with port numbers, this is done by the upper
1179 		 * level per-transport binding logic.  For all others, it's
1180 		 * done here.
1181 		 */
1182 		if (is_system_labeled() && check_exempt_conflict_v4(connp))
1183 			return (EADDRINUSE);
1184 		/* FALLTHROUGH */
1185 
1186 	case IPPROTO_UDP:
1187 		up = (uint16_t *)&ports;
1188 		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1189 		if (protocol == IPPROTO_UDP) {
1190 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1191 		} else {
1192 			connfp = &ipcl_proto_fanout[protocol];
1193 		}
1194 
1195 		if (connp->conn_rem != INADDR_ANY) {
1196 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1197 		} else if (connp->conn_src != INADDR_ANY) {
1198 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1199 		} else {
1200 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1201 		}
1202 		break;
1203 	}
1204 
1205 	return (ret);
1206 }
1207 
1208 int
1209 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1210     const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1211 {
1212 	connf_t		*connfp;
1213 	uint16_t	*up;
1214 	conn_t		*tconnp;
1215 	in_port_t	lport;
1216 	int		ret = 0;
1217 
1218 	switch (protocol) {
1219 	case IPPROTO_TCP:
1220 		/* Just need to insert a conn struct */
1221 		if (!(connp->conn_flags & IPCL_EAGER)) {
1222 			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1223 		}
1224 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6,
1225 		    connp->conn_ports)];
1226 		mutex_enter(&connfp->connf_lock);
1227 		for (tconnp = connfp->connf_head; tconnp != NULL;
1228 		    tconnp = tconnp->conn_next) {
1229 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1230 			    connp->conn_remv6, connp->conn_srcv6,
1231 			    connp->conn_ports) &&
1232 			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1233 			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
1234 				/* Already have a conn. bail out */
1235 				mutex_exit(&connfp->connf_lock);
1236 				return (EADDRINUSE);
1237 			}
1238 		}
1239 		if (connp->conn_fanout != NULL) {
1240 			/*
1241 			 * Probably a XTI/TLI application trying to do a
1242 			 * rebind. Let it happen.
1243 			 */
1244 			mutex_exit(&connfp->connf_lock);
1245 			IPCL_HASH_REMOVE(connp);
1246 			mutex_enter(&connfp->connf_lock);
1247 		}
1248 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1249 		mutex_exit(&connfp->connf_lock);
1250 		break;
1251 
1252 	case IPPROTO_SCTP:
1253 		IPCL_HASH_REMOVE(connp);
1254 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1255 		ret = ipcl_sctp_hash_insert(connp, lport);
1256 		break;
1257 
1258 	default:
1259 		if (is_system_labeled() && check_exempt_conflict_v6(connp))
1260 			return (EADDRINUSE);
1261 		/* FALLTHROUGH */
1262 	case IPPROTO_UDP:
1263 		up = (uint16_t *)&ports;
1264 		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1265 		if (protocol == IPPROTO_UDP) {
1266 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1267 		} else {
1268 			connfp = &ipcl_proto_fanout_v6[protocol];
1269 		}
1270 
1271 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1272 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1273 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1274 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1275 		} else {
1276 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1277 		}
1278 		break;
1279 	}
1280 
1281 	return (ret);
1282 }
1283 
1284 /*
1285  * v4 packet classifying function. looks up the fanout table to
1286  * find the conn, the packet belongs to. returns the conn with
1287  * the reference held, null otherwise.
1288  *
1289  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1290  * Lookup" comment block are applied.  Labels are also checked as described
1291  * above.  If the packet is from the inside (looped back), and is from the same
1292  * zone, then label checks are omitted.
1293  */
1294 conn_t *
1295 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1296 {
1297 	ipha_t	*ipha;
1298 	connf_t	*connfp, *bind_connfp;
1299 	uint16_t lport;
1300 	uint16_t fport;
1301 	uint32_t ports;
1302 	conn_t	*connp;
1303 	uint16_t  *up;
1304 	boolean_t shared_addr;
1305 	boolean_t unlabeled;
1306 
1307 	ipha = (ipha_t *)mp->b_rptr;
1308 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1309 
1310 	switch (protocol) {
1311 	case IPPROTO_TCP:
1312 		ports = *(uint32_t *)up;
1313 		connfp =
1314 		    &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)];
1315 		mutex_enter(&connfp->connf_lock);
1316 		for (connp = connfp->connf_head; connp != NULL;
1317 		    connp = connp->conn_next) {
1318 			if (IPCL_CONN_MATCH(connp, protocol,
1319 			    ipha->ipha_src, ipha->ipha_dst, ports))
1320 				break;
1321 		}
1322 
1323 		if (connp != NULL) {
1324 			/*
1325 			 * We have a fully-bound TCP connection.
1326 			 *
1327 			 * For labeled systems, there's no need to check the
1328 			 * label here.  It's known to be good as we checked
1329 			 * before allowing the connection to become bound.
1330 			 */
1331 			CONN_INC_REF(connp);
1332 			mutex_exit(&connfp->connf_lock);
1333 			return (connp);
1334 		}
1335 
1336 		mutex_exit(&connfp->connf_lock);
1337 
1338 		lport = up[1];
1339 		unlabeled = B_FALSE;
1340 		/* Cred cannot be null on IPv4 */
1341 		if (is_system_labeled())
1342 			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
1343 			    TSLF_UNLABELED) != 0;
1344 		shared_addr = (zoneid == ALL_ZONES);
1345 		if (shared_addr) {
1346 			zoneid = tsol_mlp_findzone(protocol, lport);
1347 			/*
1348 			 * If no shared MLP is found, tsol_mlp_findzone returns
1349 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1350 			 * search for the zone based on the packet label.
1351 			 *
1352 			 * If there is such a zone, we prefer to find a
1353 			 * connection in it.  Otherwise, we look for a
1354 			 * MAC-exempt connection in any zone whose label
1355 			 * dominates the default label on the packet.
1356 			 */
1357 			if (zoneid == ALL_ZONES)
1358 				zoneid = tsol_packet_to_zoneid(mp);
1359 			else
1360 				unlabeled = B_FALSE;
1361 		}
1362 
1363 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1364 		mutex_enter(&bind_connfp->connf_lock);
1365 		for (connp = bind_connfp->connf_head; connp != NULL;
1366 		    connp = connp->conn_next) {
1367 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1368 			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
1369 			    (unlabeled && connp->conn_mac_exempt)))
1370 				break;
1371 		}
1372 
1373 		/*
1374 		 * If the matching connection is SLP on a private address, then
1375 		 * the label on the packet must match the local zone's label.
1376 		 * Otherwise, it must be in the label range defined by tnrh.
1377 		 * This is ensured by tsol_receive_label.
1378 		 */
1379 		if (connp != NULL && is_system_labeled() &&
1380 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1381 		    shared_addr, connp)) {
1382 				DTRACE_PROBE3(
1383 				    tx__ip__log__info__classify__tcp,
1384 				    char *,
1385 				    "connp(1) could not receive mp(2)",
1386 				    conn_t *, connp, mblk_t *, mp);
1387 			connp = NULL;
1388 		}
1389 
1390 		if (connp != NULL) {
1391 			/* Have a listener at least */
1392 			CONN_INC_REF(connp);
1393 			mutex_exit(&bind_connfp->connf_lock);
1394 			return (connp);
1395 		}
1396 
1397 		mutex_exit(&bind_connfp->connf_lock);
1398 
1399 		IPCL_DEBUG_LVL(512,
1400 		    ("ipcl_classify: couldn't classify mp = %p\n",
1401 		    (void *)mp));
1402 		break;
1403 
1404 	case IPPROTO_UDP:
1405 		lport = up[1];
1406 		unlabeled = B_FALSE;
1407 		/* Cred cannot be null on IPv4 */
1408 		if (is_system_labeled())
1409 			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
1410 			    TSLF_UNLABELED) != 0;
1411 		shared_addr = (zoneid == ALL_ZONES);
1412 		if (shared_addr) {
1413 			zoneid = tsol_mlp_findzone(protocol, lport);
1414 			/*
1415 			 * If no shared MLP is found, tsol_mlp_findzone returns
1416 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1417 			 * search for the zone based on the packet label.
1418 			 *
1419 			 * If there is such a zone, we prefer to find a
1420 			 * connection in it.  Otherwise, we look for a
1421 			 * MAC-exempt connection in any zone whose label
1422 			 * dominates the default label on the packet.
1423 			 */
1424 			if (zoneid == ALL_ZONES)
1425 				zoneid = tsol_packet_to_zoneid(mp);
1426 			else
1427 				unlabeled = B_FALSE;
1428 		}
1429 		fport = up[0];
1430 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1431 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1432 		mutex_enter(&connfp->connf_lock);
1433 		for (connp = connfp->connf_head; connp != NULL;
1434 		    connp = connp->conn_next) {
1435 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1436 			    fport, ipha->ipha_src) &&
1437 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1438 			    (unlabeled && connp->conn_mac_exempt)))
1439 				break;
1440 		}
1441 
1442 		if (connp != NULL && is_system_labeled() &&
1443 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1444 		    shared_addr, connp)) {
1445 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1446 			    char *, "connp(1) could not receive mp(2)",
1447 			    conn_t *, connp, mblk_t *, mp);
1448 			connp = NULL;
1449 		}
1450 
1451 		if (connp != NULL) {
1452 			CONN_INC_REF(connp);
1453 			mutex_exit(&connfp->connf_lock);
1454 			return (connp);
1455 		}
1456 
1457 		/*
1458 		 * We shouldn't come here for multicast/broadcast packets
1459 		 */
1460 		mutex_exit(&connfp->connf_lock);
1461 		IPCL_DEBUG_LVL(512,
1462 		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1463 		    lport, fport));
1464 		break;
1465 	}
1466 
1467 	return (NULL);
1468 }
1469 
1470 conn_t *
1471 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1472 {
1473 	ip6_t		*ip6h;
1474 	connf_t		*connfp, *bind_connfp;
1475 	uint16_t	lport;
1476 	uint16_t	fport;
1477 	tcph_t		*tcph;
1478 	uint32_t	ports;
1479 	conn_t		*connp;
1480 	uint16_t	*up;
1481 	boolean_t	shared_addr;
1482 	boolean_t	unlabeled;
1483 
1484 	ip6h = (ip6_t *)mp->b_rptr;
1485 
1486 	switch (protocol) {
1487 	case IPPROTO_TCP:
1488 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1489 		up = (uint16_t *)tcph->th_lport;
1490 		ports = *(uint32_t *)up;
1491 
1492 		connfp =
1493 		    &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)];
1494 		mutex_enter(&connfp->connf_lock);
1495 		for (connp = connfp->connf_head; connp != NULL;
1496 		    connp = connp->conn_next) {
1497 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1498 			    ip6h->ip6_src, ip6h->ip6_dst, ports))
1499 				break;
1500 		}
1501 
1502 		if (connp != NULL) {
1503 			/*
1504 			 * We have a fully-bound TCP connection.
1505 			 *
1506 			 * For labeled systems, there's no need to check the
1507 			 * label here.  It's known to be good as we checked
1508 			 * before allowing the connection to become bound.
1509 			 */
1510 			CONN_INC_REF(connp);
1511 			mutex_exit(&connfp->connf_lock);
1512 			return (connp);
1513 		}
1514 
1515 		mutex_exit(&connfp->connf_lock);
1516 
1517 		lport = up[1];
1518 		unlabeled = B_FALSE;
1519 		/* Cred can be null on IPv6 */
1520 		if (is_system_labeled()) {
1521 			cred_t *cr = DB_CRED(mp);
1522 
1523 			unlabeled = (cr != NULL &&
1524 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1525 		}
1526 		shared_addr = (zoneid == ALL_ZONES);
1527 		if (shared_addr) {
1528 			zoneid = tsol_mlp_findzone(protocol, lport);
1529 			/*
1530 			 * If no shared MLP is found, tsol_mlp_findzone returns
1531 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1532 			 * search for the zone based on the packet label.
1533 			 *
1534 			 * If there is such a zone, we prefer to find a
1535 			 * connection in it.  Otherwise, we look for a
1536 			 * MAC-exempt connection in any zone whose label
1537 			 * dominates the default label on the packet.
1538 			 */
1539 			if (zoneid == ALL_ZONES)
1540 				zoneid = tsol_packet_to_zoneid(mp);
1541 			else
1542 				unlabeled = B_FALSE;
1543 		}
1544 
1545 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1546 		mutex_enter(&bind_connfp->connf_lock);
1547 		for (connp = bind_connfp->connf_head; connp != NULL;
1548 		    connp = connp->conn_next) {
1549 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1550 			    ip6h->ip6_dst, lport) &&
1551 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1552 			    (unlabeled && connp->conn_mac_exempt)))
1553 				break;
1554 		}
1555 
1556 		if (connp != NULL && is_system_labeled() &&
1557 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1558 		    shared_addr, connp)) {
1559 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1560 			    char *, "connp(1) could not receive mp(2)",
1561 			    conn_t *, connp, mblk_t *, mp);
1562 			connp = NULL;
1563 		}
1564 
1565 		if (connp != NULL) {
1566 			/* Have a listner at least */
1567 			CONN_INC_REF(connp);
1568 			mutex_exit(&bind_connfp->connf_lock);
1569 			IPCL_DEBUG_LVL(512,
1570 			    ("ipcl_classify_v6: found listner "
1571 			    "connp = %p\n", (void *)connp));
1572 
1573 			return (connp);
1574 		}
1575 
1576 		mutex_exit(&bind_connfp->connf_lock);
1577 
1578 		IPCL_DEBUG_LVL(512,
1579 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1580 		    (void *)mp));
1581 		break;
1582 
1583 	case IPPROTO_UDP:
1584 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1585 		lport = up[1];
1586 		unlabeled = B_FALSE;
1587 		/* Cred can be null on IPv6 */
1588 		if (is_system_labeled()) {
1589 			cred_t *cr = DB_CRED(mp);
1590 
1591 			unlabeled = (cr != NULL &&
1592 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1593 		}
1594 		shared_addr = (zoneid == ALL_ZONES);
1595 		if (shared_addr) {
1596 			zoneid = tsol_mlp_findzone(protocol, lport);
1597 			/*
1598 			 * If no shared MLP is found, tsol_mlp_findzone returns
1599 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1600 			 * search for the zone based on the packet label.
1601 			 *
1602 			 * If there is such a zone, we prefer to find a
1603 			 * connection in it.  Otherwise, we look for a
1604 			 * MAC-exempt connection in any zone whose label
1605 			 * dominates the default label on the packet.
1606 			 */
1607 			if (zoneid == ALL_ZONES)
1608 				zoneid = tsol_packet_to_zoneid(mp);
1609 			else
1610 				unlabeled = B_FALSE;
1611 		}
1612 
1613 		fport = up[0];
1614 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1615 		    fport));
1616 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1617 		mutex_enter(&connfp->connf_lock);
1618 		for (connp = connfp->connf_head; connp != NULL;
1619 		    connp = connp->conn_next) {
1620 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1621 			    fport, ip6h->ip6_src) &&
1622 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1623 			    (unlabeled && connp->conn_mac_exempt)))
1624 				break;
1625 		}
1626 
1627 		if (connp != NULL && is_system_labeled() &&
1628 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1629 		    shared_addr, connp)) {
1630 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1631 			    char *, "connp(1) could not receive mp(2)",
1632 			    conn_t *, connp, mblk_t *, mp);
1633 			connp = NULL;
1634 		}
1635 
1636 		if (connp != NULL) {
1637 			CONN_INC_REF(connp);
1638 			mutex_exit(&connfp->connf_lock);
1639 			return (connp);
1640 		}
1641 
1642 		/*
1643 		 * We shouldn't come here for multicast/broadcast packets
1644 		 */
1645 		mutex_exit(&connfp->connf_lock);
1646 		IPCL_DEBUG_LVL(512,
1647 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1648 		    lport, fport));
1649 		break;
1650 	}
1651 
1652 	return (NULL);
1653 }
1654 
1655 /*
1656  * wrapper around ipcl_classify_(v4,v6) routines.
1657  */
1658 conn_t *
1659 ipcl_classify(mblk_t *mp, zoneid_t zoneid)
1660 {
1661 	uint16_t	hdr_len;
1662 	ipha_t		*ipha;
1663 	uint8_t		*nexthdrp;
1664 
1665 	if (MBLKL(mp) < sizeof (ipha_t))
1666 		return (NULL);
1667 
1668 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1669 	case IPV4_VERSION:
1670 		ipha = (ipha_t *)mp->b_rptr;
1671 		hdr_len = IPH_HDR_LENGTH(ipha);
1672 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1673 		    zoneid));
1674 	case IPV6_VERSION:
1675 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1676 		    &hdr_len, &nexthdrp))
1677 			return (NULL);
1678 
1679 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid));
1680 	}
1681 
1682 	return (NULL);
1683 }
1684 
1685 conn_t *
1686 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
1687     uint32_t ports, ipha_t *hdr)
1688 {
1689 	connf_t		*connfp;
1690 	conn_t		*connp;
1691 	in_port_t	lport;
1692 	int		af;
1693 	boolean_t	shared_addr;
1694 	boolean_t	unlabeled;
1695 	const void	*dst;
1696 
1697 	lport = ((uint16_t *)&ports)[1];
1698 
1699 	unlabeled = B_FALSE;
1700 	/* Cred can be null on IPv6 */
1701 	if (is_system_labeled()) {
1702 		cred_t *cr = DB_CRED(mp);
1703 
1704 		unlabeled = (cr != NULL &&
1705 		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1706 	}
1707 	shared_addr = (zoneid == ALL_ZONES);
1708 	if (shared_addr) {
1709 		zoneid = tsol_mlp_findzone(protocol, lport);
1710 		/*
1711 		 * If no shared MLP is found, tsol_mlp_findzone returns
1712 		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
1713 		 * the zone based on the packet label.
1714 		 *
1715 		 * If there is such a zone, we prefer to find a connection in
1716 		 * it.  Otherwise, we look for a MAC-exempt connection in any
1717 		 * zone whose label dominates the default label on the packet.
1718 		 */
1719 		if (zoneid == ALL_ZONES)
1720 			zoneid = tsol_packet_to_zoneid(mp);
1721 		else
1722 			unlabeled = B_FALSE;
1723 	}
1724 
1725 	af = IPH_HDR_VERSION(hdr);
1726 	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
1727 	    (const void *)&((ip6_t *)hdr)->ip6_dst;
1728 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
1729 
1730 	mutex_enter(&connfp->connf_lock);
1731 	for (connp = connfp->connf_head; connp != NULL;
1732 	    connp = connp->conn_next) {
1733 		/* We don't allow v4 fallback for v6 raw socket. */
1734 		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
1735 		    IPV6_VERSION))
1736 			continue;
1737 		if (connp->conn_fully_bound) {
1738 			if (af == IPV4_VERSION) {
1739 				if (!IPCL_CONN_MATCH(connp, protocol,
1740 				    hdr->ipha_src, hdr->ipha_dst, ports))
1741 					continue;
1742 			} else {
1743 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1744 				    ((ip6_t *)hdr)->ip6_src,
1745 				    ((ip6_t *)hdr)->ip6_dst, ports))
1746 					continue;
1747 			}
1748 		} else {
1749 			if (af == IPV4_VERSION) {
1750 				if (!IPCL_BIND_MATCH(connp, protocol,
1751 				    hdr->ipha_dst, lport))
1752 					continue;
1753 			} else {
1754 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1755 				    ((ip6_t *)hdr)->ip6_dst, lport))
1756 					continue;
1757 			}
1758 		}
1759 
1760 		if (IPCL_ZONE_MATCH(connp, zoneid) ||
1761 		    (unlabeled && connp->conn_mac_exempt))
1762 			break;
1763 	}
1764 	/*
1765 	 * If the connection is fully-bound and connection-oriented (TCP or
1766 	 * SCTP), then we've already validated the remote system's label.
1767 	 * There's no need to do it again for every packet.
1768 	 */
1769 	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
1770 	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
1771 	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
1772 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1773 		    char *, "connp(1) could not receive mp(2)",
1774 		    conn_t *, connp, mblk_t *, mp);
1775 		connp = NULL;
1776 	}
1777 
1778 	if (connp != NULL)
1779 		goto found;
1780 	mutex_exit(&connfp->connf_lock);
1781 
1782 	/* Try to look for a wildcard match. */
1783 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)];
1784 	mutex_enter(&connfp->connf_lock);
1785 	for (connp = connfp->connf_head; connp != NULL;
1786 	    connp = connp->conn_next) {
1787 		/* We don't allow v4 fallback for v6 raw socket. */
1788 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1789 		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
1790 			continue;
1791 		}
1792 		if (af == IPV4_VERSION) {
1793 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
1794 				break;
1795 		} else {
1796 			if (IPCL_RAW_MATCH_V6(connp, protocol,
1797 			    ((ip6_t *)hdr)->ip6_dst)) {
1798 				break;
1799 			}
1800 		}
1801 	}
1802 
1803 	if (connp != NULL)
1804 		goto found;
1805 
1806 	mutex_exit(&connfp->connf_lock);
1807 	return (NULL);
1808 
1809 found:
1810 	ASSERT(connp != NULL);
1811 	CONN_INC_REF(connp);
1812 	mutex_exit(&connfp->connf_lock);
1813 	return (connp);
1814 }
1815 
1816 /* ARGSUSED */
1817 static int
1818 ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1819 {
1820 	itc_t	*itc = (itc_t *)buf;
1821 	conn_t 	*connp = &itc->itc_conn;
1822 	tcp_t	*tcp = &itc->itc_tcp;
1823 	bzero(itc, sizeof (itc_t));
1824 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1825 	connp->conn_tcp = tcp;
1826 	connp->conn_flags = IPCL_TCPCONN;
1827 	connp->conn_ulp = IPPROTO_TCP;
1828 	tcp->tcp_connp = connp;
1829 	return (0);
1830 }
1831 
1832 /* ARGSUSED */
1833 static void
1834 ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1835 {
1836 	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1837 }
1838 
1839 /*
1840  * All conns are inserted in a global multi-list for the benefit of
1841  * walkers. The walk is guaranteed to walk all open conns at the time
1842  * of the start of the walk exactly once. This property is needed to
1843  * achieve some cleanups during unplumb of interfaces. This is achieved
1844  * as follows.
1845  *
1846  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1847  * call the insert and delete functions below at creation and deletion
1848  * time respectively. The conn never moves or changes its position in this
1849  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1850  * won't increase due to walkers, once the conn deletion has started. Note
1851  * that we can't remove the conn from the global list and then wait for
1852  * the refcnt to drop to zero, since walkers would then see a truncated
1853  * list. CONN_INCIPIENT ensures that walkers don't start looking at
1854  * conns until ip_open is ready to make them globally visible.
1855  * The global round robin multi-list locks are held only to get the
1856  * next member/insertion/deletion and contention should be negligible
1857  * if the multi-list is much greater than the number of cpus.
1858  */
1859 void
1860 ipcl_globalhash_insert(conn_t *connp)
1861 {
1862 	int	index;
1863 
1864 	/*
1865 	 * No need for atomic here. Approximate even distribution
1866 	 * in the global lists is sufficient.
1867 	 */
1868 	conn_g_index++;
1869 	index = conn_g_index & (CONN_G_HASH_SIZE - 1);
1870 
1871 	connp->conn_g_prev = NULL;
1872 	/*
1873 	 * Mark as INCIPIENT, so that walkers will ignore this
1874 	 * for now, till ip_open is ready to make it visible globally.
1875 	 */
1876 	connp->conn_state_flags |= CONN_INCIPIENT;
1877 
1878 	/* Insert at the head of the list */
1879 	mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
1880 	connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
1881 	if (connp->conn_g_next != NULL)
1882 		connp->conn_g_next->conn_g_prev = connp;
1883 	ipcl_globalhash_fanout[index].connf_head = connp;
1884 
1885 	/* The fanout bucket this conn points to */
1886 	connp->conn_g_fanout = &ipcl_globalhash_fanout[index];
1887 
1888 	mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
1889 }
1890 
1891 void
1892 ipcl_globalhash_remove(conn_t *connp)
1893 {
1894 	/*
1895 	 * We were never inserted in the global multi list.
1896 	 * IPCL_NONE variety is never inserted in the global multilist
1897 	 * since it is presumed to not need any cleanup and is transient.
1898 	 */
1899 	if (connp->conn_g_fanout == NULL)
1900 		return;
1901 
1902 	mutex_enter(&connp->conn_g_fanout->connf_lock);
1903 	if (connp->conn_g_prev != NULL)
1904 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
1905 	else
1906 		connp->conn_g_fanout->connf_head = connp->conn_g_next;
1907 	if (connp->conn_g_next != NULL)
1908 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
1909 	mutex_exit(&connp->conn_g_fanout->connf_lock);
1910 
1911 	/* Better to stumble on a null pointer than to corrupt memory */
1912 	connp->conn_g_next = NULL;
1913 	connp->conn_g_prev = NULL;
1914 }
1915 
1916 /*
1917  * Walk the list of all conn_t's in the system, calling the function provided
1918  * with the specified argument for each.
1919  * Applies to both IPv4 and IPv6.
1920  *
1921  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
1922  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
1923  * unplumbed or removed. New conn_t's that are created while we are walking
1924  * may be missed by this walk, because they are not necessarily inserted
1925  * at the tail of the list. They are new conn_t's and thus don't have any
1926  * stale pointers. The CONN_CLOSING flag ensures that no new reference
1927  * is created to the struct that is going away.
1928  */
1929 void
1930 ipcl_walk(pfv_t func, void *arg)
1931 {
1932 	int	i;
1933 	conn_t	*connp;
1934 	conn_t	*prev_connp;
1935 
1936 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
1937 		mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1938 		prev_connp = NULL;
1939 		connp = ipcl_globalhash_fanout[i].connf_head;
1940 		while (connp != NULL) {
1941 			mutex_enter(&connp->conn_lock);
1942 			if (connp->conn_state_flags &
1943 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
1944 				mutex_exit(&connp->conn_lock);
1945 				connp = connp->conn_g_next;
1946 				continue;
1947 			}
1948 			CONN_INC_REF_LOCKED(connp);
1949 			mutex_exit(&connp->conn_lock);
1950 			mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1951 			(*func)(connp, arg);
1952 			if (prev_connp != NULL)
1953 				CONN_DEC_REF(prev_connp);
1954 			mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1955 			prev_connp = connp;
1956 			connp = connp->conn_g_next;
1957 		}
1958 		mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1959 		if (prev_connp != NULL)
1960 			CONN_DEC_REF(prev_connp);
1961 	}
1962 }
1963 
1964 /*
1965  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
1966  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1967  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1968  * (peer tcp in ESTABLISHED state).
1969  */
1970 conn_t *
1971 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
1972 {
1973 	uint32_t ports;
1974 	uint16_t *pports = (uint16_t *)&ports;
1975 	connf_t	*connfp;
1976 	conn_t	*tconnp;
1977 	boolean_t zone_chk;
1978 
1979 	/*
1980 	 * If either the source of destination address is loopback, then
1981 	 * both endpoints must be in the same Zone.  Otherwise, both of
1982 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1983 	 * state) and the endpoints may reside in different Zones.
1984 	 */
1985 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
1986 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
1987 
1988 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1989 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1990 
1991 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1992 
1993 	mutex_enter(&connfp->connf_lock);
1994 	for (tconnp = connfp->connf_head; tconnp != NULL;
1995 	    tconnp = tconnp->conn_next) {
1996 
1997 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1998 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1999 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2000 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2001 
2002 			ASSERT(tconnp != connp);
2003 			CONN_INC_REF(tconnp);
2004 			mutex_exit(&connfp->connf_lock);
2005 			return (tconnp);
2006 		}
2007 	}
2008 	mutex_exit(&connfp->connf_lock);
2009 	return (NULL);
2010 }
2011 
2012 /*
2013  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2014  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2015  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2016  * (peer tcp in ESTABLISHED state).
2017  */
2018 conn_t *
2019 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
2020 {
2021 	uint32_t ports;
2022 	uint16_t *pports = (uint16_t *)&ports;
2023 	connf_t	*connfp;
2024 	conn_t	*tconnp;
2025 	boolean_t zone_chk;
2026 
2027 	/*
2028 	 * If either the source of destination address is loopback, then
2029 	 * both endpoints must be in the same Zone.  Otherwise, both of
2030 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2031 	 * state) and the endpoints may reside in different Zones.  We
2032 	 * don't do Zone check for link local address(es) because the
2033 	 * current Zone implementation treats each link local address as
2034 	 * being unique per system node, i.e. they belong to global Zone.
2035 	 */
2036 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2037 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2038 
2039 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2040 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2041 
2042 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
2043 
2044 	mutex_enter(&connfp->connf_lock);
2045 	for (tconnp = connfp->connf_head; tconnp != NULL;
2046 	    tconnp = tconnp->conn_next) {
2047 
2048 		/* We skip tcp_bound_if check here as this is loopback tcp */
2049 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2050 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2051 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2052 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2053 
2054 			ASSERT(tconnp != connp);
2055 			CONN_INC_REF(tconnp);
2056 			mutex_exit(&connfp->connf_lock);
2057 			return (tconnp);
2058 		}
2059 	}
2060 	mutex_exit(&connfp->connf_lock);
2061 	return (NULL);
2062 }
2063 
2064 /*
2065  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2066  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2067  * Only checks for connected entries i.e. no INADDR_ANY checks.
2068  */
2069 conn_t *
2070 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
2071 {
2072 	uint32_t ports;
2073 	uint16_t *pports;
2074 	connf_t	*connfp;
2075 	conn_t	*tconnp;
2076 
2077 	pports = (uint16_t *)&ports;
2078 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2079 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2080 
2081 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
2082 
2083 	mutex_enter(&connfp->connf_lock);
2084 	for (tconnp = connfp->connf_head; tconnp != NULL;
2085 	    tconnp = tconnp->conn_next) {
2086 
2087 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2088 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2089 		    tconnp->conn_tcp->tcp_state >= min_state) {
2090 
2091 			CONN_INC_REF(tconnp);
2092 			mutex_exit(&connfp->connf_lock);
2093 			return (tconnp);
2094 		}
2095 	}
2096 	mutex_exit(&connfp->connf_lock);
2097 	return (NULL);
2098 }
2099 
2100 /*
2101  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2102  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2103  * Only checks for connected entries i.e. no INADDR_ANY checks.
2104  * Match on ifindex in addition to addresses.
2105  */
2106 conn_t *
2107 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2108     uint_t ifindex)
2109 {
2110 	tcp_t	*tcp;
2111 	uint32_t ports;
2112 	uint16_t *pports;
2113 	connf_t	*connfp;
2114 	conn_t	*tconnp;
2115 
2116 	pports = (uint16_t *)&ports;
2117 	pports[0] = tcpha->tha_fport;
2118 	pports[1] = tcpha->tha_lport;
2119 
2120 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
2121 
2122 	mutex_enter(&connfp->connf_lock);
2123 	for (tconnp = connfp->connf_head; tconnp != NULL;
2124 	    tconnp = tconnp->conn_next) {
2125 
2126 		tcp = tconnp->conn_tcp;
2127 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2128 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2129 		    tcp->tcp_state >= min_state &&
2130 		    (tcp->tcp_bound_if == 0 ||
2131 		    tcp->tcp_bound_if == ifindex)) {
2132 
2133 			CONN_INC_REF(tconnp);
2134 			mutex_exit(&connfp->connf_lock);
2135 			return (tconnp);
2136 		}
2137 	}
2138 	mutex_exit(&connfp->connf_lock);
2139 	return (NULL);
2140 }
2141 
2142 /*
2143  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2144  * a listener when changing state.
2145  */
2146 conn_t *
2147 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
2148 {
2149 	connf_t		*bind_connfp;
2150 	conn_t		*connp;
2151 	tcp_t		*tcp;
2152 
2153 	/*
2154 	 * Avoid false matches for packets sent to an IP destination of
2155 	 * all zeros.
2156 	 */
2157 	if (laddr == 0)
2158 		return (NULL);
2159 
2160 	ASSERT(zoneid != ALL_ZONES);
2161 
2162 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
2163 	mutex_enter(&bind_connfp->connf_lock);
2164 	for (connp = bind_connfp->connf_head; connp != NULL;
2165 	    connp = connp->conn_next) {
2166 		tcp = connp->conn_tcp;
2167 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2168 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2169 		    (tcp->tcp_listener == NULL)) {
2170 			CONN_INC_REF(connp);
2171 			mutex_exit(&bind_connfp->connf_lock);
2172 			return (connp);
2173 		}
2174 	}
2175 	mutex_exit(&bind_connfp->connf_lock);
2176 	return (NULL);
2177 }
2178 
2179 /*
2180  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2181  * a listener when changing state.
2182  */
2183 conn_t *
2184 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2185     zoneid_t zoneid)
2186 {
2187 	connf_t		*bind_connfp;
2188 	conn_t		*connp = NULL;
2189 	tcp_t		*tcp;
2190 
2191 	/*
2192 	 * Avoid false matches for packets sent to an IP destination of
2193 	 * all zeros.
2194 	 */
2195 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2196 		return (NULL);
2197 
2198 	ASSERT(zoneid != ALL_ZONES);
2199 
2200 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
2201 	mutex_enter(&bind_connfp->connf_lock);
2202 	for (connp = bind_connfp->connf_head; connp != NULL;
2203 	    connp = connp->conn_next) {
2204 		tcp = connp->conn_tcp;
2205 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2206 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2207 		    (tcp->tcp_bound_if == 0 ||
2208 		    tcp->tcp_bound_if == ifindex) &&
2209 		    tcp->tcp_listener == NULL) {
2210 			CONN_INC_REF(connp);
2211 			mutex_exit(&bind_connfp->connf_lock);
2212 			return (connp);
2213 		}
2214 	}
2215 	mutex_exit(&bind_connfp->connf_lock);
2216 	return (NULL);
2217 }
2218 
2219 /*
2220  * ipcl_get_next_conn
2221  *	get the next entry in the conn global list
2222  *	and put a reference on the next_conn.
2223  *	decrement the reference on the current conn.
2224  *
2225  * This is an iterator based walker function that also provides for
2226  * some selection by the caller. It walks through the conn_hash bucket
2227  * searching for the next valid connp in the list, and selects connections
2228  * that are neither closed nor condemned. It also REFHOLDS the conn
2229  * thus ensuring that the conn exists when the caller uses the conn.
2230  */
2231 conn_t *
2232 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2233 {
2234 	conn_t	*next_connp;
2235 
2236 	if (connfp == NULL)
2237 		return (NULL);
2238 
2239 	mutex_enter(&connfp->connf_lock);
2240 
2241 	next_connp = (connp == NULL) ?
2242 	    connfp->connf_head : connp->conn_g_next;
2243 
2244 	while (next_connp != NULL) {
2245 		mutex_enter(&next_connp->conn_lock);
2246 		if (!(next_connp->conn_flags & conn_flags) ||
2247 		    (next_connp->conn_state_flags &
2248 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2249 			/*
2250 			 * This conn has been condemned or
2251 			 * is closing, or the flags don't match
2252 			 */
2253 			mutex_exit(&next_connp->conn_lock);
2254 			next_connp = next_connp->conn_g_next;
2255 			continue;
2256 		}
2257 		CONN_INC_REF_LOCKED(next_connp);
2258 		mutex_exit(&next_connp->conn_lock);
2259 		break;
2260 	}
2261 
2262 	mutex_exit(&connfp->connf_lock);
2263 
2264 	if (connp != NULL)
2265 		CONN_DEC_REF(connp);
2266 
2267 	return (next_connp);
2268 }
2269 
2270 #ifdef CONN_DEBUG
2271 /*
2272  * Trace of the last NBUF refhold/refrele
2273  */
2274 int
2275 conn_trace_ref(conn_t *connp)
2276 {
2277 	int	last;
2278 	conn_trace_t	*ctb;
2279 
2280 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2281 	last = connp->conn_trace_last;
2282 	last++;
2283 	if (last == CONN_TRACE_MAX)
2284 		last = 0;
2285 
2286 	ctb = &connp->conn_trace_buf[last];
2287 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2288 	connp->conn_trace_last = last;
2289 	return (1);
2290 }
2291 
2292 int
2293 conn_untrace_ref(conn_t *connp)
2294 {
2295 	int	last;
2296 	conn_trace_t	*ctb;
2297 
2298 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2299 	last = connp->conn_trace_last;
2300 	last++;
2301 	if (last == CONN_TRACE_MAX)
2302 		last = 0;
2303 
2304 	ctb = &connp->conn_trace_buf[last];
2305 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2306 	connp->conn_trace_last = last;
2307 	return (1);
2308 }
2309 #endif
2310