xref: /titanic_50/usr/src/uts/common/inet/ip/ipclassifier.c (revision d676c6678e0d7d3d3a700014d359b227edba0042)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP PACKET CLASSIFIER
28  *
29  * The IP packet classifier provides mapping between IP packets and persistent
30  * connection state for connection-oriented protocols. It also provides
31  * interface for managing connection states.
32  *
33  * The connection state is kept in conn_t data structure and contains, among
34  * other things:
35  *
36  *	o local/remote address and ports
37  *	o Transport protocol
38  *	o squeue for the connection (for TCP only)
39  *	o reference counter
40  *	o Connection state
41  *	o hash table linkage
42  *	o interface/ire information
43  *	o credentials
44  *	o ipsec policy
45  *	o send and receive functions.
46  *	o mutex lock.
47  *
48  * Connections use a reference counting scheme. They are freed when the
49  * reference counter drops to zero. A reference is incremented when connection
50  * is placed in a list or table, when incoming packet for the connection arrives
51  * and when connection is processed via squeue (squeue processing may be
52  * asynchronous and the reference protects the connection from being destroyed
53  * before its processing is finished).
54  *
55  * send and receive functions are currently used for TCP only. The send function
56  * determines the IP entry point for the packet once it leaves TCP to be sent to
57  * the destination address. The receive function is used by IP when the packet
58  * should be passed for TCP processing. When a new connection is created these
59  * are set to ip_output() and tcp_input() respectively. During the lifetime of
60  * the connection the send and receive functions may change depending on the
61  * changes in the connection state. For example, once the connection is bound to
62  * an address, the receive function for this connection is set to
63  * tcp_conn_request().  This allows incoming SYNs to go directly into the
64  * listener SYN processing function without going to tcp_input() first.
65  *
66  * Classifier uses several hash tables:
67  *
68  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
69  *	ipcl_bind_fanout:	contains all connections in BOUND state
70  *	ipcl_proto_fanout:	IPv4 protocol fanout
71  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
72  *	ipcl_udp_fanout:	contains all UDP connections
73  *	ipcl_iptun_fanout:	contains all IP tunnel connections
74  *	ipcl_globalhash_fanout:	contains all connections
75  *
76  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
77  * which need to view all existing connections.
78  *
79  * All tables are protected by per-bucket locks. When both per-bucket lock and
80  * connection lock need to be held, the per-bucket lock should be acquired
81  * first, followed by the connection lock.
82  *
83  * All functions doing search in one of these tables increment a reference
84  * counter on the connection found (if any). This reference should be dropped
85  * when the caller has finished processing the connection.
86  *
87  *
88  * INTERFACES:
89  * ===========
90  *
91  * Connection Lookup:
92  * ------------------
93  *
94  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
95  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
96  *
97  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
98  * it can't find any associated connection. If the connection is found, its
99  * reference counter is incremented.
100  *
101  *	mp:	mblock, containing packet header. The full header should fit
102  *		into a single mblock. It should also contain at least full IP
103  *		and TCP or UDP header.
104  *
105  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
106  *
107  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
108  *		 the packet.
109  *
110  * 	zoneid: The zone in which the returned connection must be; the zoneid
111  *		corresponding to the ire_zoneid on the IRE located for the
112  *		packet's destination address.
113  *
114  *	For TCP connections, the lookup order is as follows:
115  *		5-tuple {src, dst, protocol, local port, remote port}
116  *			lookup in ipcl_conn_fanout table.
117  *		3-tuple {dst, remote port, protocol} lookup in
118  *			ipcl_bind_fanout table.
119  *
120  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
121  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
122  *	these interfaces do not handle cases where a packet belongs
123  *	to multiple UDP clients, which is handled in IP itself.
124  *
125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
126  * determine which actual zone gets the segment.  This is used only in a
127  * labeled environment.  The matching rules are:
128  *
129  *	- If it's not a multilevel port, then the label on the packet selects
130  *	  the zone.  Unlabeled packets are delivered to the global zone.
131  *
132  *	- If it's a multilevel port, then only the zone registered to receive
133  *	  packets on that port matches.
134  *
135  * Also, in a labeled environment, packet labels need to be checked.  For fully
136  * bound TCP connections, we can assume that the packet label was checked
137  * during connection establishment, and doesn't need to be checked on each
138  * packet.  For others, though, we need to check for strict equality or, for
139  * multilevel ports, membership in the range or set.  This part currently does
140  * a tnrh lookup on each packet, but could be optimized to use cached results
141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
142  * we would apply the same rules as TCP.)
143  *
144  * An implication of the above is that fully-bound TCP sockets must always use
145  * distinct 4-tuples; they can't be discriminated by label alone.
146  *
147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
148  * as there's no connection set-up handshake and no shared state.
149  *
150  * Labels on looped-back packets within a single zone do not need to be
151  * checked, as all processes in the same zone have the same label.
152  *
153  * Finally, for unlabeled packets received by a labeled system, special rules
154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
155  * socket in the zone whose label matches the default label of the sender, if
156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
157  * receiver's label must dominate the sender's default label.
158  *
159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161  *					 ip_stack);
162  *
163  *	Lookup routine to find an exact match for {src, dst, local port,
164  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
165  *	ports are read from the IP and TCP header respectively.
166  *
167  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
168  *					 zoneid, ip_stack);
169  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170  *					 zoneid, ip_stack);
171  *
172  * 	Lookup routine to find a listener with the tuple {lport, laddr,
173  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174  * 	parameter interface index is also compared.
175  *
176  * void ipcl_walk(func, arg, ip_stack)
177  *
178  * 	Apply 'func' to every connection available. The 'func' is called as
179  *	(*func)(connp, arg). The walk is non-atomic so connections may be
180  *	created and destroyed during the walk. The CONN_CONDEMNED and
181  *	CONN_INCIPIENT flags ensure that connections which are newly created
182  *	or being destroyed are not selected by the walker.
183  *
184  * Table Updates
185  * -------------
186  *
187  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
188  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
189  *
190  *	Insert 'connp' in the ipcl_conn_fanout.
191  *	Arguments :
192  *		connp		conn_t to be inserted
193  *		protocol	connection protocol
194  *		src		source address
195  *		dst		destination address
196  *		ports		local and remote port
197  *		ifindex		interface index for IPv6 connections
198  *
199  *	Return value :
200  *		0		if connp was inserted
201  *		EADDRINUSE	if the connection with the same tuple
202  *				already exists.
203  *
204  * int ipcl_bind_insert(connp, protocol, src, lport);
205  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
206  *
207  * 	Insert 'connp' in ipcl_bind_fanout.
208  * 	Arguments :
209  * 		connp		conn_t to be inserted
210  * 		protocol	connection protocol
211  * 		src		source address connection wants
212  * 				to bind to
213  * 		lport		local port connection wants to
214  * 				bind to
215  *
216  *
217  * void ipcl_hash_remove(connp);
218  *
219  * 	Removes the 'connp' from the connection fanout table.
220  *
221  * Connection Creation/Destruction
222  * -------------------------------
223  *
224  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
225  *
226  * 	Creates a new conn based on the type flag, inserts it into
227  * 	globalhash table.
228  *
229  *	type:	This flag determines the type of conn_t which needs to be
230  *		created i.e., which kmem_cache it comes from.
231  *		IPCL_TCPCONN	indicates a TCP connection
232  *		IPCL_SCTPCONN	indicates a SCTP connection
233  *		IPCL_UDPCONN	indicates a UDP conn_t.
234  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
235  *		IPCL_RTSCONN	indicates a RTS conn_t.
236  *		IPCL_IPCCONN	indicates all other connections.
237  *
238  * void ipcl_conn_destroy(connp)
239  *
240  * 	Destroys the connection state, removes it from the global
241  * 	connection hash table and frees its memory.
242  */
243 
244 #include <sys/types.h>
245 #include <sys/stream.h>
246 #include <sys/stropts.h>
247 #include <sys/sysmacros.h>
248 #include <sys/strsubr.h>
249 #include <sys/strsun.h>
250 #define	_SUN_TPI_VERSION 2
251 #include <sys/ddi.h>
252 #include <sys/cmn_err.h>
253 #include <sys/debug.h>
254 
255 #include <sys/systm.h>
256 #include <sys/param.h>
257 #include <sys/kmem.h>
258 #include <sys/isa_defs.h>
259 #include <inet/common.h>
260 #include <netinet/ip6.h>
261 #include <netinet/icmp6.h>
262 
263 #include <inet/ip.h>
264 #include <inet/ip6.h>
265 #include <inet/ip_ndp.h>
266 #include <inet/ip_impl.h>
267 #include <inet/udp_impl.h>
268 #include <inet/sctp_ip.h>
269 #include <inet/sctp/sctp_impl.h>
270 #include <inet/rawip_impl.h>
271 #include <inet/rts_impl.h>
272 #include <inet/iptun/iptun_impl.h>
273 
274 #include <sys/cpuvar.h>
275 
276 #include <inet/ipclassifier.h>
277 #include <inet/tcp.h>
278 #include <inet/ipsec_impl.h>
279 
280 #include <sys/tsol/tnet.h>
281 #include <sys/sockio.h>
282 
/*
 * Classifier debug tracing is compiled in only when the kernel is built
 * with DEBUG.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

#ifdef	IPCL_DEBUG
/* Bit mask of enabled trace classes; zero (the default) disables tracing. */
int	ipcl_debug_level = 0;

/*
 * Print 'args' (a parenthesized printf() argument list) when any bit of
 * 'level' is set in ipcl_debug_level.
 *
 * 'level' is parenthesized in the expansion so that a compound mask such as
 * (4 | 8) is tested as a whole; without the parentheses operator precedence
 * turned the test into (ipcl_debug_level & 4) | 8, which is always true.
 *
 * NOTE: the expansion is a bare if statement, so do not use this macro as
 * the unbraced body of an if that has an else clause.
 */
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
/* Tracing compiled out: expands to an empty block. */
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
296 /* Old value for compatibility. Settable in /etc/system */
297 uint_t tcp_conn_hash_size = 0;
298 
299 /* New value. Zero means choose automatically.  Settable in /etc/system */
300 uint_t ipcl_conn_hash_size = 0;
301 uint_t ipcl_conn_hash_memfactor = 8192;
302 uint_t ipcl_conn_hash_maxsize = 82500;
303 
304 /* bind/udp fanout table size */
305 uint_t ipcl_bind_fanout_size = 512;
306 uint_t ipcl_udp_fanout_size = 16384;
307 
308 /* Raw socket fanout size.  Must be a power of 2. */
309 uint_t ipcl_raw_fanout_size = 256;
310 
311 /*
312  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
313  * expect that most large deployments would have hundreds of tunnels, and
314  * thousands in the extreme case.
315  */
316 uint_t ipcl_iptun_fanout_size = 6143;
317 
318 /*
319  * Power of 2^N Primes useful for hashing for N of 0-28,
320  * these primes are the nearest prime <= 2^N - 2^(N-2).
321  */
322 
323 #define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
324 		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
325 		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
326 		50331599, 100663291, 201326557, 0}
327 
328 /*
329  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
330  * are aligned on cache lines.
331  */
332 typedef union itc_s {
333 	conn_t	itc_conn;
334 	char	itcu_filler[CACHE_ALIGN(conn_s)];
335 } itc_t;
336 
337 struct kmem_cache  *tcp_conn_cache;
338 struct kmem_cache  *ip_conn_cache;
339 struct kmem_cache  *ip_helper_stream_cache;
340 extern struct kmem_cache  *sctp_conn_cache;
341 extern struct kmem_cache  *tcp_sack_info_cache;
342 extern struct kmem_cache  *tcp_iphc_cache;
343 struct kmem_cache  *udp_conn_cache;
344 struct kmem_cache  *rawip_conn_cache;
345 struct kmem_cache  *rts_conn_cache;
346 
347 extern void	tcp_timermp_free(tcp_t *);
348 extern mblk_t	*tcp_timermp_alloc(int);
349 
350 static int	ip_conn_constructor(void *, void *, int);
351 static void	ip_conn_destructor(void *, void *);
352 
353 static int	tcp_conn_constructor(void *, void *, int);
354 static void	tcp_conn_destructor(void *, void *);
355 
356 static int	udp_conn_constructor(void *, void *, int);
357 static void	udp_conn_destructor(void *, void *);
358 
359 static int	rawip_conn_constructor(void *, void *, int);
360 static void	rawip_conn_destructor(void *, void *);
361 
362 static int	rts_conn_constructor(void *, void *, int);
363 static void	rts_conn_destructor(void *, void *);
364 
365 static int	ip_helper_stream_constructor(void *, void *, int);
366 static void	ip_helper_stream_destructor(void *, void *);
367 
368 boolean_t	ip_use_helper_cache = B_TRUE;
369 
370 /*
371  * Hook functions to enable cluster networking
372  * On non-clustered systems these vectors must always be NULL.
373  */
374 extern void	(*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
375 		    uint8_t *, in_port_t, void *);
376 extern void	(*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
377 		    uint8_t *, in_port_t, void *);
378 
379 #ifdef	IPCL_DEBUG
380 #define	INET_NTOA_BUFSIZE	18
381 
/*
 * Format the 32-bit IPv4 address 'in' as dotted-decimal text into the
 * caller-supplied buffer 'b' and return 'b'.  The four octets are printed
 * in the order in which they are stored in memory.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octets = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octets[0], octets[1], octets[2], octets[3]);
	return (b);
}
391 #endif
392 
393 /*
394  * Global (for all stack instances) init routine
395  */
396 void
397 ipcl_g_init(void)
398 {
399 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
400 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
401 	    ip_conn_constructor, ip_conn_destructor,
402 	    NULL, NULL, NULL, 0);
403 
404 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
405 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
406 	    tcp_conn_constructor, tcp_conn_destructor,
407 	    NULL, NULL, NULL, 0);
408 
409 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
410 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
411 	    udp_conn_constructor, udp_conn_destructor,
412 	    NULL, NULL, NULL, 0);
413 
414 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
415 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
416 	    rawip_conn_constructor, rawip_conn_destructor,
417 	    NULL, NULL, NULL, 0);
418 
419 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
420 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
421 	    rts_conn_constructor, rts_conn_destructor,
422 	    NULL, NULL, NULL, 0);
423 
424 	if (ip_use_helper_cache) {
425 		ip_helper_stream_cache = kmem_cache_create
426 		    ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
427 		    CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
428 		    ip_helper_stream_destructor, NULL, NULL, NULL, 0);
429 	} else {
430 		ip_helper_stream_cache = NULL;
431 	}
432 }
433 
434 /*
435  * ipclassifier intialization routine, sets up hash tables.
436  */
437 void
438 ipcl_init(ip_stack_t *ipst)
439 {
440 	int i;
441 	int sizes[] = P2Ps();
442 
443 	/*
444 	 * Calculate size of conn fanout table from /etc/system settings
445 	 */
446 	if (ipcl_conn_hash_size != 0) {
447 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
448 	} else if (tcp_conn_hash_size != 0) {
449 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
450 	} else {
451 		extern pgcnt_t freemem;
452 
453 		ipst->ips_ipcl_conn_fanout_size =
454 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
455 
456 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
457 			ipst->ips_ipcl_conn_fanout_size =
458 			    ipcl_conn_hash_maxsize;
459 		}
460 	}
461 
462 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
463 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
464 			break;
465 		}
466 	}
467 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
468 		/* Out of range, use the 2^16 value */
469 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
470 	}
471 
472 	/* Take values from /etc/system */
473 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
474 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
475 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
476 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
477 
478 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
479 
480 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
481 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
482 
483 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
484 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
485 		    MUTEX_DEFAULT, NULL);
486 	}
487 
488 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
489 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
490 
491 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
492 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
493 		    MUTEX_DEFAULT, NULL);
494 	}
495 
496 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
497 	    sizeof (connf_t), KM_SLEEP);
498 	for (i = 0; i < IPPROTO_MAX; i++) {
499 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
500 		    MUTEX_DEFAULT, NULL);
501 	}
502 
503 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
504 	    sizeof (connf_t), KM_SLEEP);
505 	for (i = 0; i < IPPROTO_MAX; i++) {
506 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
507 		    MUTEX_DEFAULT, NULL);
508 	}
509 
510 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
511 	mutex_init(&ipst->ips_rts_clients->connf_lock,
512 	    NULL, MUTEX_DEFAULT, NULL);
513 
514 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
515 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
516 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
517 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
518 		    MUTEX_DEFAULT, NULL);
519 	}
520 
521 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
522 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
523 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
524 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
525 		    MUTEX_DEFAULT, NULL);
526 	}
527 
528 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
529 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
530 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
531 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
532 		    MUTEX_DEFAULT, NULL);
533 	}
534 
535 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
536 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
537 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
538 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
539 		    NULL, MUTEX_DEFAULT, NULL);
540 	}
541 }
542 
543 void
544 ipcl_g_destroy(void)
545 {
546 	kmem_cache_destroy(ip_conn_cache);
547 	kmem_cache_destroy(tcp_conn_cache);
548 	kmem_cache_destroy(udp_conn_cache);
549 	kmem_cache_destroy(rawip_conn_cache);
550 	kmem_cache_destroy(rts_conn_cache);
551 }
552 
553 /*
554  * All user-level and kernel use of the stack must be gone
555  * by now.
556  */
557 void
558 ipcl_destroy(ip_stack_t *ipst)
559 {
560 	int i;
561 
562 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
563 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
564 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
565 	}
566 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
567 	    sizeof (connf_t));
568 	ipst->ips_ipcl_conn_fanout = NULL;
569 
570 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
571 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
572 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
573 	}
574 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
575 	    sizeof (connf_t));
576 	ipst->ips_ipcl_bind_fanout = NULL;
577 
578 	for (i = 0; i < IPPROTO_MAX; i++) {
579 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
580 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
581 	}
582 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
583 	ipst->ips_ipcl_proto_fanout = NULL;
584 
585 	for (i = 0; i < IPPROTO_MAX; i++) {
586 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
587 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
588 	}
589 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
590 	    IPPROTO_MAX * sizeof (connf_t));
591 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
592 
593 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
594 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
595 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
596 	}
597 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
598 	    sizeof (connf_t));
599 	ipst->ips_ipcl_udp_fanout = NULL;
600 
601 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
602 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
603 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
604 	}
605 	kmem_free(ipst->ips_ipcl_iptun_fanout,
606 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
607 	ipst->ips_ipcl_iptun_fanout = NULL;
608 
609 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
610 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
611 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
612 	}
613 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
614 	    sizeof (connf_t));
615 	ipst->ips_ipcl_raw_fanout = NULL;
616 
617 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
618 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
619 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
620 	}
621 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
622 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
623 	ipst->ips_ipcl_globalhash_fanout = NULL;
624 
625 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
626 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
627 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
628 	ipst->ips_rts_clients = NULL;
629 }
630 
631 /*
632  * conn creation routine. initialize the conn, sets the reference
633  * and inserts it in the global hash table.
634  */
635 conn_t *
636 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
637 {
638 	conn_t	*connp;
639 	sctp_stack_t *sctps;
640 	struct kmem_cache *conn_cache;
641 
642 	switch (type) {
643 	case IPCL_SCTPCONN:
644 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
645 			return (NULL);
646 		sctp_conn_init(connp);
647 		sctps = ns->netstack_sctp;
648 		SCTP_G_Q_REFHOLD(sctps);
649 		netstack_hold(ns);
650 		connp->conn_netstack = ns;
651 		return (connp);
652 
653 	case IPCL_TCPCONN:
654 		conn_cache = tcp_conn_cache;
655 		break;
656 
657 	case IPCL_UDPCONN:
658 		conn_cache = udp_conn_cache;
659 		break;
660 
661 	case IPCL_RAWIPCONN:
662 		conn_cache = rawip_conn_cache;
663 		break;
664 
665 	case IPCL_RTSCONN:
666 		conn_cache = rts_conn_cache;
667 		break;
668 
669 	case IPCL_IPCCONN:
670 		conn_cache = ip_conn_cache;
671 		break;
672 
673 	default:
674 		connp = NULL;
675 		ASSERT(0);
676 	}
677 
678 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
679 		return (NULL);
680 
681 	connp->conn_ref = 1;
682 	netstack_hold(ns);
683 	connp->conn_netstack = ns;
684 	ipcl_globalhash_insert(connp);
685 	return (connp);
686 }
687 
688 void
689 ipcl_conn_destroy(conn_t *connp)
690 {
691 	mblk_t	*mp;
692 	netstack_t	*ns = connp->conn_netstack;
693 
694 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
695 	ASSERT(connp->conn_ref == 0);
696 	ASSERT(connp->conn_ire_cache == NULL);
697 
698 	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
699 
700 	if (connp->conn_effective_cred != NULL) {
701 		crfree(connp->conn_effective_cred);
702 		connp->conn_effective_cred = NULL;
703 	}
704 
705 	if (connp->conn_cred != NULL) {
706 		crfree(connp->conn_cred);
707 		connp->conn_cred = NULL;
708 	}
709 
710 	ipcl_globalhash_remove(connp);
711 
712 	/* FIXME: add separate tcp_conn_free()? */
713 	if (connp->conn_flags & IPCL_TCPCONN) {
714 		tcp_t	*tcp = connp->conn_tcp;
715 		tcp_stack_t *tcps;
716 
717 		ASSERT(tcp != NULL);
718 		tcps = tcp->tcp_tcps;
719 		if (tcps != NULL) {
720 			if (connp->conn_latch != NULL) {
721 				IPLATCH_REFRELE(connp->conn_latch, ns);
722 				connp->conn_latch = NULL;
723 			}
724 			if (connp->conn_policy != NULL) {
725 				IPPH_REFRELE(connp->conn_policy, ns);
726 				connp->conn_policy = NULL;
727 			}
728 			tcp->tcp_tcps = NULL;
729 			TCPS_REFRELE(tcps);
730 		}
731 
732 		tcp_free(tcp);
733 		mp = tcp->tcp_timercache;
734 		tcp->tcp_cred = NULL;
735 
736 		if (tcp->tcp_sack_info != NULL) {
737 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
738 			kmem_cache_free(tcp_sack_info_cache,
739 			    tcp->tcp_sack_info);
740 		}
741 		if (tcp->tcp_iphc != NULL) {
742 			if (tcp->tcp_hdr_grown) {
743 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
744 			} else {
745 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
746 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
747 			}
748 			tcp->tcp_iphc_len = 0;
749 		}
750 		ASSERT(tcp->tcp_iphc_len == 0);
751 
752 		/*
753 		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
754 		 * the mblk.
755 		 */
756 		if (tcp->tcp_rsrv_mp != NULL) {
757 			freeb(tcp->tcp_rsrv_mp);
758 			tcp->tcp_rsrv_mp = NULL;
759 			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
760 		}
761 
762 		ASSERT(connp->conn_latch == NULL);
763 		ASSERT(connp->conn_policy == NULL);
764 
765 		if (ns != NULL) {
766 			ASSERT(tcp->tcp_tcps == NULL);
767 			connp->conn_netstack = NULL;
768 			netstack_rele(ns);
769 		}
770 
771 		ipcl_conn_cleanup(connp);
772 		connp->conn_flags = IPCL_TCPCONN;
773 		bzero(tcp, sizeof (tcp_t));
774 
775 		tcp->tcp_timercache = mp;
776 		tcp->tcp_connp = connp;
777 		kmem_cache_free(tcp_conn_cache, connp);
778 		return;
779 	}
780 	if (connp->conn_latch != NULL) {
781 		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
782 		connp->conn_latch = NULL;
783 	}
784 	if (connp->conn_policy != NULL) {
785 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
786 		connp->conn_policy = NULL;
787 	}
788 	if (connp->conn_ipsec_opt_mp != NULL) {
789 		freemsg(connp->conn_ipsec_opt_mp);
790 		connp->conn_ipsec_opt_mp = NULL;
791 	}
792 
793 	if (connp->conn_flags & IPCL_SCTPCONN) {
794 		ASSERT(ns != NULL);
795 		sctp_free(connp);
796 		return;
797 	}
798 
799 	if (ns != NULL) {
800 		connp->conn_netstack = NULL;
801 		netstack_rele(ns);
802 	}
803 
804 	ipcl_conn_cleanup(connp);
805 
806 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
807 	if (connp->conn_flags & IPCL_UDPCONN) {
808 		connp->conn_flags = IPCL_UDPCONN;
809 		kmem_cache_free(udp_conn_cache, connp);
810 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
811 
812 		connp->conn_flags = IPCL_RAWIPCONN;
813 		connp->conn_ulp = IPPROTO_ICMP;
814 		kmem_cache_free(rawip_conn_cache, connp);
815 	} else if (connp->conn_flags & IPCL_RTSCONN) {
816 		connp->conn_flags = IPCL_RTSCONN;
817 		kmem_cache_free(rts_conn_cache, connp);
818 	} else {
819 		connp->conn_flags = IPCL_IPCCONN;
820 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
821 		ASSERT(connp->conn_priv == NULL);
822 		kmem_cache_free(ip_conn_cache, connp);
823 	}
824 }
825 
826 /*
827  * Running in cluster mode - deregister listener information
828  */
829 
830 static void
831 ipcl_conn_unlisten(conn_t *connp)
832 {
833 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
834 	ASSERT(connp->conn_lport != 0);
835 
836 	if (cl_inet_unlisten != NULL) {
837 		sa_family_t	addr_family;
838 		uint8_t		*laddrp;
839 
840 		if (connp->conn_pkt_isv6) {
841 			addr_family = AF_INET6;
842 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
843 		} else {
844 			addr_family = AF_INET;
845 			laddrp = (uint8_t *)&connp->conn_bound_source;
846 		}
847 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
848 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
849 	}
850 	connp->conn_flags &= ~IPCL_CL_LISTENER;
851 }
852 
/*
 * Take (connp) off the fanout bucket recorded in its conn_fanout field and
 * drop the reference the bucket held on it; a conn that is on no bucket is
 * left untouched.  The bucket lock is acquired here, so the caller must not
 * hold conn_lock (asserted).
 *
 * IPCL_REMOVED is set instead of clearing the flag that names the table the
 * conn was on, so that the former table can still be seen when debugging.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*hr_connfp = (connp)->conn_fanout;			\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (hr_connfp != NULL) {					\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&hr_connfp->connf_lock);			\
		if ((connp)->conn_prev == NULL)				\
			hr_connfp->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&hr_connfp->connf_lock);			\
	}								\
}
883 
884 void
885 ipcl_hash_remove(conn_t *connp)
886 {
887 	IPCL_HASH_REMOVE(connp);
888 }
889 
890 /*
891  * The whole purpose of this function is allow removal of
892  * a conn_t from the connected hash for timewait reclaim.
893  * This is essentially a TW reclaim fastpath where timewait
894  * collector checks under fanout lock (so no one else can
895  * get access to the conn_t) that refcnt is 2 i.e. one for
896  * TCP and one for the classifier hash list. If ref count
897  * is indeed 2, we can just remove the conn under lock and
898  * avoid cleaning up the conn under squeue. This gives us
899  * improved performance.
900  */
901 void
902 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
903 {
904 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
905 	ASSERT(MUTEX_HELD(&connp->conn_lock));
906 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
907 
908 	if ((connp)->conn_next != NULL) {
909 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
910 	}
911 	if ((connp)->conn_prev != NULL) {
912 		(connp)->conn_prev->conn_next = (connp)->conn_next;
913 	} else {
914 		connfp->connf_head = (connp)->conn_next;
915 	}
916 	(connp)->conn_fanout = NULL;
917 	(connp)->conn_next = NULL;
918 	(connp)->conn_prev = NULL;
919 	(connp)->conn_flags |= IPCL_REMOVED;
920 	ASSERT((connp)->conn_ref == 2);
921 	(connp)->conn_ref--;
922 }
923 
/*
 * IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp): push connp onto the
 * head of the bucket connfp.  This macro does not take connf_lock itself;
 * the callers in this file invoke it with connfp->connf_lock held.  connp
 * must not be on any list (asserted via conn_fanout/conn_next/conn_prev).
 *
 * IPCL_REMOVED is cleared, IPCL_CONNECTED is set, and a reference is taken
 * with CONN_INC_REF on behalf of the list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
938 
/*
 * IPCL_HASH_INSERT_CONNECTED(connfp, connp): remove connp from its current
 * fanout list (if any) with IPCL_HASH_REMOVE, then take connfp->connf_lock
 * and insert connp at the head of connfp as a connected entry.  The bucket
 * lock is not held between the removal and the insertion.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
947 
/*
 * IPCL_HASH_INSERT_BOUND(connfp, connp): remove connp from its current
 * fanout list (if any), then insert it into connfp under connf_lock.
 *
 * The list is walked from the head until the first entry for which
 * _IPCL_V4_MATCH_ANY(conn_srcv6) is true; connp is linked in immediately
 * before that entry, or at the tail if no such entry exists.  Presumably
 * this keeps conns bound to a specific address ahead of wildcard ones so
 * that lookups find the more specific binding first -- confirm against the
 * lookup macros.
 *
 * IPCL_REMOVED is cleared, IPCL_BOUND is set, and a reference is taken with
 * CONN_INC_REF.  The expansion declares block-local variables pconnp and
 * nconnp.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
976 
/*
 * IPCL_HASH_INSERT_WILDCARD(connfp, connp): remove connp from its current
 * fanout list (if any), then insert it into connfp under connf_lock.
 *
 * If connp's source address is IPv4-mapped, it is linked in immediately
 * before the first entry that is in the same zone and whose source address
 * is the IPv6 unspecified address.  In every other case connp is appended
 * at the tail of the list.
 *
 * IPCL_REMOVED is cleared, IPCL_BOUND is set, and a reference is taken with
 * CONN_INC_REF.  The expansion declares block-local variables list, prev,
 * next and isv4mapped.
 *
 * Every use of the connp argument is parenthesized so that the macro stays
 * correct when it is handed an expression rather than a plain identifier.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
1008 
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD: move connp from its current
 * fanout list (if any) onto the bucket connfp as a wildcard entry.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
1014 
1015 void
1016 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
1017 {
1018 	connf_t	*connfp;
1019 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1020 
1021 	ASSERT(connp != NULL);
1022 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
1023 	    protocol == IPPROTO_ESP);
1024 
1025 	connp->conn_ulp = protocol;
1026 
1027 	/* Insert it in the protocol hash */
1028 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1029 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1030 }
1031 
1032 void
1033 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
1034 {
1035 	connf_t	*connfp;
1036 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1037 
1038 	ASSERT(connp != NULL);
1039 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
1040 	    protocol == IPPROTO_ESP);
1041 
1042 	connp->conn_ulp = protocol;
1043 
1044 	/* Insert it in the Bind Hash */
1045 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1046 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1047 }
1048 
1049 /*
1050  * Because the classifier is used to classify inbound packets, the destination
1051  * address is meant to be our local tunnel address (tunnel source), and the
1052  * source the remote tunnel address (tunnel destination).
1053  */
1054 conn_t *
1055 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
1056 {
1057 	connf_t	*connfp;
1058 	conn_t	*connp;
1059 
1060 	/* first look for IPv4 tunnel links */
1061 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
1062 	mutex_enter(&connfp->connf_lock);
1063 	for (connp = connfp->connf_head; connp != NULL;
1064 	    connp = connp->conn_next) {
1065 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
1066 			break;
1067 	}
1068 	if (connp != NULL)
1069 		goto done;
1070 
1071 	mutex_exit(&connfp->connf_lock);
1072 
1073 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
1074 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
1075 	    INADDR_ANY)];
1076 	mutex_enter(&connfp->connf_lock);
1077 	for (connp = connfp->connf_head; connp != NULL;
1078 	    connp = connp->conn_next) {
1079 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
1080 			break;
1081 	}
1082 done:
1083 	if (connp != NULL)
1084 		CONN_INC_REF(connp);
1085 	mutex_exit(&connfp->connf_lock);
1086 	return (connp);
1087 }
1088 
1089 conn_t *
1090 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1091 {
1092 	connf_t	*connfp;
1093 	conn_t	*connp;
1094 
1095 	/* Look for an IPv6 tunnel link */
1096 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1097 	mutex_enter(&connfp->connf_lock);
1098 	for (connp = connfp->connf_head; connp != NULL;
1099 	    connp = connp->conn_next) {
1100 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1101 			CONN_INC_REF(connp);
1102 			break;
1103 		}
1104 	}
1105 	mutex_exit(&connfp->connf_lock);
1106 	return (connp);
1107 }
1108 
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 *
 * Returns EADDRNOTAVAIL if the raw fanout bucket already holds a conn with
 * the same local port, zone and address family whose source address
 * overlaps connp's (either side unspecified / v4-mapped-any, or both
 * equal).  Otherwise connp is inserted as a wildcard, bound or connected
 * entry depending on its remote and source addresses, and 0 is returned.
 *
 * NOTE(review): connf_lock is released after the duplicate scan and taken
 * again inside the insertion macros, so the scan and the insert are not
 * done under one continuous hold of the bucket lock -- confirm that callers
 * serialize binds some other way.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* The hash macro is handed the port converted with ntohs(). */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * No remote address: wildcard or bound entry depending on whether a
	 * specific source address is set.  Otherwise: connected entry.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1158 
1159 static int
1160 ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst,
1161     ip_stack_t *ipst)
1162 {
1163 	connf_t	*connfp;
1164 	conn_t	*tconnp;
1165 
1166 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)];
1167 	mutex_enter(&connfp->connf_lock);
1168 	for (tconnp = connfp->connf_head; tconnp != NULL;
1169 	    tconnp = tconnp->conn_next) {
1170 		if (IPCL_IPTUN_MATCH(tconnp, src, dst)) {
1171 			/* A tunnel is already bound to these addresses. */
1172 			mutex_exit(&connfp->connf_lock);
1173 			return (EADDRINUSE);
1174 		}
1175 	}
1176 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1177 	mutex_exit(&connfp->connf_lock);
1178 	return (0);
1179 }
1180 
1181 static int
1182 ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src,
1183     const in6_addr_t *dst, ip_stack_t *ipst)
1184 {
1185 	connf_t	*connfp;
1186 	conn_t	*tconnp;
1187 
1188 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)];
1189 	mutex_enter(&connfp->connf_lock);
1190 	for (tconnp = connfp->connf_head; tconnp != NULL;
1191 	    tconnp = tconnp->conn_next) {
1192 		if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) {
1193 			/* A tunnel is already bound to these addresses. */
1194 			mutex_exit(&connfp->connf_lock);
1195 			return (EADDRINUSE);
1196 		}
1197 	}
1198 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1199 	mutex_exit(&connfp->connf_lock);
1200 	return (0);
1201 }
1202 
1203 /*
1204  * Check for a MAC exemption conflict on a labeled system.  Note that for
1205  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1206  * transport layer.  This check is for binding all other protocols.
1207  *
1208  * Returns true if there's a conflict.
1209  */
1210 static boolean_t
1211 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1212 {
1213 	connf_t	*connfp;
1214 	conn_t *tconn;
1215 
1216 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1217 	mutex_enter(&connfp->connf_lock);
1218 	for (tconn = connfp->connf_head; tconn != NULL;
1219 	    tconn = tconn->conn_next) {
1220 		/* We don't allow v4 fallback for v6 raw socket */
1221 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1222 			continue;
1223 		/* If neither is exempt, then there's no conflict */
1224 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1225 			continue;
1226 		/* We are only concerned about sockets for a different zone */
1227 		if (connp->conn_zoneid == tconn->conn_zoneid)
1228 			continue;
1229 		/* If both are bound to different specific addrs, ok */
1230 		if (connp->conn_src != INADDR_ANY &&
1231 		    tconn->conn_src != INADDR_ANY &&
1232 		    connp->conn_src != tconn->conn_src)
1233 			continue;
1234 		/* These two conflict; fail */
1235 		break;
1236 	}
1237 	mutex_exit(&connfp->connf_lock);
1238 	return (tconn != NULL);
1239 }
1240 
1241 static boolean_t
1242 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1243 {
1244 	connf_t	*connfp;
1245 	conn_t *tconn;
1246 
1247 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1248 	mutex_enter(&connfp->connf_lock);
1249 	for (tconn = connfp->connf_head; tconn != NULL;
1250 	    tconn = tconn->conn_next) {
1251 		/* We don't allow v4 fallback for v6 raw socket */
1252 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1253 			continue;
1254 		/* If neither is exempt, then there's no conflict */
1255 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1256 			continue;
1257 		/* We are only concerned about sockets for a different zone */
1258 		if (connp->conn_zoneid == tconn->conn_zoneid)
1259 			continue;
1260 		/* If both are bound to different addrs, ok */
1261 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1262 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1263 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1264 			continue;
1265 		/* These two conflict; fail */
1266 		break;
1267 	}
1268 	mutex_exit(&connfp->connf_lock);
1269 	return (tconn != NULL);
1270 }
1271 
1272 /*
1273  * (v4, v6) bind hash insertion routines
1274  */
1275 int
1276 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
1277 {
1278 	connf_t	*connfp;
1279 #ifdef	IPCL_DEBUG
1280 	char	buf[INET_NTOA_BUFSIZE];
1281 #endif
1282 	int	ret = 0;
1283 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1284 
1285 	ASSERT(connp);
1286 
1287 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
1288 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
1289 
1290 	connp->conn_ulp = protocol;
1291 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
1292 	connp->conn_lport = lport;
1293 
1294 	if (IPCL_IS_IPTUN(connp))
1295 		return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst));
1296 
1297 	switch (protocol) {
1298 	default:
1299 		if (is_system_labeled() &&
1300 		    check_exempt_conflict_v4(connp, ipst))
1301 			return (EADDRINUSE);
1302 		/* FALLTHROUGH */
1303 	case IPPROTO_UDP:
1304 		if (protocol == IPPROTO_UDP) {
1305 			IPCL_DEBUG_LVL(64,
1306 			    ("ipcl_bind_insert: connp %p - udp\n",
1307 			    (void *)connp));
1308 			connfp = &ipst->ips_ipcl_udp_fanout[
1309 			    IPCL_UDP_HASH(lport, ipst)];
1310 		} else {
1311 			IPCL_DEBUG_LVL(64,
1312 			    ("ipcl_bind_insert: connp %p - protocol\n",
1313 			    (void *)connp));
1314 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1315 		}
1316 
1317 		if (connp->conn_rem != INADDR_ANY) {
1318 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1319 		} else if (connp->conn_src != INADDR_ANY) {
1320 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1321 		} else {
1322 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1323 		}
1324 		break;
1325 
1326 	case IPPROTO_TCP:
1327 
1328 		/* Insert it in the Bind Hash */
1329 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1330 		connfp = &ipst->ips_ipcl_bind_fanout[
1331 		    IPCL_BIND_HASH(lport, ipst)];
1332 		if (connp->conn_src != INADDR_ANY) {
1333 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1334 		} else {
1335 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1336 		}
1337 		if (cl_inet_listen != NULL) {
1338 			ASSERT(!connp->conn_pkt_isv6);
1339 			connp->conn_flags |= IPCL_CL_LISTENER;
1340 			(*cl_inet_listen)(
1341 			    connp->conn_netstack->netstack_stackid,
1342 			    IPPROTO_TCP, AF_INET,
1343 			    (uint8_t *)&connp->conn_bound_source, lport, NULL);
1344 		}
1345 		break;
1346 
1347 	case IPPROTO_SCTP:
1348 		ret = ipcl_sctp_hash_insert(connp, lport);
1349 		break;
1350 	}
1351 
1352 	return (ret);
1353 }
1354 
1355 int
1356 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1357     uint16_t lport)
1358 {
1359 	connf_t		*connfp;
1360 	int		ret = 0;
1361 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1362 
1363 	ASSERT(connp != NULL);	connp->conn_ulp = protocol;
1364 	connp->conn_srcv6 = *src;
1365 	connp->conn_lport = lport;
1366 
1367 	if (IPCL_IS_IPTUN(connp)) {
1368 		return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros,
1369 		    ipst));
1370 	}
1371 
1372 	switch (protocol) {
1373 	default:
1374 		if (is_system_labeled() &&
1375 		    check_exempt_conflict_v6(connp, ipst))
1376 			return (EADDRINUSE);
1377 		/* FALLTHROUGH */
1378 	case IPPROTO_UDP:
1379 		if (protocol == IPPROTO_UDP) {
1380 			IPCL_DEBUG_LVL(128,
1381 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1382 			    (void *)connp));
1383 			connfp = &ipst->ips_ipcl_udp_fanout[
1384 			    IPCL_UDP_HASH(lport, ipst)];
1385 		} else {
1386 			IPCL_DEBUG_LVL(128,
1387 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1388 			    (void *)connp));
1389 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1390 		}
1391 
1392 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1393 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1394 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1395 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1396 		} else {
1397 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1398 		}
1399 		break;
1400 
1401 	case IPPROTO_TCP:
1402 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1403 
1404 		/* Insert it in the Bind Hash */
1405 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1406 		connfp = &ipst->ips_ipcl_bind_fanout[
1407 		    IPCL_BIND_HASH(lport, ipst)];
1408 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1409 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1410 		} else {
1411 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1412 		}
1413 		if (cl_inet_listen != NULL) {
1414 			sa_family_t	addr_family;
1415 			uint8_t		*laddrp;
1416 
1417 			if (connp->conn_pkt_isv6) {
1418 				addr_family = AF_INET6;
1419 				laddrp =
1420 				    (uint8_t *)&connp->conn_bound_source_v6;
1421 			} else {
1422 				addr_family = AF_INET;
1423 				laddrp = (uint8_t *)&connp->conn_bound_source;
1424 			}
1425 			connp->conn_flags |= IPCL_CL_LISTENER;
1426 			(*cl_inet_listen)(
1427 			    connp->conn_netstack->netstack_stackid,
1428 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1429 		}
1430 		break;
1431 
1432 	case IPPROTO_SCTP:
1433 		ret = ipcl_sctp_hash_insert(connp, lport);
1434 		break;
1435 	}
1436 
1437 	return (ret);
1438 }
1439 
/*
 * ipcl_conn_hash insertion routines.
 *
 * ipcl_conn_insert() places connp into the fanout table appropriate for a
 * (possibly) connected IPv4 endpoint:
 *	- IP tunnel conns go to the tunnel fanout keyed on (src, rem);
 *	- TCP goes to the connection fanout, after checking under the bucket
 *	  lock that no conn with the same tuple and zone is already there;
 *	- SCTP is removed from its current list and re-inserted through
 *	  ipcl_sctp_hash_insert();
 *	- UDP goes to the UDP fanout and any other protocol to the IPv4
 *	  per-protocol fanout, as a connected, bound or wildcard entry.
 *
 * Returns 0 on success, EADDRINUSE when a conflicting TCP tuple, tunnel or
 * MAC-exempt binding exists, or the result of ipcl_sctp_hash_insert().
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, src, rem, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for a eager connection, i.e connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE takes the lock of the bucket the
			 * conn is currently on, so the new bucket's lock is
			 * dropped around it and re-taken afterwards.
			 * NOTE(review): the duplicate scan above is not
			 * repeated after the lock is re-acquired -- confirm
			 * this window is harmless.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/*
		 * Take the low 16 bits of the host-order value of `ports'
		 * and convert them back to network order for the insert.
		 */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 *
		 * NOTE(review): the check reads connp->conn_ulp and
		 * connp->conn_src before IPCL_CONN_INIT runs below, so it
		 * presumably relies on values left by an earlier bind --
		 * confirm.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* View `ports' as two 16-bit halves; up[1] keys the hash. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1562 
/*
 * IPv6 counterpart of ipcl_conn_insert().  Places connp into the fanout
 * table appropriate for a (possibly) connected IPv6 endpoint:
 *	- IP tunnel conns go to the tunnel fanout keyed on (src, rem);
 *	- TCP goes to the connection fanout, after checking under the bucket
 *	  lock for an existing conn with the same tuple and zone whose
 *	  tcp_bound_if is 0 or equal to `ifindex';
 *	- SCTP is removed from its current list and re-inserted through
 *	  ipcl_sctp_hash_insert();
 *	- UDP goes to the UDP fanout and any other protocol to the IPv6
 *	  per-protocol fanout, as a connected, bound or wildcard entry.
 *
 * Returns 0 on success, EADDRINUSE when a conflicting TCP tuple, tunnel or
 * MAC-exempt binding exists, or the result of ipcl_sctp_hash_insert().
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE takes the lock of the bucket the
			 * conn is currently on, so the new bucket's lock is
			 * dropped around it and re-taken afterwards.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* Drop any earlier binding before re-inserting. */
		IPCL_HASH_REMOVE(connp);
		/*
		 * Take the low 16 bits of the host-order value of `ports'
		 * and convert them back to network order for the insert.
		 */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* MAC-exempt conflict check for protocols without ports. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* View `ports' as two 16-bit halves; up[1] keys the hash. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1656 
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied.  Labels are also checked as described
 * above.  If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 *
 * mp points at the IPv4 header; hdr_len is the offset from there to the
 * transport header.  TCP is looked up first in the connection fanout and
 * then in the bind fanout; UDP is looked up in the UDP fanout; IPPROTO_ENCAP
 * and IPPROTO_IPV6 are handed to ipcl_iptun_classify_v4().  Any other
 * protocol yields NULL.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * up addresses the port pair in the transport header: up[0] is used
	 * below as the foreign port and up[1] as the local port.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports read as one 32-bit value for hashing/matching. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No established connection; fall back to the bind table. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		/* On labeled systems the match must also pass the label check. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Encapsulated packets are matched against tunnel conns. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1868 
1869 conn_t *
1870 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
1871     ip_stack_t *ipst)
1872 {
1873 	ip6_t		*ip6h;
1874 	connf_t		*connfp, *bind_connfp;
1875 	uint16_t	lport;
1876 	uint16_t	fport;
1877 	tcph_t		*tcph;
1878 	uint32_t	ports;
1879 	conn_t		*connp;
1880 	uint16_t	*up;
1881 	boolean_t	shared_addr;
1882 	boolean_t	unlabeled;
1883 
1884 	ip6h = (ip6_t *)mp->b_rptr;
1885 
1886 	switch (protocol) {
1887 	case IPPROTO_TCP:
1888 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1889 		up = (uint16_t *)tcph->th_lport;
1890 		ports = *(uint32_t *)up;
1891 
1892 		connfp =
1893 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1894 		    ports, ipst)];
1895 		mutex_enter(&connfp->connf_lock);
1896 		for (connp = connfp->connf_head; connp != NULL;
1897 		    connp = connp->conn_next) {
1898 			if ((IPCL_CONN_MATCH_V6(connp, protocol,
1899 			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
1900 			    (IPCL_ZONE_MATCH(connp, zoneid))) {
1901 				break;
1902 			}
1903 		}
1904 
1905 		if (connp != NULL) {
1906 			/*
1907 			 * We have a fully-bound TCP connection.
1908 			 *
1909 			 * For labeled systems, there's no need to check the
1910 			 * label here.  It's known to be good as we checked
1911 			 * before allowing the connection to become bound.
1912 			 */
1913 			CONN_INC_REF(connp);
1914 			mutex_exit(&connfp->connf_lock);
1915 			return (connp);
1916 		}
1917 
1918 		mutex_exit(&connfp->connf_lock);
1919 
1920 		lport = up[1];
1921 		unlabeled = B_FALSE;
1922 		/* Cred can be null on IPv6 */
1923 		if (is_system_labeled()) {
1924 			cred_t *cr = msg_getcred(mp, NULL);
1925 
1926 			unlabeled = (cr != NULL &&
1927 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1928 		}
1929 		shared_addr = (zoneid == ALL_ZONES);
1930 		if (shared_addr) {
1931 			/*
1932 			 * No need to handle exclusive-stack zones since
1933 			 * ALL_ZONES only applies to the shared stack.
1934 			 */
1935 			zoneid = tsol_mlp_findzone(protocol, lport);
1936 			/*
1937 			 * If no shared MLP is found, tsol_mlp_findzone returns
1938 			 * ALL_ZONES.  In that case, we assume it's SLP, and
1939 			 * search for the zone based on the packet label.
1940 			 *
1941 			 * If there is such a zone, we prefer to find a
1942 			 * connection in it.  Otherwise, we look for a
1943 			 * MAC-exempt connection in any zone whose label
1944 			 * dominates the default label on the packet.
1945 			 */
1946 			if (zoneid == ALL_ZONES)
1947 				zoneid = tsol_packet_to_zoneid(mp);
1948 			else
1949 				unlabeled = B_FALSE;
1950 		}
1951 
1952 		bind_connfp =
1953 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1954 		mutex_enter(&bind_connfp->connf_lock);
1955 		for (connp = bind_connfp->connf_head; connp != NULL;
1956 		    connp = connp->conn_next) {
1957 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1958 			    ip6h->ip6_dst, lport) &&
1959 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1960 			    (unlabeled && connp->conn_mac_exempt &&
1961 			    shared_addr)))
1962 				break;
1963 		}
1964 
1965 		if (connp != NULL && is_system_labeled() &&
1966 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1967 		    shared_addr, connp)) {
1968 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1969 			    char *, "connp(1) could not receive mp(2)",
1970 			    conn_t *, connp, mblk_t *, mp);
1971 			connp = NULL;
1972 		}
1973 
1974 		if (connp != NULL) {
1975 			/* Have a listner at least */
1976 			CONN_INC_REF(connp);
1977 			mutex_exit(&bind_connfp->connf_lock);
1978 			IPCL_DEBUG_LVL(512,
1979 			    ("ipcl_classify_v6: found listner "
1980 			    "connp = %p\n", (void *)connp));
1981 
1982 			return (connp);
1983 		}
1984 
1985 		mutex_exit(&bind_connfp->connf_lock);
1986 
1987 		IPCL_DEBUG_LVL(512,
1988 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1989 		    (void *)mp));
1990 		break;
1991 
1992 	case IPPROTO_UDP:
1993 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1994 		lport = up[1];
1995 		unlabeled = B_FALSE;
1996 		/* Cred can be null on IPv6 */
1997 		if (is_system_labeled()) {
1998 			cred_t *cr = msg_getcred(mp, NULL);
1999 
2000 			unlabeled = (cr != NULL &&
2001 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
2002 		}
2003 		shared_addr = (zoneid == ALL_ZONES);
2004 		if (shared_addr) {
2005 			/*
2006 			 * No need to handle exclusive-stack zones since
2007 			 * ALL_ZONES only applies to the shared stack.
2008 			 */
2009 			zoneid = tsol_mlp_findzone(protocol, lport);
2010 			/*
2011 			 * If no shared MLP is found, tsol_mlp_findzone returns
2012 			 * ALL_ZONES.  In that case, we assume it's SLP, and
2013 			 * search for the zone based on the packet label.
2014 			 *
2015 			 * If there is such a zone, we prefer to find a
2016 			 * connection in it.  Otherwise, we look for a
2017 			 * MAC-exempt connection in any zone whose label
2018 			 * dominates the default label on the packet.
2019 			 */
2020 			if (zoneid == ALL_ZONES)
2021 				zoneid = tsol_packet_to_zoneid(mp);
2022 			else
2023 				unlabeled = B_FALSE;
2024 		}
2025 
2026 		fport = up[0];
2027 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
2028 		    fport));
2029 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
2030 		mutex_enter(&connfp->connf_lock);
2031 		for (connp = connfp->connf_head; connp != NULL;
2032 		    connp = connp->conn_next) {
2033 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
2034 			    fport, ip6h->ip6_src) &&
2035 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
2036 			    (unlabeled && connp->conn_mac_exempt &&
2037 			    shared_addr)))
2038 				break;
2039 		}
2040 
2041 		if (connp != NULL && is_system_labeled() &&
2042 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
2043 		    shared_addr, connp)) {
2044 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
2045 			    char *, "connp(1) could not receive mp(2)",
2046 			    conn_t *, connp, mblk_t *, mp);
2047 			connp = NULL;
2048 		}
2049 
2050 		if (connp != NULL) {
2051 			CONN_INC_REF(connp);
2052 			mutex_exit(&connfp->connf_lock);
2053 			return (connp);
2054 		}
2055 
2056 		/*
2057 		 * We shouldn't come here for multicast/broadcast packets
2058 		 */
2059 		mutex_exit(&connfp->connf_lock);
2060 		IPCL_DEBUG_LVL(512,
2061 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
2062 		    lport, fport));
2063 		break;
2064 	case IPPROTO_ENCAP:
2065 	case IPPROTO_IPV6:
2066 		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
2067 		    &ip6h->ip6_dst, ipst));
2068 	}
2069 
2070 	return (NULL);
2071 }
2072 
2073 /*
2074  * wrapper around ipcl_classify_(v4,v6) routines.
2075  */
2076 conn_t *
2077 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
2078 {
2079 	uint16_t	hdr_len;
2080 	ipha_t		*ipha;
2081 	uint8_t		*nexthdrp;
2082 
2083 	if (MBLKL(mp) < sizeof (ipha_t))
2084 		return (NULL);
2085 
2086 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
2087 	case IPV4_VERSION:
2088 		ipha = (ipha_t *)mp->b_rptr;
2089 		hdr_len = IPH_HDR_LENGTH(ipha);
2090 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
2091 		    zoneid, ipst));
2092 	case IPV6_VERSION:
2093 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
2094 		    &hdr_len, &nexthdrp))
2095 			return (NULL);
2096 
2097 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
2098 	}
2099 
2100 	return (NULL);
2101 }
2102 
2103 conn_t *
2104 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
2105     uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
2106 {
2107 	connf_t		*connfp;
2108 	conn_t		*connp;
2109 	in_port_t	lport;
2110 	int		af;
2111 	boolean_t	shared_addr;
2112 	boolean_t	unlabeled;
2113 	const void	*dst;
2114 
2115 	lport = ((uint16_t *)&ports)[1];
2116 
2117 	unlabeled = B_FALSE;
2118 	/* Cred can be null on IPv6 */
2119 	if (is_system_labeled()) {
2120 		cred_t *cr = msg_getcred(mp, NULL);
2121 
2122 		unlabeled = (cr != NULL &&
2123 		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
2124 	}
2125 	shared_addr = (zoneid == ALL_ZONES);
2126 	if (shared_addr) {
2127 		/*
2128 		 * No need to handle exclusive-stack zones since ALL_ZONES
2129 		 * only applies to the shared stack.
2130 		 */
2131 		zoneid = tsol_mlp_findzone(protocol, lport);
2132 		/*
2133 		 * If no shared MLP is found, tsol_mlp_findzone returns
2134 		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
2135 		 * the zone based on the packet label.
2136 		 *
2137 		 * If there is such a zone, we prefer to find a connection in
2138 		 * it.  Otherwise, we look for a MAC-exempt connection in any
2139 		 * zone whose label dominates the default label on the packet.
2140 		 */
2141 		if (zoneid == ALL_ZONES)
2142 			zoneid = tsol_packet_to_zoneid(mp);
2143 		else
2144 			unlabeled = B_FALSE;
2145 	}
2146 
2147 	af = IPH_HDR_VERSION(hdr);
2148 	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
2149 	    (const void *)&((ip6_t *)hdr)->ip6_dst;
2150 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
2151 
2152 	mutex_enter(&connfp->connf_lock);
2153 	for (connp = connfp->connf_head; connp != NULL;
2154 	    connp = connp->conn_next) {
2155 		/* We don't allow v4 fallback for v6 raw socket. */
2156 		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
2157 		    IPV6_VERSION))
2158 			continue;
2159 		if (connp->conn_fully_bound) {
2160 			if (af == IPV4_VERSION) {
2161 				if (!IPCL_CONN_MATCH(connp, protocol,
2162 				    hdr->ipha_src, hdr->ipha_dst, ports))
2163 					continue;
2164 			} else {
2165 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
2166 				    ((ip6_t *)hdr)->ip6_src,
2167 				    ((ip6_t *)hdr)->ip6_dst, ports))
2168 					continue;
2169 			}
2170 		} else {
2171 			if (af == IPV4_VERSION) {
2172 				if (!IPCL_BIND_MATCH(connp, protocol,
2173 				    hdr->ipha_dst, lport))
2174 					continue;
2175 			} else {
2176 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
2177 				    ((ip6_t *)hdr)->ip6_dst, lport))
2178 					continue;
2179 			}
2180 		}
2181 
2182 		if (IPCL_ZONE_MATCH(connp, zoneid) ||
2183 		    (unlabeled && connp->conn_mac_exempt && shared_addr))
2184 			break;
2185 	}
2186 	/*
2187 	 * If the connection is fully-bound and connection-oriented (TCP or
2188 	 * SCTP), then we've already validated the remote system's label.
2189 	 * There's no need to do it again for every packet.
2190 	 */
2191 	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
2192 	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
2193 	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
2194 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
2195 		    char *, "connp(1) could not receive mp(2)",
2196 		    conn_t *, connp, mblk_t *, mp);
2197 		connp = NULL;
2198 	}
2199 
2200 	if (connp != NULL)
2201 		goto found;
2202 	mutex_exit(&connfp->connf_lock);
2203 
2204 	/* Try to look for a wildcard match. */
2205 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
2206 	mutex_enter(&connfp->connf_lock);
2207 	for (connp = connfp->connf_head; connp != NULL;
2208 	    connp = connp->conn_next) {
2209 		/* We don't allow v4 fallback for v6 raw socket. */
2210 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
2211 		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
2212 			continue;
2213 		}
2214 		if (af == IPV4_VERSION) {
2215 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
2216 				break;
2217 		} else {
2218 			if (IPCL_RAW_MATCH_V6(connp, protocol,
2219 			    ((ip6_t *)hdr)->ip6_dst)) {
2220 				break;
2221 			}
2222 		}
2223 	}
2224 
2225 	if (connp != NULL)
2226 		goto found;
2227 
2228 	mutex_exit(&connfp->connf_lock);
2229 	return (NULL);
2230 
2231 found:
2232 	ASSERT(connp != NULL);
2233 	CONN_INC_REF(connp);
2234 	mutex_exit(&connfp->connf_lock);
2235 	return (connp);
2236 }
2237 
2238 /* ARGSUSED */
2239 static int
2240 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2241 {
2242 	itc_t	*itc = (itc_t *)buf;
2243 	conn_t 	*connp = &itc->itc_conn;
2244 	tcp_t	*tcp = (tcp_t *)&itc[1];
2245 
2246 	bzero(connp, sizeof (conn_t));
2247 	bzero(tcp, sizeof (tcp_t));
2248 
2249 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2250 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2251 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
2252 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2253 	connp->conn_tcp = tcp;
2254 	connp->conn_flags = IPCL_TCPCONN;
2255 	connp->conn_ulp = IPPROTO_TCP;
2256 	tcp->tcp_connp = connp;
2257 	return (0);
2258 }
2259 
2260 /* ARGSUSED */
2261 static void
2262 tcp_conn_destructor(void *buf, void *cdrarg)
2263 {
2264 	itc_t	*itc = (itc_t *)buf;
2265 	conn_t 	*connp = &itc->itc_conn;
2266 	tcp_t	*tcp = (tcp_t *)&itc[1];
2267 
2268 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2269 	ASSERT(tcp->tcp_connp == connp);
2270 	ASSERT(connp->conn_tcp == tcp);
2271 	tcp_timermp_free(tcp);
2272 	mutex_destroy(&connp->conn_lock);
2273 	cv_destroy(&connp->conn_cv);
2274 	cv_destroy(&connp->conn_sq_cv);
2275 }
2276 
2277 /* ARGSUSED */
2278 static int
2279 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2280 {
2281 	itc_t	*itc = (itc_t *)buf;
2282 	conn_t 	*connp = &itc->itc_conn;
2283 
2284 	bzero(connp, sizeof (conn_t));
2285 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2286 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2287 	connp->conn_flags = IPCL_IPCCONN;
2288 
2289 	return (0);
2290 }
2291 
2292 /* ARGSUSED */
2293 static void
2294 ip_conn_destructor(void *buf, void *cdrarg)
2295 {
2296 	itc_t	*itc = (itc_t *)buf;
2297 	conn_t 	*connp = &itc->itc_conn;
2298 
2299 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2300 	ASSERT(connp->conn_priv == NULL);
2301 	mutex_destroy(&connp->conn_lock);
2302 	cv_destroy(&connp->conn_cv);
2303 }
2304 
2305 /* ARGSUSED */
2306 static int
2307 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2308 {
2309 	itc_t	*itc = (itc_t *)buf;
2310 	conn_t 	*connp = &itc->itc_conn;
2311 	udp_t	*udp = (udp_t *)&itc[1];
2312 
2313 	bzero(connp, sizeof (conn_t));
2314 	bzero(udp, sizeof (udp_t));
2315 
2316 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2317 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2318 	connp->conn_udp = udp;
2319 	connp->conn_flags = IPCL_UDPCONN;
2320 	connp->conn_ulp = IPPROTO_UDP;
2321 	udp->udp_connp = connp;
2322 	return (0);
2323 }
2324 
2325 /* ARGSUSED */
2326 static void
2327 udp_conn_destructor(void *buf, void *cdrarg)
2328 {
2329 	itc_t	*itc = (itc_t *)buf;
2330 	conn_t 	*connp = &itc->itc_conn;
2331 	udp_t	*udp = (udp_t *)&itc[1];
2332 
2333 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2334 	ASSERT(udp->udp_connp == connp);
2335 	ASSERT(connp->conn_udp == udp);
2336 	mutex_destroy(&connp->conn_lock);
2337 	cv_destroy(&connp->conn_cv);
2338 }
2339 
2340 /* ARGSUSED */
2341 static int
2342 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2343 {
2344 	itc_t	*itc = (itc_t *)buf;
2345 	conn_t 	*connp = &itc->itc_conn;
2346 	icmp_t	*icmp = (icmp_t *)&itc[1];
2347 
2348 	bzero(connp, sizeof (conn_t));
2349 	bzero(icmp, sizeof (icmp_t));
2350 
2351 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2352 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2353 	connp->conn_icmp = icmp;
2354 	connp->conn_flags = IPCL_RAWIPCONN;
2355 	connp->conn_ulp = IPPROTO_ICMP;
2356 	icmp->icmp_connp = connp;
2357 	return (0);
2358 }
2359 
2360 /* ARGSUSED */
2361 static void
2362 rawip_conn_destructor(void *buf, void *cdrarg)
2363 {
2364 	itc_t	*itc = (itc_t *)buf;
2365 	conn_t 	*connp = &itc->itc_conn;
2366 	icmp_t	*icmp = (icmp_t *)&itc[1];
2367 
2368 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2369 	ASSERT(icmp->icmp_connp == connp);
2370 	ASSERT(connp->conn_icmp == icmp);
2371 	mutex_destroy(&connp->conn_lock);
2372 	cv_destroy(&connp->conn_cv);
2373 }
2374 
2375 /* ARGSUSED */
2376 static int
2377 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2378 {
2379 	itc_t	*itc = (itc_t *)buf;
2380 	conn_t 	*connp = &itc->itc_conn;
2381 	rts_t	*rts = (rts_t *)&itc[1];
2382 
2383 	bzero(connp, sizeof (conn_t));
2384 	bzero(rts, sizeof (rts_t));
2385 
2386 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2387 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2388 	connp->conn_rts = rts;
2389 	connp->conn_flags = IPCL_RTSCONN;
2390 	rts->rts_connp = connp;
2391 	return (0);
2392 }
2393 
2394 /* ARGSUSED */
2395 static void
2396 rts_conn_destructor(void *buf, void *cdrarg)
2397 {
2398 	itc_t	*itc = (itc_t *)buf;
2399 	conn_t 	*connp = &itc->itc_conn;
2400 	rts_t	*rts = (rts_t *)&itc[1];
2401 
2402 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2403 	ASSERT(rts->rts_connp == connp);
2404 	ASSERT(connp->conn_rts == rts);
2405 	mutex_destroy(&connp->conn_lock);
2406 	cv_destroy(&connp->conn_cv);
2407 }
2408 
2409 /* ARGSUSED */
2410 int
2411 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
2412 {
2413 	int error;
2414 	netstack_t	*ns;
2415 	int		ret;
2416 	tcp_stack_t	*tcps;
2417 	ip_helper_stream_info_t	*ip_helper_str;
2418 	ip_stack_t	*ipst;
2419 
2420 	ns = netstack_find_by_cred(kcred);
2421 	ASSERT(ns != NULL);
2422 	tcps = ns->netstack_tcp;
2423 	ipst = ns->netstack_ip;
2424 	ASSERT(tcps != NULL);
2425 	ip_helper_str = (ip_helper_stream_info_t *)buf;
2426 
2427 	do {
2428 		error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
2429 		    &ip_helper_str->iphs_handle, ipst->ips_ldi_ident);
2430 	} while (error == EINTR);
2431 
2432 	if (error == 0) {
2433 		do {
2434 			error = ldi_ioctl(
2435 			    ip_helper_str->iphs_handle, SIOCSQPTR,
2436 			    (intptr_t)buf, FKIOCTL, kcred, &ret);
2437 		} while (error == EINTR);
2438 
2439 		if (error != 0) {
2440 			(void) ldi_close(
2441 			    ip_helper_str->iphs_handle, 0, kcred);
2442 		}
2443 	}
2444 
2445 	netstack_rele(ipst->ips_netstack);
2446 
2447 	return (error);
2448 }
2449 
2450 /* ARGSUSED */
2451 static void
2452 ip_helper_stream_destructor(void *buf, void *cdrarg)
2453 {
2454 	ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;
2455 
2456 	ip_helper_str->iphs_rq->q_ptr =
2457 	    ip_helper_str->iphs_wq->q_ptr =
2458 	    ip_helper_str->iphs_minfo;
2459 	(void) ldi_close(ip_helper_str->iphs_handle, 0, kcred);
2460 }
2461 
2462 
2463 /*
2464  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2465  * in the conn_t.
2466  */
2467 void
2468 ipcl_conn_cleanup(conn_t *connp)
2469 {
2470 	ASSERT(connp->conn_ire_cache == NULL);
2471 	ASSERT(connp->conn_latch == NULL);
2472 #ifdef notdef
2473 	ASSERT(connp->conn_rq == NULL);
2474 	ASSERT(connp->conn_wq == NULL);
2475 #endif
2476 	ASSERT(connp->conn_cred == NULL);
2477 	ASSERT(connp->conn_g_fanout == NULL);
2478 	ASSERT(connp->conn_g_next == NULL);
2479 	ASSERT(connp->conn_g_prev == NULL);
2480 	ASSERT(connp->conn_policy == NULL);
2481 	ASSERT(connp->conn_fanout == NULL);
2482 	ASSERT(connp->conn_next == NULL);
2483 	ASSERT(connp->conn_prev == NULL);
2484 #ifdef notdef
2485 	/*
2486 	 * The ill and ipif pointers are not cleared before the conn_t
2487 	 * goes away since they do not hold a reference on the ill/ipif.
2488 	 * We should replace these pointers with ifindex/ipaddr_t to
2489 	 * make the code less complex.
2490 	 */
2491 	ASSERT(connp->conn_outgoing_ill == NULL);
2492 	ASSERT(connp->conn_incoming_ill == NULL);
2493 	ASSERT(connp->conn_multicast_ipif == NULL);
2494 	ASSERT(connp->conn_multicast_ill == NULL);
2495 #endif
2496 	ASSERT(connp->conn_oper_pending_ill == NULL);
2497 	ASSERT(connp->conn_ilg == NULL);
2498 	ASSERT(connp->conn_drain_next == NULL);
2499 	ASSERT(connp->conn_drain_prev == NULL);
2500 #ifdef notdef
2501 	/* conn_idl is not cleared when removed from idl list */
2502 	ASSERT(connp->conn_idl == NULL);
2503 #endif
2504 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2505 	ASSERT(connp->conn_effective_cred == NULL);
2506 	ASSERT(connp->conn_netstack == NULL);
2507 
2508 	ASSERT(connp->conn_helper_info == NULL);
2509 	/* Clear out the conn_t fields that are not preserved */
2510 	bzero(&connp->conn_start_clr,
2511 	    sizeof (conn_t) -
2512 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2513 }
2514 
2515 /*
2516  * All conns are inserted in a global multi-list for the benefit of
2517  * walkers. The walk is guaranteed to walk all open conns at the time
2518  * of the start of the walk exactly once. This property is needed to
2519  * achieve some cleanups during unplumb of interfaces. This is achieved
2520  * as follows.
2521  *
2522  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2523  * call the insert and delete functions below at creation and deletion
2524  * time respectively. The conn never moves or changes its position in this
2525  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2526  * won't increase due to walkers, once the conn deletion has started. Note
2527  * that we can't remove the conn from the global list and then wait for
2528  * the refcnt to drop to zero, since walkers would then see a truncated
2529  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2530  * conns until ip_open is ready to make them globally visible.
2531  * The global round robin multi-list locks are held only to get the
2532  * next member/insertion/deletion and contention should be negligible
2533  * if the multi-list is much greater than the number of cpus.
2534  */
2535 void
2536 ipcl_globalhash_insert(conn_t *connp)
2537 {
2538 	int	index;
2539 	struct connf_s	*connfp;
2540 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2541 
2542 	/*
2543 	 * No need for atomic here. Approximate even distribution
2544 	 * in the global lists is sufficient.
2545 	 */
2546 	ipst->ips_conn_g_index++;
2547 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2548 
2549 	connp->conn_g_prev = NULL;
2550 	/*
2551 	 * Mark as INCIPIENT, so that walkers will ignore this
2552 	 * for now, till ip_open is ready to make it visible globally.
2553 	 */
2554 	connp->conn_state_flags |= CONN_INCIPIENT;
2555 
2556 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2557 	/* Insert at the head of the list */
2558 	mutex_enter(&connfp->connf_lock);
2559 	connp->conn_g_next = connfp->connf_head;
2560 	if (connp->conn_g_next != NULL)
2561 		connp->conn_g_next->conn_g_prev = connp;
2562 	connfp->connf_head = connp;
2563 
2564 	/* The fanout bucket this conn points to */
2565 	connp->conn_g_fanout = connfp;
2566 
2567 	mutex_exit(&connfp->connf_lock);
2568 }
2569 
2570 void
2571 ipcl_globalhash_remove(conn_t *connp)
2572 {
2573 	struct connf_s	*connfp;
2574 
2575 	/*
2576 	 * We were never inserted in the global multi list.
2577 	 * IPCL_NONE variety is never inserted in the global multilist
2578 	 * since it is presumed to not need any cleanup and is transient.
2579 	 */
2580 	if (connp->conn_g_fanout == NULL)
2581 		return;
2582 
2583 	connfp = connp->conn_g_fanout;
2584 	mutex_enter(&connfp->connf_lock);
2585 	if (connp->conn_g_prev != NULL)
2586 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2587 	else
2588 		connfp->connf_head = connp->conn_g_next;
2589 	if (connp->conn_g_next != NULL)
2590 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2591 	mutex_exit(&connfp->connf_lock);
2592 
2593 	/* Better to stumble on a null pointer than to corrupt memory */
2594 	connp->conn_g_next = NULL;
2595 	connp->conn_g_prev = NULL;
2596 	connp->conn_g_fanout = NULL;
2597 }
2598 
2599 /*
2600  * Walk the list of all conn_t's in the system, calling the function provided
2601  * with the specified argument for each.
2602  * Applies to both IPv4 and IPv6.
2603  *
2604  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2605  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2606  * unplumbed or removed. New conn_t's that are created while we are walking
2607  * may be missed by this walk, because they are not necessarily inserted
2608  * at the tail of the list. They are new conn_t's and thus don't have any
2609  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2610  * is created to the struct that is going away.
2611  */
2612 void
2613 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2614 {
2615 	int	i;
2616 	conn_t	*connp;
2617 	conn_t	*prev_connp;
2618 
2619 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2620 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2621 		prev_connp = NULL;
2622 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2623 		while (connp != NULL) {
2624 			mutex_enter(&connp->conn_lock);
2625 			if (connp->conn_state_flags &
2626 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2627 				mutex_exit(&connp->conn_lock);
2628 				connp = connp->conn_g_next;
2629 				continue;
2630 			}
2631 			CONN_INC_REF_LOCKED(connp);
2632 			mutex_exit(&connp->conn_lock);
2633 			mutex_exit(
2634 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2635 			(*func)(connp, arg);
2636 			if (prev_connp != NULL)
2637 				CONN_DEC_REF(prev_connp);
2638 			mutex_enter(
2639 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2640 			prev_connp = connp;
2641 			connp = connp->conn_g_next;
2642 		}
2643 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2644 		if (prev_connp != NULL)
2645 			CONN_DEC_REF(prev_connp);
2646 	}
2647 }
2648 
2649 /*
2650  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2651  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2652  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2653  * (peer tcp in ESTABLISHED state).
2654  */
2655 conn_t *
2656 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2657     ip_stack_t *ipst)
2658 {
2659 	uint32_t ports;
2660 	uint16_t *pports = (uint16_t *)&ports;
2661 	connf_t	*connfp;
2662 	conn_t	*tconnp;
2663 	boolean_t zone_chk;
2664 
2665 	/*
2666 	 * If either the source of destination address is loopback, then
2667 	 * both endpoints must be in the same Zone.  Otherwise, both of
2668 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2669 	 * state) and the endpoints may reside in different Zones.
2670 	 */
2671 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2672 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2673 
2674 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2675 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2676 
2677 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2678 	    ports, ipst)];
2679 
2680 	mutex_enter(&connfp->connf_lock);
2681 	for (tconnp = connfp->connf_head; tconnp != NULL;
2682 	    tconnp = tconnp->conn_next) {
2683 
2684 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2685 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2686 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2687 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2688 
2689 			ASSERT(tconnp != connp);
2690 			CONN_INC_REF(tconnp);
2691 			mutex_exit(&connfp->connf_lock);
2692 			return (tconnp);
2693 		}
2694 	}
2695 	mutex_exit(&connfp->connf_lock);
2696 	return (NULL);
2697 }
2698 
2699 /*
2700  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2701  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2702  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2703  * (peer tcp in ESTABLISHED state).
2704  */
2705 conn_t *
2706 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2707     ip_stack_t *ipst)
2708 {
2709 	uint32_t ports;
2710 	uint16_t *pports = (uint16_t *)&ports;
2711 	connf_t	*connfp;
2712 	conn_t	*tconnp;
2713 	boolean_t zone_chk;
2714 
2715 	/*
2716 	 * If either the source of destination address is loopback, then
2717 	 * both endpoints must be in the same Zone.  Otherwise, both of
2718 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2719 	 * state) and the endpoints may reside in different Zones.  We
2720 	 * don't do Zone check for link local address(es) because the
2721 	 * current Zone implementation treats each link local address as
2722 	 * being unique per system node, i.e. they belong to global Zone.
2723 	 */
2724 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2725 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2726 
2727 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2728 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2729 
2730 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2731 	    ports, ipst)];
2732 
2733 	mutex_enter(&connfp->connf_lock);
2734 	for (tconnp = connfp->connf_head; tconnp != NULL;
2735 	    tconnp = tconnp->conn_next) {
2736 
2737 		/* We skip tcp_bound_if check here as this is loopback tcp */
2738 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2739 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2740 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2741 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2742 
2743 			ASSERT(tconnp != connp);
2744 			CONN_INC_REF(tconnp);
2745 			mutex_exit(&connfp->connf_lock);
2746 			return (tconnp);
2747 		}
2748 	}
2749 	mutex_exit(&connfp->connf_lock);
2750 	return (NULL);
2751 }
2752 
2753 /*
2754  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2755  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2756  * Only checks for connected entries i.e. no INADDR_ANY checks.
2757  */
2758 conn_t *
2759 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2760     ip_stack_t *ipst)
2761 {
2762 	uint32_t ports;
2763 	uint16_t *pports;
2764 	connf_t	*connfp;
2765 	conn_t	*tconnp;
2766 
2767 	pports = (uint16_t *)&ports;
2768 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2769 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2770 
2771 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2772 	    ports, ipst)];
2773 
2774 	mutex_enter(&connfp->connf_lock);
2775 	for (tconnp = connfp->connf_head; tconnp != NULL;
2776 	    tconnp = tconnp->conn_next) {
2777 
2778 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2779 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2780 		    tconnp->conn_tcp->tcp_state >= min_state) {
2781 
2782 			CONN_INC_REF(tconnp);
2783 			mutex_exit(&connfp->connf_lock);
2784 			return (tconnp);
2785 		}
2786 	}
2787 	mutex_exit(&connfp->connf_lock);
2788 	return (NULL);
2789 }
2790 
2791 /*
2792  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2793  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2794  * Only checks for connected entries i.e. no INADDR_ANY checks.
2795  * Match on ifindex in addition to addresses.
2796  */
2797 conn_t *
2798 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2799     uint_t ifindex, ip_stack_t *ipst)
2800 {
2801 	tcp_t	*tcp;
2802 	uint32_t ports;
2803 	uint16_t *pports;
2804 	connf_t	*connfp;
2805 	conn_t	*tconnp;
2806 
2807 	pports = (uint16_t *)&ports;
2808 	pports[0] = tcpha->tha_fport;
2809 	pports[1] = tcpha->tha_lport;
2810 
2811 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2812 	    ports, ipst)];
2813 
2814 	mutex_enter(&connfp->connf_lock);
2815 	for (tconnp = connfp->connf_head; tconnp != NULL;
2816 	    tconnp = tconnp->conn_next) {
2817 
2818 		tcp = tconnp->conn_tcp;
2819 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2820 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2821 		    tcp->tcp_state >= min_state &&
2822 		    (tcp->tcp_bound_if == 0 ||
2823 		    tcp->tcp_bound_if == ifindex)) {
2824 
2825 			CONN_INC_REF(tconnp);
2826 			mutex_exit(&connfp->connf_lock);
2827 			return (tconnp);
2828 		}
2829 	}
2830 	mutex_exit(&connfp->connf_lock);
2831 	return (NULL);
2832 }
2833 
2834 /*
2835  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2836  * a listener when changing state.
2837  */
2838 conn_t *
2839 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2840     ip_stack_t *ipst)
2841 {
2842 	connf_t		*bind_connfp;
2843 	conn_t		*connp;
2844 	tcp_t		*tcp;
2845 
2846 	/*
2847 	 * Avoid false matches for packets sent to an IP destination of
2848 	 * all zeros.
2849 	 */
2850 	if (laddr == 0)
2851 		return (NULL);
2852 
2853 	ASSERT(zoneid != ALL_ZONES);
2854 
2855 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2856 	mutex_enter(&bind_connfp->connf_lock);
2857 	for (connp = bind_connfp->connf_head; connp != NULL;
2858 	    connp = connp->conn_next) {
2859 		tcp = connp->conn_tcp;
2860 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2861 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2862 		    (tcp->tcp_listener == NULL)) {
2863 			CONN_INC_REF(connp);
2864 			mutex_exit(&bind_connfp->connf_lock);
2865 			return (connp);
2866 		}
2867 	}
2868 	mutex_exit(&bind_connfp->connf_lock);
2869 	return (NULL);
2870 }
2871 
2872 /*
2873  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2874  * a listener when changing state.
2875  */
2876 conn_t *
2877 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2878     zoneid_t zoneid, ip_stack_t *ipst)
2879 {
2880 	connf_t		*bind_connfp;
2881 	conn_t		*connp = NULL;
2882 	tcp_t		*tcp;
2883 
2884 	/*
2885 	 * Avoid false matches for packets sent to an IP destination of
2886 	 * all zeros.
2887 	 */
2888 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2889 		return (NULL);
2890 
2891 	ASSERT(zoneid != ALL_ZONES);
2892 
2893 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2894 	mutex_enter(&bind_connfp->connf_lock);
2895 	for (connp = bind_connfp->connf_head; connp != NULL;
2896 	    connp = connp->conn_next) {
2897 		tcp = connp->conn_tcp;
2898 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2899 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2900 		    (tcp->tcp_bound_if == 0 ||
2901 		    tcp->tcp_bound_if == ifindex) &&
2902 		    tcp->tcp_listener == NULL) {
2903 			CONN_INC_REF(connp);
2904 			mutex_exit(&bind_connfp->connf_lock);
2905 			return (connp);
2906 		}
2907 	}
2908 	mutex_exit(&bind_connfp->connf_lock);
2909 	return (NULL);
2910 }
2911 
2912 /*
2913  * ipcl_get_next_conn
2914  *	get the next entry in the conn global list
2915  *	and put a reference on the next_conn.
2916  *	decrement the reference on the current conn.
2917  *
2918  * This is an iterator based walker function that also provides for
2919  * some selection by the caller. It walks through the conn_hash bucket
2920  * searching for the next valid connp in the list, and selects connections
2921  * that are neither closed nor condemned. It also REFHOLDS the conn
2922  * thus ensuring that the conn exists when the caller uses the conn.
2923  */
2924 conn_t *
2925 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2926 {
2927 	conn_t	*next_connp;
2928 
2929 	if (connfp == NULL)
2930 		return (NULL);
2931 
2932 	mutex_enter(&connfp->connf_lock);
2933 
2934 	next_connp = (connp == NULL) ?
2935 	    connfp->connf_head : connp->conn_g_next;
2936 
2937 	while (next_connp != NULL) {
2938 		mutex_enter(&next_connp->conn_lock);
2939 		if (!(next_connp->conn_flags & conn_flags) ||
2940 		    (next_connp->conn_state_flags &
2941 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2942 			/*
2943 			 * This conn has been condemned or
2944 			 * is closing, or the flags don't match
2945 			 */
2946 			mutex_exit(&next_connp->conn_lock);
2947 			next_connp = next_connp->conn_g_next;
2948 			continue;
2949 		}
2950 		CONN_INC_REF_LOCKED(next_connp);
2951 		mutex_exit(&next_connp->conn_lock);
2952 		break;
2953 	}
2954 
2955 	mutex_exit(&connfp->connf_lock);
2956 
2957 	if (connp != NULL)
2958 		CONN_DEC_REF(connp);
2959 
2960 	return (next_connp);
2961 }
2962 
2963 #ifdef CONN_DEBUG
2964 /*
2965  * Trace of the last NBUF refhold/refrele
2966  */
2967 int
2968 conn_trace_ref(conn_t *connp)
2969 {
2970 	int	last;
2971 	conn_trace_t	*ctb;
2972 
2973 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2974 	last = connp->conn_trace_last;
2975 	last++;
2976 	if (last == CONN_TRACE_MAX)
2977 		last = 0;
2978 
2979 	ctb = &connp->conn_trace_buf[last];
2980 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2981 	connp->conn_trace_last = last;
2982 	return (1);
2983 }
2984 
2985 int
2986 conn_untrace_ref(conn_t *connp)
2987 {
2988 	int	last;
2989 	conn_trace_t	*ctb;
2990 
2991 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2992 	last = connp->conn_trace_last;
2993 	last++;
2994 	if (last == CONN_TRACE_MAX)
2995 		last = 0;
2996 
2997 	ctb = &connp->conn_trace_buf[last];
2998 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2999 	connp->conn_trace_last = last;
3000 	return (1);
3001 }
3002 #endif
3003