xref: /titanic_44/usr/src/uts/common/inet/ip/ipclassifier.c (revision 8339b41da2395f0525c46ceedfeb01961893ec44)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * IP PACKET CLASSIFIER
27  *
28  * The IP packet classifier provides mapping between IP packets and persistent
29  * connection state for connection-oriented protocols. It also provides
30  * interface for managing connection states.
31  *
32  * The connection state is kept in conn_t data structure and contains, among
33  * other things:
34  *
35  *	o local/remote address and ports
36  *	o Transport protocol
37  *	o squeue for the connection (for TCP only)
38  *	o reference counter
39  *	o Connection state
40  *	o hash table linkage
41  *	o interface/ire information
42  *	o credentials
43  *	o ipsec policy
44  *	o send and receive functions.
45  *	o mutex lock.
46  *
47  * Connections use a reference counting scheme. They are freed when the
48  * reference counter drops to zero. A reference is incremented when connection
49  * is placed in a list or table, when incoming packet for the connection arrives
50  * and when connection is processed via squeue (squeue processing may be
51  * asynchronous and the reference protects the connection from being destroyed
52  * before its processing is finished).
53  *
54  * conn_recv is used to pass up packets to the ULP.
55  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56  * a listener, and changes to tcp_input_listener as the listener has picked a
57  * good squeue. For other cases it is set to tcp_input_data.
58  *
59  * conn_recvicmp is used to pass up ICMP errors to the ULP.
60  *
61  * Classifier uses several hash tables:
62  *
63  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
64  *	ipcl_bind_fanout:	contains all connections in BOUND state
65  *	ipcl_proto_fanout:	IPv4 protocol fanout
66  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
67  *	ipcl_udp_fanout:	contains all UDP connections
68  *	ipcl_iptun_fanout:	contains all IP tunnel connections
69  *	ipcl_globalhash_fanout:	contains all connections
70  *
71  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72  * which need to view all existing connections.
73  *
74  * All tables are protected by per-bucket locks. When both per-bucket lock and
75  * connection lock need to be held, the per-bucket lock should be acquired
76  * first, followed by the connection lock.
77  *
78  * All functions doing search in one of these tables increment a reference
79  * counter on the connection found (if any). This reference should be dropped
80  * when the caller has finished processing the connection.
81  *
82  *
83  * INTERFACES:
84  * ===========
85  *
86  * Connection Lookup:
87  * ------------------
88  *
89  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
91  *
92  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93  * it can't find any associated connection. If the connection is found, its
94  * reference counter is incremented.
95  *
96  *	mp:	mblock, containing packet header. The full header should fit
97  *		into a single mblock. It should also contain at least full IP
98  *		and TCP or UDP header.
99  *
100  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
101  *
102  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
103  *		 the packet.
104  *
105  * 	ira->ira_zoneid: The zone in which the returned connection must be; the
106  *		zoneid corresponding to the ire_zoneid on the IRE located for
107  *		the packet's destination address.
108  *
109  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110  *		IRAF_TX_SHARED_ADDR flags
111  *
112  *	For TCP connections, the lookup order is as follows:
113  *		5-tuple {src, dst, protocol, local port, remote port}
114  *			lookup in ipcl_conn_fanout table.
115  *		3-tuple {dst, remote port, protocol} lookup in
116  *			ipcl_bind_fanout table.
117  *
118  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
119  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
120  *	these interfaces do not handle cases where a packet belongs
121  *	to multiple UDP clients, which is handled in IP itself.
122  *
123  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
124  * determine which actual zone gets the segment.  This is used only in a
125  * labeled environment.  The matching rules are:
126  *
127  *	- If it's not a multilevel port, then the label on the packet selects
128  *	  the zone.  Unlabeled packets are delivered to the global zone.
129  *
130  *	- If it's a multilevel port, then only the zone registered to receive
131  *	  packets on that port matches.
132  *
133  * Also, in a labeled environment, packet labels need to be checked.  For fully
134  * bound TCP connections, we can assume that the packet label was checked
135  * during connection establishment, and doesn't need to be checked on each
136  * packet.  For others, though, we need to check for strict equality or, for
137  * multilevel ports, membership in the range or set.  This part currently does
138  * a tnrh lookup on each packet, but could be optimized to use cached results
139  * if that were necessary.  (SCTP doesn't come through here, but if it did,
140  * we would apply the same rules as TCP.)
141  *
142  * An implication of the above is that fully-bound TCP sockets must always use
143  * distinct 4-tuples; they can't be discriminated by label alone.
144  *
145  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
146  * as there's no connection set-up handshake and no shared state.
147  *
148  * Labels on looped-back packets within a single zone do not need to be
149  * checked, as all processes in the same zone have the same label.
150  *
151  * Finally, for unlabeled packets received by a labeled system, special rules
152  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
153  * socket in the zone whose label matches the default label of the sender, if
154  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
155  * receiver's label must dominate the sender's default label.
156  *
157  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
158  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
159  *					 ip_stack);
160  *
161  *	Lookup routine to find an exact match for {src, dst, local port,
162  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
163  *	ports are read from the IP and TCP header respectively.
164  *
165  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
166  *					 zoneid, ip_stack);
167  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
168  *					 zoneid, ip_stack);
169  *
170  * 	Lookup routine to find a listener with the tuple {lport, laddr,
171  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
172  * 	parameter interface index is also compared.
173  *
174  * void ipcl_walk(func, arg, ip_stack)
175  *
176  * 	Apply 'func' to every connection available. The 'func' is called as
177  *	(*func)(connp, arg). The walk is non-atomic so connections may be
178  *	created and destroyed during the walk. The CONN_CONDEMNED and
179  *	CONN_INCIPIENT flags ensure that connections which are newly created
180  *	or being destroyed are not selected by the walker.
181  *
182  * Table Updates
183  * -------------
184  *
185  * int ipcl_conn_insert(connp);
186  * int ipcl_conn_insert_v4(connp);
187  * int ipcl_conn_insert_v6(connp);
188  *
189  *	Insert 'connp' in the ipcl_conn_fanout.
190  *	Arguments :
191  *		connp		conn_t to be inserted
192  *
193  *	Return value :
194  *		0		if connp was inserted
195  *		EADDRINUSE	if the connection with the same tuple
196  *				already exists.
197  *
198  * int ipcl_bind_insert(connp);
199  * int ipcl_bind_insert_v4(connp);
200  * int ipcl_bind_insert_v6(connp);
201  *
202  * 	Insert 'connp' in ipcl_bind_fanout.
203  * 	Arguments :
204  * 		connp		conn_t to be inserted
205  *
206  *
207  * void ipcl_hash_remove(connp);
208  *
209  * 	Removes the 'connp' from the connection fanout table.
210  *
211  * Connection Creation/Destruction
212  * -------------------------------
213  *
214  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
215  *
216  * 	Creates a new conn based on the type flag, inserts it into
217  * 	globalhash table.
218  *
219  *	type:	This flag determines the type of conn_t which needs to be
220  *		created i.e., which kmem_cache it comes from.
221  *		IPCL_TCPCONN	indicates a TCP connection
222  *		IPCL_SCTPCONN	indicates a SCTP connection
223  *		IPCL_UDPCONN	indicates a UDP conn_t.
224  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
225  *		IPCL_RTSCONN	indicates a RTS conn_t.
226  *		IPCL_IPCCONN	indicates all other connections.
227  *
228  * void ipcl_conn_destroy(connp)
229  *
230  * 	Destroys the connection state, removes it from the global
231  * 	connection hash table and frees its memory.
232  */
233 
234 #include <sys/types.h>
235 #include <sys/stream.h>
236 #include <sys/stropts.h>
237 #include <sys/sysmacros.h>
238 #include <sys/strsubr.h>
239 #include <sys/strsun.h>
240 #define	_SUN_TPI_VERSION 2
241 #include <sys/ddi.h>
242 #include <sys/cmn_err.h>
243 #include <sys/debug.h>
244 
245 #include <sys/systm.h>
246 #include <sys/param.h>
247 #include <sys/kmem.h>
248 #include <sys/isa_defs.h>
249 #include <inet/common.h>
250 #include <netinet/ip6.h>
251 #include <netinet/icmp6.h>
252 
253 #include <inet/ip.h>
254 #include <inet/ip_if.h>
255 #include <inet/ip_ire.h>
256 #include <inet/ip6.h>
257 #include <inet/ip_ndp.h>
258 #include <inet/ip_impl.h>
259 #include <inet/udp_impl.h>
260 #include <inet/sctp_ip.h>
261 #include <inet/sctp/sctp_impl.h>
262 #include <inet/rawip_impl.h>
263 #include <inet/rts_impl.h>
264 #include <inet/iptun/iptun_impl.h>
265 
266 #include <sys/cpuvar.h>
267 
268 #include <inet/ipclassifier.h>
269 #include <inet/tcp.h>
270 #include <inet/ipsec_impl.h>
271 
272 #include <sys/tsol/tnet.h>
273 #include <sys/sockio.h>
274 
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/*
 * Used only when both of the sizes above are zero: ipcl_init() divides the
 * amount of free memory in bytes (freemem * PAGESIZE) by this factor to pick
 * a starting size for the conn fanout table.
 */
uint_t ipcl_conn_hash_memfactor = 8192;
/*
 * Upper bound ipcl_init() applies to the automatically computed size, before
 * that size is rounded up to one of the P2Ps() primes.
 */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;
296 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * P2Ps() expands to a brace-enclosed initializer list whose element at
 * index N is the prime for 2^N.  The entries for N = 0..2 are 0, and one
 * extra trailing 0 follows the N = 28 entry; ipcl_init() treats a selected
 * value of 0 as "out of range" and falls back to the N = 16 entry.
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
306 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * The union is as large as the bigger of a conn_t and the filler array.
 * ipcl_g_init() sizes the tcp/udp/rawip/rts cache objects as
 * sizeof (itc_t) plus the size of the protocol-private structure.
 * NOTE(review): CACHE_ALIGN() is defined elsewhere; presumably it rounds the
 * size of struct conn_s up to a cache-line multiple -- confirm.
 */
typedef union itc_s {
	conn_t	itc_conn;	/* the connection itself */
	char	itcu_filler[CACHE_ALIGN(conn_s)];	/* padding to full size */
} itc_t;
315 
/*
 * kmem caches from which conn_t's are allocated, one per connection type.
 * All but sctp_conn_cache are created in ipcl_g_init() and destroyed in
 * ipcl_g_destroy(); sctp_conn_cache is defined outside this file.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk helpers, implemented outside this file. */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/*
 * Constructor/destructor pairs handed to kmem_cache_create() in
 * ipcl_g_init() for the corresponding cache.
 */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);
340 
341 /*
342  * Global (for all stack instances) init routine
343  */
344 void
345 ipcl_g_init(void)
346 {
347 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
348 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
349 	    ip_conn_constructor, ip_conn_destructor,
350 	    NULL, NULL, NULL, 0);
351 
352 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
353 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
354 	    tcp_conn_constructor, tcp_conn_destructor,
355 	    tcp_conn_reclaim, NULL, NULL, 0);
356 
357 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
358 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
359 	    udp_conn_constructor, udp_conn_destructor,
360 	    NULL, NULL, NULL, 0);
361 
362 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
363 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
364 	    rawip_conn_constructor, rawip_conn_destructor,
365 	    NULL, NULL, NULL, 0);
366 
367 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
368 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
369 	    rts_conn_constructor, rts_conn_destructor,
370 	    NULL, NULL, NULL, 0);
371 }
372 
373 /*
374  * ipclassifier intialization routine, sets up hash tables.
375  */
376 void
377 ipcl_init(ip_stack_t *ipst)
378 {
379 	int i;
380 	int sizes[] = P2Ps();
381 
382 	/*
383 	 * Calculate size of conn fanout table from /etc/system settings
384 	 */
385 	if (ipcl_conn_hash_size != 0) {
386 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
387 	} else if (tcp_conn_hash_size != 0) {
388 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
389 	} else {
390 		extern pgcnt_t freemem;
391 
392 		ipst->ips_ipcl_conn_fanout_size =
393 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
394 
395 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
396 			ipst->ips_ipcl_conn_fanout_size =
397 			    ipcl_conn_hash_maxsize;
398 		}
399 	}
400 
401 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
402 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
403 			break;
404 		}
405 	}
406 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
407 		/* Out of range, use the 2^16 value */
408 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
409 	}
410 
411 	/* Take values from /etc/system */
412 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
413 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
414 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
415 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
416 
417 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
418 
419 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
420 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
421 
422 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
423 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
424 		    MUTEX_DEFAULT, NULL);
425 	}
426 
427 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
428 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
429 
430 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
431 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
432 		    MUTEX_DEFAULT, NULL);
433 	}
434 
435 	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
436 	    sizeof (connf_t), KM_SLEEP);
437 	for (i = 0; i < IPPROTO_MAX; i++) {
438 		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
439 		    MUTEX_DEFAULT, NULL);
440 	}
441 
442 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
443 	    sizeof (connf_t), KM_SLEEP);
444 	for (i = 0; i < IPPROTO_MAX; i++) {
445 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
446 		    MUTEX_DEFAULT, NULL);
447 	}
448 
449 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
450 	mutex_init(&ipst->ips_rts_clients->connf_lock,
451 	    NULL, MUTEX_DEFAULT, NULL);
452 
453 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
454 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
455 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
456 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
457 		    MUTEX_DEFAULT, NULL);
458 	}
459 
460 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
461 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
462 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
463 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
464 		    MUTEX_DEFAULT, NULL);
465 	}
466 
467 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
468 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
469 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
470 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
471 		    MUTEX_DEFAULT, NULL);
472 	}
473 
474 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
475 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
476 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
477 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
478 		    NULL, MUTEX_DEFAULT, NULL);
479 	}
480 }
481 
482 void
483 ipcl_g_destroy(void)
484 {
485 	kmem_cache_destroy(ip_conn_cache);
486 	kmem_cache_destroy(tcp_conn_cache);
487 	kmem_cache_destroy(udp_conn_cache);
488 	kmem_cache_destroy(rawip_conn_cache);
489 	kmem_cache_destroy(rts_conn_cache);
490 }
491 
492 /*
493  * All user-level and kernel use of the stack must be gone
494  * by now.
495  */
496 void
497 ipcl_destroy(ip_stack_t *ipst)
498 {
499 	int i;
500 
501 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
502 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
503 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
504 	}
505 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
506 	    sizeof (connf_t));
507 	ipst->ips_ipcl_conn_fanout = NULL;
508 
509 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
510 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
511 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
512 	}
513 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
514 	    sizeof (connf_t));
515 	ipst->ips_ipcl_bind_fanout = NULL;
516 
517 	for (i = 0; i < IPPROTO_MAX; i++) {
518 		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
519 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
520 	}
521 	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
522 	    IPPROTO_MAX * sizeof (connf_t));
523 	ipst->ips_ipcl_proto_fanout_v4 = NULL;
524 
525 	for (i = 0; i < IPPROTO_MAX; i++) {
526 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
527 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
528 	}
529 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
530 	    IPPROTO_MAX * sizeof (connf_t));
531 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
532 
533 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
534 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
535 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
536 	}
537 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
538 	    sizeof (connf_t));
539 	ipst->ips_ipcl_udp_fanout = NULL;
540 
541 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
542 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
543 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
544 	}
545 	kmem_free(ipst->ips_ipcl_iptun_fanout,
546 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
547 	ipst->ips_ipcl_iptun_fanout = NULL;
548 
549 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
550 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
551 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
552 	}
553 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
554 	    sizeof (connf_t));
555 	ipst->ips_ipcl_raw_fanout = NULL;
556 
557 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
558 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
559 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
560 	}
561 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
562 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
563 	ipst->ips_ipcl_globalhash_fanout = NULL;
564 
565 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
566 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
567 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
568 	ipst->ips_rts_clients = NULL;
569 }
570 
571 /*
572  * conn creation routine. initialize the conn, sets the reference
573  * and inserts it in the global hash table.
574  */
575 conn_t *
576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
577 {
578 	conn_t	*connp;
579 	struct kmem_cache *conn_cache;
580 
581 	switch (type) {
582 	case IPCL_SCTPCONN:
583 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
584 			return (NULL);
585 		sctp_conn_init(connp);
586 		netstack_hold(ns);
587 		connp->conn_netstack = ns;
588 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
589 		ipcl_globalhash_insert(connp);
590 		return (connp);
591 
592 	case IPCL_TCPCONN:
593 		conn_cache = tcp_conn_cache;
594 		break;
595 
596 	case IPCL_UDPCONN:
597 		conn_cache = udp_conn_cache;
598 		break;
599 
600 	case IPCL_RAWIPCONN:
601 		conn_cache = rawip_conn_cache;
602 		break;
603 
604 	case IPCL_RTSCONN:
605 		conn_cache = rts_conn_cache;
606 		break;
607 
608 	case IPCL_IPCCONN:
609 		conn_cache = ip_conn_cache;
610 		break;
611 
612 	default:
613 		connp = NULL;
614 		ASSERT(0);
615 	}
616 
617 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
618 		return (NULL);
619 
620 	connp->conn_ref = 1;
621 	netstack_hold(ns);
622 	connp->conn_netstack = ns;
623 	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
624 	ipcl_globalhash_insert(connp);
625 	return (connp);
626 }
627 
/*
 * Final teardown of a conn_t.  The caller must not hold conn_lock and the
 * reference count must already be zero (both asserted).
 *
 * Releases the credential, the header template buffer, the transmit
 * ip_pkt_t contents, the IPsec latch/policy references and the IPsec option
 * mblk; removes the conn from the global hash; drops the netstack hold; and
 * finally returns the object to the kmem cache selected by its type bit in
 * conn_flags.  SCTP conns are handed to sctp_free() instead.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	/* Captured up front; conn_netstack is cleared further down. */
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the header template buffer and reset everything describing it. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Drop the IPsec references held by the conn. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t *tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Saved here and restored after the tcp_t is zeroed below. */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Only the type bit is kept in conn_flags. */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/*
		 * Wipe the tcp_t, then put back the two fields that must
		 * survive: the saved timer cache and the back pointer to
		 * the conn.
		 */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* The protocol is reset to ICMP on both the conn and its ixa. */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		/*
		 * NOTE(review): the first ASSERT below runs after conn_flags
		 * has just been overwritten with IPCL_IPCCONN, so it cannot
		 * fail; it does not verify the conn's original type.
		 */
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
746 
747 /*
748  * Running in cluster mode - deregister listener information
749  */
750 static void
751 ipcl_conn_unlisten(conn_t *connp)
752 {
753 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
754 	ASSERT(connp->conn_lport != 0);
755 
756 	if (cl_inet_unlisten != NULL) {
757 		sa_family_t	addr_family;
758 		uint8_t		*laddrp;
759 
760 		if (connp->conn_ipversion == IPV6_VERSION) {
761 			addr_family = AF_INET6;
762 			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
763 		} else {
764 			addr_family = AF_INET;
765 			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
766 		}
767 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
768 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
769 	}
770 	connp->conn_flags &= ~IPCL_CL_LISTENER;
771 }
772 
/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * IPCL_HASH_REMOVE(connp) unlinks 'connp' from the fanout bucket recorded in
 * conn_fanout; it does nothing when conn_fanout is NULL.  The caller must
 * not hold conn_lock (asserted).  With the bucket lock held, the conn is
 * unlinked from the doubly linked list (updating connf_head when it was the
 * first entry), its linkage fields are cleared, IPCL_REMOVED is set,
 * ipcl_conn_unlisten() is called if IPCL_CL_LISTENER is set, and the
 * reference is dropped with CONN_DEC_REF().
 *
 * The macro expands to a brace-enclosed block that declares a local
 * variable named 'connfp', and it evaluates 'connp' several times.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
801 
802 void
803 ipcl_hash_remove(conn_t *connp)
804 {
805 	uint8_t		protocol = connp->conn_proto;
806 
807 	IPCL_HASH_REMOVE(connp);
808 	if (protocol == IPPROTO_RSVP)
809 		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
810 }
811 
812 /*
813  * The whole purpose of this function is allow removal of
814  * a conn_t from the connected hash for timewait reclaim.
815  * This is essentially a TW reclaim fastpath where timewait
816  * collector checks under fanout lock (so no one else can
817  * get access to the conn_t) that refcnt is 2 i.e. one for
818  * TCP and one for the classifier hash list. If ref count
819  * is indeed 2, we can just remove the conn under lock and
820  * avoid cleaning up the conn under squeue. This gives us
821  * improved performance.
822  */
823 void
824 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
825 {
826 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
827 	ASSERT(MUTEX_HELD(&connp->conn_lock));
828 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
829 
830 	if ((connp)->conn_next != NULL) {
831 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
832 	}
833 	if ((connp)->conn_prev != NULL) {
834 		(connp)->conn_prev->conn_next = (connp)->conn_next;
835 	} else {
836 		connfp->connf_head = (connp)->conn_next;
837 	}
838 	(connp)->conn_fanout = NULL;
839 	(connp)->conn_next = NULL;
840 	(connp)->conn_prev = NULL;
841 	(connp)->conn_flags |= IPCL_REMOVED;
842 	ASSERT((connp)->conn_ref == 2);
843 	(connp)->conn_ref--;
844 }
845 
/*
 * Link 'connp' at the head of bucket 'connfp'.  The conn must not currently
 * be on any list (asserted).  Replaces IPCL_REMOVED with IPCL_CONNECTED in
 * conn_flags and takes a reference with CONN_INC_REF().  The macro itself
 * does not take the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
860 
/*
 * Move 'connp' to the head of bucket 'connfp': first remove it from any
 * bucket it is currently on (IPCL_HASH_REMOVE), then insert it while
 * holding the lock of 'connfp'.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
867 
/*
 * Move 'connp' into bucket 'connfp' as a bound conn.  After removing the
 * conn from any bucket it is currently on, the list is walked under the
 * bucket lock up to the first entry whose conn_laddr_v6 satisfies
 * _IPCL_V4_MATCH_ANY(); 'connp' is linked immediately before that entry, or
 * at the tail when there is none.  Replaces IPCL_REMOVED with IPCL_BOUND in
 * conn_flags and takes a reference with CONN_INC_REF().
 *
 * The macro declares locals named 'pconnp' and 'nconnp'.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
894 
/*
 * Move 'connp' into bucket 'connfp' as a wildcard-bound conn.
 *
 * The conn is first removed from any bucket it is currently on.  Then,
 * under the bucket lock:
 *   - if the conn's local address is IPv4-mapped, it is linked immediately
 *     before the first entry in the same zone whose local address is the
 *     IPv6 unspecified address;
 *   - otherwise, or when there is no such entry, it is appended at the tail.
 * IPCL_REMOVED is replaced with IPCL_BOUND in conn_flags and a reference is
 * taken with CONN_INC_REF().
 *
 * While walking, 'prev' always equals next->conn_prev, so it needs no
 * adjustment at the insertion point.  The macro declares locals named
 * 'list', 'prev', 'next' and 'isv4mapped' and evaluates 'connp' several
 * times.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
924 
925 void
926 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
927 {
928 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
929 }
930 
931 /*
932  * Because the classifier is used to classify inbound packets, the destination
933  * address is meant to be our local tunnel address (tunnel source), and the
934  * source the remote tunnel address (tunnel destination).
935  *
936  * Note that conn_proto can't be used for fanout since the upper protocol
937  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
938  */
939 conn_t *
940 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
941 {
942 	connf_t	*connfp;
943 	conn_t	*connp;
944 
945 	/* first look for IPv4 tunnel links */
946 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
947 	mutex_enter(&connfp->connf_lock);
948 	for (connp = connfp->connf_head; connp != NULL;
949 	    connp = connp->conn_next) {
950 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
951 			break;
952 	}
953 	if (connp != NULL)
954 		goto done;
955 
956 	mutex_exit(&connfp->connf_lock);
957 
958 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
959 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
960 	    INADDR_ANY)];
961 	mutex_enter(&connfp->connf_lock);
962 	for (connp = connfp->connf_head; connp != NULL;
963 	    connp = connp->conn_next) {
964 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
965 			break;
966 	}
967 done:
968 	if (connp != NULL)
969 		CONN_INC_REF(connp);
970 	mutex_exit(&connfp->connf_lock);
971 	return (connp);
972 }
973 
974 conn_t *
975 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
976 {
977 	connf_t	*connfp;
978 	conn_t	*connp;
979 
980 	/* Look for an IPv6 tunnel link */
981 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
982 	mutex_enter(&connfp->connf_lock);
983 	for (connp = connfp->connf_head; connp != NULL;
984 	    connp = connp->conn_next) {
985 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
986 			CONN_INC_REF(connp);
987 			break;
988 		}
989 	}
990 	mutex_exit(&connfp->connf_lock);
991 	return (connp);
992 }
993 
994 /*
995  * This function is used only for inserting SCTP raw socket now.
996  * This may change later.
997  *
998  * Note that only one raw socket can be bound to a port.  The param
999  * lport is in network byte order.
1000  */
1001 static int
1002 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1003 {
1004 	connf_t	*connfp;
1005 	conn_t	*oconnp;
1006 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1007 
1008 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1009 
1010 	/* Check for existing raw socket already bound to the port. */
1011 	mutex_enter(&connfp->connf_lock);
1012 	for (oconnp = connfp->connf_head; oconnp != NULL;
1013 	    oconnp = oconnp->conn_next) {
1014 		if (oconnp->conn_lport == lport &&
1015 		    oconnp->conn_zoneid == connp->conn_zoneid &&
1016 		    oconnp->conn_family == connp->conn_family &&
1017 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1018 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1019 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1020 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1021 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1022 		    &connp->conn_laddr_v6))) {
1023 			break;
1024 		}
1025 	}
1026 	mutex_exit(&connfp->connf_lock);
1027 	if (oconnp != NULL)
1028 		return (EADDRNOTAVAIL);
1029 
1030 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1031 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1032 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1033 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1034 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1035 		} else {
1036 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1037 		}
1038 	} else {
1039 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1040 	}
1041 	return (0);
1042 }
1043 
1044 static int
1045 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1046 {
1047 	connf_t	*connfp;
1048 	conn_t	*tconnp;
1049 	ipaddr_t laddr = connp->conn_laddr_v4;
1050 	ipaddr_t faddr = connp->conn_faddr_v4;
1051 
1052 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1053 	mutex_enter(&connfp->connf_lock);
1054 	for (tconnp = connfp->connf_head; tconnp != NULL;
1055 	    tconnp = tconnp->conn_next) {
1056 		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1057 			/* A tunnel is already bound to these addresses. */
1058 			mutex_exit(&connfp->connf_lock);
1059 			return (EADDRINUSE);
1060 		}
1061 	}
1062 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1063 	mutex_exit(&connfp->connf_lock);
1064 	return (0);
1065 }
1066 
1067 static int
1068 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1069 {
1070 	connf_t	*connfp;
1071 	conn_t	*tconnp;
1072 	in6_addr_t *laddr = &connp->conn_laddr_v6;
1073 	in6_addr_t *faddr = &connp->conn_faddr_v6;
1074 
1075 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1076 	mutex_enter(&connfp->connf_lock);
1077 	for (tconnp = connfp->connf_head; tconnp != NULL;
1078 	    tconnp = tconnp->conn_next) {
1079 		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1080 			/* A tunnel is already bound to these addresses. */
1081 			mutex_exit(&connfp->connf_lock);
1082 			return (EADDRINUSE);
1083 		}
1084 	}
1085 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1086 	mutex_exit(&connfp->connf_lock);
1087 	return (0);
1088 }
1089 
1090 /*
1091  * Check for a MAC exemption conflict on a labeled system.  Note that for
1092  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1093  * transport layer.  This check is for binding all other protocols.
1094  *
1095  * Returns true if there's a conflict.
1096  */
1097 static boolean_t
1098 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1099 {
1100 	connf_t	*connfp;
1101 	conn_t *tconn;
1102 
1103 	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1104 	mutex_enter(&connfp->connf_lock);
1105 	for (tconn = connfp->connf_head; tconn != NULL;
1106 	    tconn = tconn->conn_next) {
1107 		/* We don't allow v4 fallback for v6 raw socket */
1108 		if (connp->conn_family != tconn->conn_family)
1109 			continue;
1110 		/* If neither is exempt, then there's no conflict */
1111 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1112 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1113 			continue;
1114 		/* We are only concerned about sockets for a different zone */
1115 		if (connp->conn_zoneid == tconn->conn_zoneid)
1116 			continue;
1117 		/* If both are bound to different specific addrs, ok */
1118 		if (connp->conn_laddr_v4 != INADDR_ANY &&
1119 		    tconn->conn_laddr_v4 != INADDR_ANY &&
1120 		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1121 			continue;
1122 		/* These two conflict; fail */
1123 		break;
1124 	}
1125 	mutex_exit(&connfp->connf_lock);
1126 	return (tconn != NULL);
1127 }
1128 
1129 static boolean_t
1130 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1131 {
1132 	connf_t	*connfp;
1133 	conn_t *tconn;
1134 
1135 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1136 	mutex_enter(&connfp->connf_lock);
1137 	for (tconn = connfp->connf_head; tconn != NULL;
1138 	    tconn = tconn->conn_next) {
1139 		/* We don't allow v4 fallback for v6 raw socket */
1140 		if (connp->conn_family != tconn->conn_family)
1141 			continue;
1142 		/* If neither is exempt, then there's no conflict */
1143 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1144 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1145 			continue;
1146 		/* We are only concerned about sockets for a different zone */
1147 		if (connp->conn_zoneid == tconn->conn_zoneid)
1148 			continue;
1149 		/* If both are bound to different addrs, ok */
1150 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1151 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1152 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1153 		    &tconn->conn_laddr_v6))
1154 			continue;
1155 		/* These two conflict; fail */
1156 		break;
1157 	}
1158 	mutex_exit(&connfp->connf_lock);
1159 	return (tconn != NULL);
1160 }
1161 
1162 /*
1163  * (v4, v6) bind hash insertion routines
1164  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1165  */
1166 
1167 int
1168 ipcl_bind_insert(conn_t *connp)
1169 {
1170 	if (connp->conn_ipversion == IPV6_VERSION)
1171 		return (ipcl_bind_insert_v6(connp));
1172 	else
1173 		return (ipcl_bind_insert_v4(connp));
1174 }
1175 
1176 int
1177 ipcl_bind_insert_v4(conn_t *connp)
1178 {
1179 	connf_t	*connfp;
1180 	int	ret = 0;
1181 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1182 	uint16_t	lport = connp->conn_lport;
1183 	uint8_t		protocol = connp->conn_proto;
1184 
1185 	if (IPCL_IS_IPTUN(connp))
1186 		return (ipcl_iptun_hash_insert(connp, ipst));
1187 
1188 	switch (protocol) {
1189 	default:
1190 		if (is_system_labeled() &&
1191 		    check_exempt_conflict_v4(connp, ipst))
1192 			return (EADDRINUSE);
1193 		/* FALLTHROUGH */
1194 	case IPPROTO_UDP:
1195 		if (protocol == IPPROTO_UDP) {
1196 			connfp = &ipst->ips_ipcl_udp_fanout[
1197 			    IPCL_UDP_HASH(lport, ipst)];
1198 		} else {
1199 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1200 		}
1201 
1202 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1203 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1204 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1205 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1206 		} else {
1207 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1208 		}
1209 		if (protocol == IPPROTO_RSVP)
1210 			ill_set_inputfn_all(ipst);
1211 		break;
1212 
1213 	case IPPROTO_TCP:
1214 		/* Insert it in the Bind Hash */
1215 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1216 		connfp = &ipst->ips_ipcl_bind_fanout[
1217 		    IPCL_BIND_HASH(lport, ipst)];
1218 		if (connp->conn_laddr_v4 != INADDR_ANY) {
1219 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1220 		} else {
1221 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1222 		}
1223 		if (cl_inet_listen != NULL) {
1224 			ASSERT(connp->conn_ipversion == IPV4_VERSION);
1225 			connp->conn_flags |= IPCL_CL_LISTENER;
1226 			(*cl_inet_listen)(
1227 			    connp->conn_netstack->netstack_stackid,
1228 			    IPPROTO_TCP, AF_INET,
1229 			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1230 		}
1231 		break;
1232 
1233 	case IPPROTO_SCTP:
1234 		ret = ipcl_sctp_hash_insert(connp, lport);
1235 		break;
1236 	}
1237 
1238 	return (ret);
1239 }
1240 
1241 int
1242 ipcl_bind_insert_v6(conn_t *connp)
1243 {
1244 	connf_t		*connfp;
1245 	int		ret = 0;
1246 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1247 	uint16_t	lport = connp->conn_lport;
1248 	uint8_t		protocol = connp->conn_proto;
1249 
1250 	if (IPCL_IS_IPTUN(connp)) {
1251 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1252 	}
1253 
1254 	switch (protocol) {
1255 	default:
1256 		if (is_system_labeled() &&
1257 		    check_exempt_conflict_v6(connp, ipst))
1258 			return (EADDRINUSE);
1259 		/* FALLTHROUGH */
1260 	case IPPROTO_UDP:
1261 		if (protocol == IPPROTO_UDP) {
1262 			connfp = &ipst->ips_ipcl_udp_fanout[
1263 			    IPCL_UDP_HASH(lport, ipst)];
1264 		} else {
1265 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1266 		}
1267 
1268 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1269 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1270 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1271 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1272 		} else {
1273 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1274 		}
1275 		break;
1276 
1277 	case IPPROTO_TCP:
1278 		/* Insert it in the Bind Hash */
1279 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1280 		connfp = &ipst->ips_ipcl_bind_fanout[
1281 		    IPCL_BIND_HASH(lport, ipst)];
1282 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1283 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1284 		} else {
1285 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1286 		}
1287 		if (cl_inet_listen != NULL) {
1288 			sa_family_t	addr_family;
1289 			uint8_t		*laddrp;
1290 
1291 			if (connp->conn_ipversion == IPV6_VERSION) {
1292 				addr_family = AF_INET6;
1293 				laddrp =
1294 				    (uint8_t *)&connp->conn_bound_addr_v6;
1295 			} else {
1296 				addr_family = AF_INET;
1297 				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1298 			}
1299 			connp->conn_flags |= IPCL_CL_LISTENER;
1300 			(*cl_inet_listen)(
1301 			    connp->conn_netstack->netstack_stackid,
1302 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1303 		}
1304 		break;
1305 
1306 	case IPPROTO_SCTP:
1307 		ret = ipcl_sctp_hash_insert(connp, lport);
1308 		break;
1309 	}
1310 
1311 	return (ret);
1312 }
1313 
1314 /*
1315  * ipcl_conn_hash insertion routines.
1316  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1317  */
1318 
1319 int
1320 ipcl_conn_insert(conn_t *connp)
1321 {
1322 	if (connp->conn_ipversion == IPV6_VERSION)
1323 		return (ipcl_conn_insert_v6(connp));
1324 	else
1325 		return (ipcl_conn_insert_v4(connp));
1326 }
1327 
1328 int
1329 ipcl_conn_insert_v4(conn_t *connp)
1330 {
1331 	connf_t		*connfp;
1332 	conn_t		*tconnp;
1333 	int		ret = 0;
1334 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1335 	uint16_t	lport = connp->conn_lport;
1336 	uint8_t		protocol = connp->conn_proto;
1337 
1338 	if (IPCL_IS_IPTUN(connp))
1339 		return (ipcl_iptun_hash_insert(connp, ipst));
1340 
1341 	switch (protocol) {
1342 	case IPPROTO_TCP:
1343 		/*
1344 		 * For TCP, we check whether the connection tuple already
1345 		 * exists before allowing the connection to proceed.  We
1346 		 * also allow indexing on the zoneid. This is to allow
1347 		 * multiple shared stack zones to have the same tcp
1348 		 * connection tuple. In practice this only happens for
1349 		 * INADDR_LOOPBACK as it's the only local address which
1350 		 * doesn't have to be unique.
1351 		 */
1352 		connfp = &ipst->ips_ipcl_conn_fanout[
1353 		    IPCL_CONN_HASH(connp->conn_faddr_v4,
1354 		    connp->conn_ports, ipst)];
1355 		mutex_enter(&connfp->connf_lock);
1356 		for (tconnp = connfp->connf_head; tconnp != NULL;
1357 		    tconnp = tconnp->conn_next) {
1358 			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1359 			    connp->conn_faddr_v4, connp->conn_laddr_v4,
1360 			    connp->conn_ports) &&
1361 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1362 				/* Already have a conn. bail out */
1363 				mutex_exit(&connfp->connf_lock);
1364 				return (EADDRINUSE);
1365 			}
1366 		}
1367 		if (connp->conn_fanout != NULL) {
1368 			/*
1369 			 * Probably a XTI/TLI application trying to do a
1370 			 * rebind. Let it happen.
1371 			 */
1372 			mutex_exit(&connfp->connf_lock);
1373 			IPCL_HASH_REMOVE(connp);
1374 			mutex_enter(&connfp->connf_lock);
1375 		}
1376 
1377 		ASSERT(connp->conn_recv != NULL);
1378 		ASSERT(connp->conn_recvicmp != NULL);
1379 
1380 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1381 		mutex_exit(&connfp->connf_lock);
1382 		break;
1383 
1384 	case IPPROTO_SCTP:
1385 		/*
1386 		 * The raw socket may have already been bound, remove it
1387 		 * from the hash first.
1388 		 */
1389 		IPCL_HASH_REMOVE(connp);
1390 		ret = ipcl_sctp_hash_insert(connp, lport);
1391 		break;
1392 
1393 	default:
1394 		/*
1395 		 * Check for conflicts among MAC exempt bindings.  For
1396 		 * transports with port numbers, this is done by the upper
1397 		 * level per-transport binding logic.  For all others, it's
1398 		 * done here.
1399 		 */
1400 		if (is_system_labeled() &&
1401 		    check_exempt_conflict_v4(connp, ipst))
1402 			return (EADDRINUSE);
1403 		/* FALLTHROUGH */
1404 
1405 	case IPPROTO_UDP:
1406 		if (protocol == IPPROTO_UDP) {
1407 			connfp = &ipst->ips_ipcl_udp_fanout[
1408 			    IPCL_UDP_HASH(lport, ipst)];
1409 		} else {
1410 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1411 		}
1412 
1413 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1414 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1415 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1416 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1417 		} else {
1418 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1419 		}
1420 		break;
1421 	}
1422 
1423 	return (ret);
1424 }
1425 
1426 int
1427 ipcl_conn_insert_v6(conn_t *connp)
1428 {
1429 	connf_t		*connfp;
1430 	conn_t		*tconnp;
1431 	int		ret = 0;
1432 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1433 	uint16_t	lport = connp->conn_lport;
1434 	uint8_t		protocol = connp->conn_proto;
1435 	uint_t		ifindex = connp->conn_bound_if;
1436 
1437 	if (IPCL_IS_IPTUN(connp))
1438 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1439 
1440 	switch (protocol) {
1441 	case IPPROTO_TCP:
1442 
1443 		/*
1444 		 * For tcp, we check whether the connection tuple already
1445 		 * exists before allowing the connection to proceed.  We
1446 		 * also allow indexing on the zoneid. This is to allow
1447 		 * multiple shared stack zones to have the same tcp
1448 		 * connection tuple. In practice this only happens for
1449 		 * ipv6_loopback as it's the only local address which
1450 		 * doesn't have to be unique.
1451 		 */
1452 		connfp = &ipst->ips_ipcl_conn_fanout[
1453 		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1454 		    ipst)];
1455 		mutex_enter(&connfp->connf_lock);
1456 		for (tconnp = connfp->connf_head; tconnp != NULL;
1457 		    tconnp = tconnp->conn_next) {
1458 			/* NOTE: need to match zoneid. Bug in onnv-gate */
1459 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1460 			    connp->conn_faddr_v6, connp->conn_laddr_v6,
1461 			    connp->conn_ports) &&
1462 			    (tconnp->conn_bound_if == 0 ||
1463 			    tconnp->conn_bound_if == ifindex) &&
1464 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1465 				/* Already have a conn. bail out */
1466 				mutex_exit(&connfp->connf_lock);
1467 				return (EADDRINUSE);
1468 			}
1469 		}
1470 		if (connp->conn_fanout != NULL) {
1471 			/*
1472 			 * Probably a XTI/TLI application trying to do a
1473 			 * rebind. Let it happen.
1474 			 */
1475 			mutex_exit(&connfp->connf_lock);
1476 			IPCL_HASH_REMOVE(connp);
1477 			mutex_enter(&connfp->connf_lock);
1478 		}
1479 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1480 		mutex_exit(&connfp->connf_lock);
1481 		break;
1482 
1483 	case IPPROTO_SCTP:
1484 		IPCL_HASH_REMOVE(connp);
1485 		ret = ipcl_sctp_hash_insert(connp, lport);
1486 		break;
1487 
1488 	default:
1489 		if (is_system_labeled() &&
1490 		    check_exempt_conflict_v6(connp, ipst))
1491 			return (EADDRINUSE);
1492 		/* FALLTHROUGH */
1493 	case IPPROTO_UDP:
1494 		if (protocol == IPPROTO_UDP) {
1495 			connfp = &ipst->ips_ipcl_udp_fanout[
1496 			    IPCL_UDP_HASH(lport, ipst)];
1497 		} else {
1498 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1499 		}
1500 
1501 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1502 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1503 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1504 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1505 		} else {
1506 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1507 		}
1508 		break;
1509 	}
1510 
1511 	return (ret);
1512 }
1513 
1514 /*
1515  * v4 packet classifying function. looks up the fanout table to
1516  * find the conn, the packet belongs to. returns the conn with
1517  * the reference held, null otherwise.
1518  *
1519  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1520  * Lookup" comment block are applied.  Labels are also checked as described
1521  * above.  If the packet is from the inside (looped back), and is from the same
1522  * zone, then label checks are omitted.
1523  */
1524 conn_t *
1525 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1526     ip_recv_attr_t *ira, ip_stack_t *ipst)
1527 {
1528 	ipha_t	*ipha;
1529 	connf_t	*connfp, *bind_connfp;
1530 	uint16_t lport;
1531 	uint16_t fport;
1532 	uint32_t ports;
1533 	conn_t	*connp;
1534 	uint16_t  *up;
1535 	zoneid_t	zoneid = ira->ira_zoneid;
1536 
1537 	ipha = (ipha_t *)mp->b_rptr;
1538 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1539 
1540 	switch (protocol) {
1541 	case IPPROTO_TCP:
1542 		ports = *(uint32_t *)up;
1543 		connfp =
1544 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1545 		    ports, ipst)];
1546 		mutex_enter(&connfp->connf_lock);
1547 		for (connp = connfp->connf_head; connp != NULL;
1548 		    connp = connp->conn_next) {
1549 			if (IPCL_CONN_MATCH(connp, protocol,
1550 			    ipha->ipha_src, ipha->ipha_dst, ports) &&
1551 			    (connp->conn_zoneid == zoneid ||
1552 			    connp->conn_allzones ||
1553 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1554 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1555 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1556 				break;
1557 		}
1558 
1559 		if (connp != NULL) {
1560 			/*
1561 			 * We have a fully-bound TCP connection.
1562 			 *
1563 			 * For labeled systems, there's no need to check the
1564 			 * label here.  It's known to be good as we checked
1565 			 * before allowing the connection to become bound.
1566 			 */
1567 			CONN_INC_REF(connp);
1568 			mutex_exit(&connfp->connf_lock);
1569 			return (connp);
1570 		}
1571 
1572 		mutex_exit(&connfp->connf_lock);
1573 		lport = up[1];
1574 		bind_connfp =
1575 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1576 		mutex_enter(&bind_connfp->connf_lock);
1577 		for (connp = bind_connfp->connf_head; connp != NULL;
1578 		    connp = connp->conn_next) {
1579 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1580 			    lport) &&
1581 			    (connp->conn_zoneid == zoneid ||
1582 			    connp->conn_allzones ||
1583 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1584 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1585 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1586 				break;
1587 		}
1588 
1589 		/*
1590 		 * If the matching connection is SLP on a private address, then
1591 		 * the label on the packet must match the local zone's label.
1592 		 * Otherwise, it must be in the label range defined by tnrh.
1593 		 * This is ensured by tsol_receive_local.
1594 		 *
1595 		 * Note that we don't check tsol_receive_local for
1596 		 * the connected case.
1597 		 */
1598 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1599 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1600 		    ira, connp)) {
1601 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1602 			    char *, "connp(1) could not receive mp(2)",
1603 			    conn_t *, connp, mblk_t *, mp);
1604 			connp = NULL;
1605 		}
1606 
1607 		if (connp != NULL) {
1608 			/* Have a listener at least */
1609 			CONN_INC_REF(connp);
1610 			mutex_exit(&bind_connfp->connf_lock);
1611 			return (connp);
1612 		}
1613 
1614 		mutex_exit(&bind_connfp->connf_lock);
1615 		break;
1616 
1617 	case IPPROTO_UDP:
1618 		lport = up[1];
1619 		fport = up[0];
1620 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1621 		mutex_enter(&connfp->connf_lock);
1622 		for (connp = connfp->connf_head; connp != NULL;
1623 		    connp = connp->conn_next) {
1624 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1625 			    fport, ipha->ipha_src) &&
1626 			    (connp->conn_zoneid == zoneid ||
1627 			    connp->conn_allzones ||
1628 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1629 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1630 				break;
1631 		}
1632 
1633 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1634 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1635 		    ira, connp)) {
1636 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1637 			    char *, "connp(1) could not receive mp(2)",
1638 			    conn_t *, connp, mblk_t *, mp);
1639 			connp = NULL;
1640 		}
1641 
1642 		if (connp != NULL) {
1643 			CONN_INC_REF(connp);
1644 			mutex_exit(&connfp->connf_lock);
1645 			return (connp);
1646 		}
1647 
1648 		/*
1649 		 * We shouldn't come here for multicast/broadcast packets
1650 		 */
1651 		mutex_exit(&connfp->connf_lock);
1652 
1653 		break;
1654 
1655 	case IPPROTO_ENCAP:
1656 	case IPPROTO_IPV6:
1657 		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1658 		    &ipha->ipha_dst, ipst));
1659 	}
1660 
1661 	return (NULL);
1662 }
1663 
1664 conn_t *
1665 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1666     ip_recv_attr_t *ira, ip_stack_t *ipst)
1667 {
1668 	ip6_t		*ip6h;
1669 	connf_t		*connfp, *bind_connfp;
1670 	uint16_t	lport;
1671 	uint16_t	fport;
1672 	tcpha_t		*tcpha;
1673 	uint32_t	ports;
1674 	conn_t		*connp;
1675 	uint16_t	*up;
1676 	zoneid_t	zoneid = ira->ira_zoneid;
1677 
1678 	ip6h = (ip6_t *)mp->b_rptr;
1679 
1680 	switch (protocol) {
1681 	case IPPROTO_TCP:
1682 		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1683 		up = &tcpha->tha_lport;
1684 		ports = *(uint32_t *)up;
1685 
1686 		connfp =
1687 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1688 		    ports, ipst)];
1689 		mutex_enter(&connfp->connf_lock);
1690 		for (connp = connfp->connf_head; connp != NULL;
1691 		    connp = connp->conn_next) {
1692 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1693 			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1694 			    (connp->conn_zoneid == zoneid ||
1695 			    connp->conn_allzones ||
1696 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1697 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1698 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1699 				break;
1700 		}
1701 
1702 		if (connp != NULL) {
1703 			/*
1704 			 * We have a fully-bound TCP connection.
1705 			 *
1706 			 * For labeled systems, there's no need to check the
1707 			 * label here.  It's known to be good as we checked
1708 			 * before allowing the connection to become bound.
1709 			 */
1710 			CONN_INC_REF(connp);
1711 			mutex_exit(&connfp->connf_lock);
1712 			return (connp);
1713 		}
1714 
1715 		mutex_exit(&connfp->connf_lock);
1716 
1717 		lport = up[1];
1718 		bind_connfp =
1719 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1720 		mutex_enter(&bind_connfp->connf_lock);
1721 		for (connp = bind_connfp->connf_head; connp != NULL;
1722 		    connp = connp->conn_next) {
1723 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1724 			    ip6h->ip6_dst, lport) &&
1725 			    (connp->conn_zoneid == zoneid ||
1726 			    connp->conn_allzones ||
1727 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1728 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1729 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1730 				break;
1731 		}
1732 
1733 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1734 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1735 		    ira, connp)) {
1736 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1737 			    char *, "connp(1) could not receive mp(2)",
1738 			    conn_t *, connp, mblk_t *, mp);
1739 			connp = NULL;
1740 		}
1741 
1742 		if (connp != NULL) {
1743 			/* Have a listner at least */
1744 			CONN_INC_REF(connp);
1745 			mutex_exit(&bind_connfp->connf_lock);
1746 			return (connp);
1747 		}
1748 
1749 		mutex_exit(&bind_connfp->connf_lock);
1750 		break;
1751 
1752 	case IPPROTO_UDP:
1753 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1754 		lport = up[1];
1755 		fport = up[0];
1756 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1757 		mutex_enter(&connfp->connf_lock);
1758 		for (connp = connfp->connf_head; connp != NULL;
1759 		    connp = connp->conn_next) {
1760 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1761 			    fport, ip6h->ip6_src) &&
1762 			    (connp->conn_zoneid == zoneid ||
1763 			    connp->conn_allzones ||
1764 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1765 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1766 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1767 				break;
1768 		}
1769 
1770 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1771 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1772 		    ira, connp)) {
1773 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1774 			    char *, "connp(1) could not receive mp(2)",
1775 			    conn_t *, connp, mblk_t *, mp);
1776 			connp = NULL;
1777 		}
1778 
1779 		if (connp != NULL) {
1780 			CONN_INC_REF(connp);
1781 			mutex_exit(&connfp->connf_lock);
1782 			return (connp);
1783 		}
1784 
1785 		/*
1786 		 * We shouldn't come here for multicast/broadcast packets
1787 		 */
1788 		mutex_exit(&connfp->connf_lock);
1789 		break;
1790 	case IPPROTO_ENCAP:
1791 	case IPPROTO_IPV6:
1792 		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1793 		    &ip6h->ip6_dst, ipst));
1794 	}
1795 
1796 	return (NULL);
1797 }
1798 
1799 /*
1800  * wrapper around ipcl_classify_(v4,v6) routines.
1801  */
1802 conn_t *
1803 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1804 {
1805 	if (ira->ira_flags & IRAF_IS_IPV4) {
1806 		return (ipcl_classify_v4(mp, ira->ira_protocol,
1807 		    ira->ira_ip_hdr_length, ira, ipst));
1808 	} else {
1809 		return (ipcl_classify_v6(mp, ira->ira_protocol,
1810 		    ira->ira_ip_hdr_length, ira, ipst));
1811 	}
1812 }
1813 
1814 /*
1815  * Only used to classify SCTP RAW sockets
1816  */
1817 conn_t *
1818 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1819     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1820 {
1821 	connf_t		*connfp;
1822 	conn_t		*connp;
1823 	in_port_t	lport;
1824 	int		ipversion;
1825 	const void	*dst;
1826 	zoneid_t	zoneid = ira->ira_zoneid;
1827 
1828 	lport = ((uint16_t *)&ports)[1];
1829 	if (ira->ira_flags & IRAF_IS_IPV4) {
1830 		dst = (const void *)&ipha->ipha_dst;
1831 		ipversion = IPV4_VERSION;
1832 	} else {
1833 		dst = (const void *)&ip6h->ip6_dst;
1834 		ipversion = IPV6_VERSION;
1835 	}
1836 
1837 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1838 	mutex_enter(&connfp->connf_lock);
1839 	for (connp = connfp->connf_head; connp != NULL;
1840 	    connp = connp->conn_next) {
1841 		/* We don't allow v4 fallback for v6 raw socket. */
1842 		if (ipversion != connp->conn_ipversion)
1843 			continue;
1844 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1845 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1846 			if (ipversion == IPV4_VERSION) {
1847 				if (!IPCL_CONN_MATCH(connp, protocol,
1848 				    ipha->ipha_src, ipha->ipha_dst, ports))
1849 					continue;
1850 			} else {
1851 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1852 				    ip6h->ip6_src, ip6h->ip6_dst, ports))
1853 					continue;
1854 			}
1855 		} else {
1856 			if (ipversion == IPV4_VERSION) {
1857 				if (!IPCL_BIND_MATCH(connp, protocol,
1858 				    ipha->ipha_dst, lport))
1859 					continue;
1860 			} else {
1861 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1862 				    ip6h->ip6_dst, lport))
1863 					continue;
1864 			}
1865 		}
1866 
1867 		if (connp->conn_zoneid == zoneid ||
1868 		    connp->conn_allzones ||
1869 		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1870 		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1871 		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1872 			break;
1873 	}
1874 
1875 	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1876 	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1877 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1878 		    char *, "connp(1) could not receive mp(2)",
1879 		    conn_t *, connp, mblk_t *, mp);
1880 		connp = NULL;
1881 	}
1882 
1883 	if (connp != NULL)
1884 		goto found;
1885 	mutex_exit(&connfp->connf_lock);
1886 
1887 	/* Try to look for a wildcard SCTP RAW socket match. */
1888 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1889 	mutex_enter(&connfp->connf_lock);
1890 	for (connp = connfp->connf_head; connp != NULL;
1891 	    connp = connp->conn_next) {
1892 		/* We don't allow v4 fallback for v6 raw socket. */
1893 		if (ipversion != connp->conn_ipversion)
1894 			continue;
1895 		if (!IPCL_ZONE_MATCH(connp, zoneid))
1896 			continue;
1897 
1898 		if (ipversion == IPV4_VERSION) {
1899 			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1900 				break;
1901 		} else {
1902 			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1903 				break;
1904 			}
1905 		}
1906 	}
1907 
1908 	if (connp != NULL)
1909 		goto found;
1910 
1911 	mutex_exit(&connfp->connf_lock);
1912 	return (NULL);
1913 
1914 found:
1915 	ASSERT(connp != NULL);
1916 	CONN_INC_REF(connp);
1917 	mutex_exit(&connfp->connf_lock);
1918 	return (connp);
1919 }
1920 
1921 /* ARGSUSED */
1922 static int
1923 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1924 {
1925 	itc_t	*itc = (itc_t *)buf;
1926 	conn_t 	*connp = &itc->itc_conn;
1927 	tcp_t	*tcp = (tcp_t *)&itc[1];
1928 
1929 	bzero(connp, sizeof (conn_t));
1930 	bzero(tcp, sizeof (tcp_t));
1931 
1932 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1933 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1934 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1935 	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1936 	if (tcp->tcp_timercache == NULL)
1937 		return (ENOMEM);
1938 	connp->conn_tcp = tcp;
1939 	connp->conn_flags = IPCL_TCPCONN;
1940 	connp->conn_proto = IPPROTO_TCP;
1941 	tcp->tcp_connp = connp;
1942 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1943 
1944 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1945 	if (connp->conn_ixa == NULL) {
1946 		tcp_timermp_free(tcp);
1947 		return (ENOMEM);
1948 	}
1949 	connp->conn_ixa->ixa_refcnt = 1;
1950 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
1951 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1952 	return (0);
1953 }
1954 
1955 /* ARGSUSED */
1956 static void
1957 tcp_conn_destructor(void *buf, void *cdrarg)
1958 {
1959 	itc_t	*itc = (itc_t *)buf;
1960 	conn_t 	*connp = &itc->itc_conn;
1961 	tcp_t	*tcp = (tcp_t *)&itc[1];
1962 
1963 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
1964 	ASSERT(tcp->tcp_connp == connp);
1965 	ASSERT(connp->conn_tcp == tcp);
1966 	tcp_timermp_free(tcp);
1967 	mutex_destroy(&connp->conn_lock);
1968 	cv_destroy(&connp->conn_cv);
1969 	cv_destroy(&connp->conn_sq_cv);
1970 	rw_destroy(&connp->conn_ilg_lock);
1971 
1972 	/* Can be NULL if constructor failed */
1973 	if (connp->conn_ixa != NULL) {
1974 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1975 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
1976 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
1977 		ixa_refrele(connp->conn_ixa);
1978 	}
1979 }
1980 
1981 /* ARGSUSED */
1982 static int
1983 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1984 {
1985 	itc_t	*itc = (itc_t *)buf;
1986 	conn_t 	*connp = &itc->itc_conn;
1987 
1988 	bzero(connp, sizeof (conn_t));
1989 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1990 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1991 	connp->conn_flags = IPCL_IPCCONN;
1992 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1993 
1994 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1995 	if (connp->conn_ixa == NULL)
1996 		return (ENOMEM);
1997 	connp->conn_ixa->ixa_refcnt = 1;
1998 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1999 	return (0);
2000 }
2001 
2002 /* ARGSUSED */
2003 static void
2004 ip_conn_destructor(void *buf, void *cdrarg)
2005 {
2006 	itc_t	*itc = (itc_t *)buf;
2007 	conn_t 	*connp = &itc->itc_conn;
2008 
2009 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2010 	ASSERT(connp->conn_priv == NULL);
2011 	mutex_destroy(&connp->conn_lock);
2012 	cv_destroy(&connp->conn_cv);
2013 	rw_destroy(&connp->conn_ilg_lock);
2014 
2015 	/* Can be NULL if constructor failed */
2016 	if (connp->conn_ixa != NULL) {
2017 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2018 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2019 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2020 		ixa_refrele(connp->conn_ixa);
2021 	}
2022 }
2023 
2024 /* ARGSUSED */
2025 static int
2026 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2027 {
2028 	itc_t	*itc = (itc_t *)buf;
2029 	conn_t 	*connp = &itc->itc_conn;
2030 	udp_t	*udp = (udp_t *)&itc[1];
2031 
2032 	bzero(connp, sizeof (conn_t));
2033 	bzero(udp, sizeof (udp_t));
2034 
2035 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2036 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2037 	connp->conn_udp = udp;
2038 	connp->conn_flags = IPCL_UDPCONN;
2039 	connp->conn_proto = IPPROTO_UDP;
2040 	udp->udp_connp = connp;
2041 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2042 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2043 	if (connp->conn_ixa == NULL)
2044 		return (ENOMEM);
2045 	connp->conn_ixa->ixa_refcnt = 1;
2046 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2047 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2048 	return (0);
2049 }
2050 
2051 /* ARGSUSED */
2052 static void
2053 udp_conn_destructor(void *buf, void *cdrarg)
2054 {
2055 	itc_t	*itc = (itc_t *)buf;
2056 	conn_t 	*connp = &itc->itc_conn;
2057 	udp_t	*udp = (udp_t *)&itc[1];
2058 
2059 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2060 	ASSERT(udp->udp_connp == connp);
2061 	ASSERT(connp->conn_udp == udp);
2062 	mutex_destroy(&connp->conn_lock);
2063 	cv_destroy(&connp->conn_cv);
2064 	rw_destroy(&connp->conn_ilg_lock);
2065 
2066 	/* Can be NULL if constructor failed */
2067 	if (connp->conn_ixa != NULL) {
2068 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2069 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2070 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2071 		ixa_refrele(connp->conn_ixa);
2072 	}
2073 }
2074 
2075 /* ARGSUSED */
2076 static int
2077 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2078 {
2079 	itc_t	*itc = (itc_t *)buf;
2080 	conn_t 	*connp = &itc->itc_conn;
2081 	icmp_t	*icmp = (icmp_t *)&itc[1];
2082 
2083 	bzero(connp, sizeof (conn_t));
2084 	bzero(icmp, sizeof (icmp_t));
2085 
2086 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2087 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2088 	connp->conn_icmp = icmp;
2089 	connp->conn_flags = IPCL_RAWIPCONN;
2090 	connp->conn_proto = IPPROTO_ICMP;
2091 	icmp->icmp_connp = connp;
2092 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2093 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2094 	if (connp->conn_ixa == NULL)
2095 		return (ENOMEM);
2096 	connp->conn_ixa->ixa_refcnt = 1;
2097 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2098 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2099 	return (0);
2100 }
2101 
2102 /* ARGSUSED */
2103 static void
2104 rawip_conn_destructor(void *buf, void *cdrarg)
2105 {
2106 	itc_t	*itc = (itc_t *)buf;
2107 	conn_t 	*connp = &itc->itc_conn;
2108 	icmp_t	*icmp = (icmp_t *)&itc[1];
2109 
2110 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2111 	ASSERT(icmp->icmp_connp == connp);
2112 	ASSERT(connp->conn_icmp == icmp);
2113 	mutex_destroy(&connp->conn_lock);
2114 	cv_destroy(&connp->conn_cv);
2115 	rw_destroy(&connp->conn_ilg_lock);
2116 
2117 	/* Can be NULL if constructor failed */
2118 	if (connp->conn_ixa != NULL) {
2119 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2120 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2121 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2122 		ixa_refrele(connp->conn_ixa);
2123 	}
2124 }
2125 
2126 /* ARGSUSED */
2127 static int
2128 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2129 {
2130 	itc_t	*itc = (itc_t *)buf;
2131 	conn_t 	*connp = &itc->itc_conn;
2132 	rts_t	*rts = (rts_t *)&itc[1];
2133 
2134 	bzero(connp, sizeof (conn_t));
2135 	bzero(rts, sizeof (rts_t));
2136 
2137 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2138 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2139 	connp->conn_rts = rts;
2140 	connp->conn_flags = IPCL_RTSCONN;
2141 	rts->rts_connp = connp;
2142 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2143 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2144 	if (connp->conn_ixa == NULL)
2145 		return (ENOMEM);
2146 	connp->conn_ixa->ixa_refcnt = 1;
2147 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2148 	return (0);
2149 }
2150 
2151 /* ARGSUSED */
2152 static void
2153 rts_conn_destructor(void *buf, void *cdrarg)
2154 {
2155 	itc_t	*itc = (itc_t *)buf;
2156 	conn_t 	*connp = &itc->itc_conn;
2157 	rts_t	*rts = (rts_t *)&itc[1];
2158 
2159 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2160 	ASSERT(rts->rts_connp == connp);
2161 	ASSERT(connp->conn_rts == rts);
2162 	mutex_destroy(&connp->conn_lock);
2163 	cv_destroy(&connp->conn_cv);
2164 	rw_destroy(&connp->conn_ilg_lock);
2165 
2166 	/* Can be NULL if constructor failed */
2167 	if (connp->conn_ixa != NULL) {
2168 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2169 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2170 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2171 		ixa_refrele(connp->conn_ixa);
2172 	}
2173 }
2174 
2175 /*
2176  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2177  * in the conn_t.
2178  *
2179  * Below we list all the pointers in the conn_t as a documentation aid.
2180  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2181  * If you add any pointers to the conn_t please add an ASSERT here
2182  * and #ifdef it out if it can't be actually asserted to be NULL.
2183  * In any case, we bzero most of the conn_t at the end of the function.
2184  */
2185 void
2186 ipcl_conn_cleanup(conn_t *connp)
2187 {
2188 	ip_xmit_attr_t	*ixa;
2189 
2190 	ASSERT(connp->conn_latch == NULL);
2191 	ASSERT(connp->conn_latch_in_policy == NULL);
2192 	ASSERT(connp->conn_latch_in_action == NULL);
2193 #ifdef notdef
2194 	ASSERT(connp->conn_rq == NULL);
2195 	ASSERT(connp->conn_wq == NULL);
2196 #endif
2197 	ASSERT(connp->conn_cred == NULL);
2198 	ASSERT(connp->conn_g_fanout == NULL);
2199 	ASSERT(connp->conn_g_next == NULL);
2200 	ASSERT(connp->conn_g_prev == NULL);
2201 	ASSERT(connp->conn_policy == NULL);
2202 	ASSERT(connp->conn_fanout == NULL);
2203 	ASSERT(connp->conn_next == NULL);
2204 	ASSERT(connp->conn_prev == NULL);
2205 	ASSERT(connp->conn_oper_pending_ill == NULL);
2206 	ASSERT(connp->conn_ilg == NULL);
2207 	ASSERT(connp->conn_drain_next == NULL);
2208 	ASSERT(connp->conn_drain_prev == NULL);
2209 #ifdef notdef
2210 	/* conn_idl is not cleared when removed from idl list */
2211 	ASSERT(connp->conn_idl == NULL);
2212 #endif
2213 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2214 #ifdef notdef
2215 	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2216 	ASSERT(connp->conn_netstack == NULL);
2217 #endif
2218 
2219 	ASSERT(connp->conn_helper_info == NULL);
2220 	ASSERT(connp->conn_ixa != NULL);
2221 	ixa = connp->conn_ixa;
2222 	ASSERT(ixa->ixa_refcnt == 1);
2223 	/* Need to preserve ixa_protocol */
2224 	ixa_cleanup(ixa);
2225 	ixa->ixa_flags = 0;
2226 
2227 	/* Clear out the conn_t fields that are not preserved */
2228 	bzero(&connp->conn_start_clr,
2229 	    sizeof (conn_t) -
2230 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2231 }
2232 
2233 /*
2234  * All conns are inserted in a global multi-list for the benefit of
2235  * walkers. The walk is guaranteed to walk all open conns at the time
2236  * of the start of the walk exactly once. This property is needed to
2237  * achieve some cleanups during unplumb of interfaces. This is achieved
2238  * as follows.
2239  *
2240  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2241  * call the insert and delete functions below at creation and deletion
2242  * time respectively. The conn never moves or changes its position in this
2243  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2244  * won't increase due to walkers, once the conn deletion has started. Note
2245  * that we can't remove the conn from the global list and then wait for
2246  * the refcnt to drop to zero, since walkers would then see a truncated
2247  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2248  * conns until ip_open is ready to make them globally visible.
2249  * The global round robin multi-list locks are held only to get the
2250  * next member/insertion/deletion and contention should be negligible
2251  * if the multi-list is much greater than the number of cpus.
2252  */
2253 void
2254 ipcl_globalhash_insert(conn_t *connp)
2255 {
2256 	int	index;
2257 	struct connf_s	*connfp;
2258 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2259 
2260 	/*
2261 	 * No need for atomic here. Approximate even distribution
2262 	 * in the global lists is sufficient.
2263 	 */
2264 	ipst->ips_conn_g_index++;
2265 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2266 
2267 	connp->conn_g_prev = NULL;
2268 	/*
2269 	 * Mark as INCIPIENT, so that walkers will ignore this
2270 	 * for now, till ip_open is ready to make it visible globally.
2271 	 */
2272 	connp->conn_state_flags |= CONN_INCIPIENT;
2273 
2274 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2275 	/* Insert at the head of the list */
2276 	mutex_enter(&connfp->connf_lock);
2277 	connp->conn_g_next = connfp->connf_head;
2278 	if (connp->conn_g_next != NULL)
2279 		connp->conn_g_next->conn_g_prev = connp;
2280 	connfp->connf_head = connp;
2281 
2282 	/* The fanout bucket this conn points to */
2283 	connp->conn_g_fanout = connfp;
2284 
2285 	mutex_exit(&connfp->connf_lock);
2286 }
2287 
2288 void
2289 ipcl_globalhash_remove(conn_t *connp)
2290 {
2291 	struct connf_s	*connfp;
2292 
2293 	/*
2294 	 * We were never inserted in the global multi list.
2295 	 * IPCL_NONE variety is never inserted in the global multilist
2296 	 * since it is presumed to not need any cleanup and is transient.
2297 	 */
2298 	if (connp->conn_g_fanout == NULL)
2299 		return;
2300 
2301 	connfp = connp->conn_g_fanout;
2302 	mutex_enter(&connfp->connf_lock);
2303 	if (connp->conn_g_prev != NULL)
2304 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2305 	else
2306 		connfp->connf_head = connp->conn_g_next;
2307 	if (connp->conn_g_next != NULL)
2308 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2309 	mutex_exit(&connfp->connf_lock);
2310 
2311 	/* Better to stumble on a null pointer than to corrupt memory */
2312 	connp->conn_g_next = NULL;
2313 	connp->conn_g_prev = NULL;
2314 	connp->conn_g_fanout = NULL;
2315 }
2316 
2317 /*
2318  * Walk the list of all conn_t's in the system, calling the function provided
2319  * With the specified argument for each.
2320  * Applies to both IPv4 and IPv6.
2321  *
2322  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2323  * conn_oper_pending_ill). To guard against stale pointers
2324  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2325  * unplumbed or removed. New conn_t's that are created while we are walking
2326  * may be missed by this walk, because they are not necessarily inserted
2327  * at the tail of the list. They are new conn_t's and thus don't have any
2328  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2329  * is created to the struct that is going away.
2330  */
2331 void
2332 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2333 {
2334 	int	i;
2335 	conn_t	*connp;
2336 	conn_t	*prev_connp;
2337 
2338 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2339 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2340 		prev_connp = NULL;
2341 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2342 		while (connp != NULL) {
2343 			mutex_enter(&connp->conn_lock);
2344 			if (connp->conn_state_flags &
2345 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2346 				mutex_exit(&connp->conn_lock);
2347 				connp = connp->conn_g_next;
2348 				continue;
2349 			}
2350 			CONN_INC_REF_LOCKED(connp);
2351 			mutex_exit(&connp->conn_lock);
2352 			mutex_exit(
2353 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2354 			(*func)(connp, arg);
2355 			if (prev_connp != NULL)
2356 				CONN_DEC_REF(prev_connp);
2357 			mutex_enter(
2358 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2359 			prev_connp = connp;
2360 			connp = connp->conn_g_next;
2361 		}
2362 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2363 		if (prev_connp != NULL)
2364 			CONN_DEC_REF(prev_connp);
2365 	}
2366 }
2367 
2368 /*
2369  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2370  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2371  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2372  * (peer tcp in ESTABLISHED state).
2373  */
2374 conn_t *
2375 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2376     ip_stack_t *ipst)
2377 {
2378 	uint32_t ports;
2379 	uint16_t *pports = (uint16_t *)&ports;
2380 	connf_t	*connfp;
2381 	conn_t	*tconnp;
2382 	boolean_t zone_chk;
2383 
2384 	/*
2385 	 * If either the source of destination address is loopback, then
2386 	 * both endpoints must be in the same Zone.  Otherwise, both of
2387 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2388 	 * state) and the endpoints may reside in different Zones.
2389 	 */
2390 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2391 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2392 
2393 	pports[0] = tcpha->tha_fport;
2394 	pports[1] = tcpha->tha_lport;
2395 
2396 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2397 	    ports, ipst)];
2398 
2399 	mutex_enter(&connfp->connf_lock);
2400 	for (tconnp = connfp->connf_head; tconnp != NULL;
2401 	    tconnp = tconnp->conn_next) {
2402 
2403 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2404 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2405 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2406 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2407 
2408 			ASSERT(tconnp != connp);
2409 			CONN_INC_REF(tconnp);
2410 			mutex_exit(&connfp->connf_lock);
2411 			return (tconnp);
2412 		}
2413 	}
2414 	mutex_exit(&connfp->connf_lock);
2415 	return (NULL);
2416 }
2417 
2418 /*
2419  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2420  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2421  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2422  * (peer tcp in ESTABLISHED state).
2423  */
2424 conn_t *
2425 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2426     ip_stack_t *ipst)
2427 {
2428 	uint32_t ports;
2429 	uint16_t *pports = (uint16_t *)&ports;
2430 	connf_t	*connfp;
2431 	conn_t	*tconnp;
2432 	boolean_t zone_chk;
2433 
2434 	/*
2435 	 * If either the source of destination address is loopback, then
2436 	 * both endpoints must be in the same Zone.  Otherwise, both of
2437 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2438 	 * state) and the endpoints may reside in different Zones.  We
2439 	 * don't do Zone check for link local address(es) because the
2440 	 * current Zone implementation treats each link local address as
2441 	 * being unique per system node, i.e. they belong to global Zone.
2442 	 */
2443 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2444 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2445 
2446 	pports[0] = tcpha->tha_fport;
2447 	pports[1] = tcpha->tha_lport;
2448 
2449 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2450 	    ports, ipst)];
2451 
2452 	mutex_enter(&connfp->connf_lock);
2453 	for (tconnp = connfp->connf_head; tconnp != NULL;
2454 	    tconnp = tconnp->conn_next) {
2455 
2456 		/* We skip conn_bound_if check here as this is loopback tcp */
2457 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2458 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2459 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2460 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2461 
2462 			ASSERT(tconnp != connp);
2463 			CONN_INC_REF(tconnp);
2464 			mutex_exit(&connfp->connf_lock);
2465 			return (tconnp);
2466 		}
2467 	}
2468 	mutex_exit(&connfp->connf_lock);
2469 	return (NULL);
2470 }
2471 
2472 /*
2473  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2474  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2475  * Only checks for connected entries i.e. no INADDR_ANY checks.
2476  */
2477 conn_t *
2478 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2479     ip_stack_t *ipst)
2480 {
2481 	uint32_t ports;
2482 	uint16_t *pports;
2483 	connf_t	*connfp;
2484 	conn_t	*tconnp;
2485 
2486 	pports = (uint16_t *)&ports;
2487 	pports[0] = tcpha->tha_fport;
2488 	pports[1] = tcpha->tha_lport;
2489 
2490 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2491 	    ports, ipst)];
2492 
2493 	mutex_enter(&connfp->connf_lock);
2494 	for (tconnp = connfp->connf_head; tconnp != NULL;
2495 	    tconnp = tconnp->conn_next) {
2496 
2497 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2498 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2499 		    tconnp->conn_tcp->tcp_state >= min_state) {
2500 
2501 			CONN_INC_REF(tconnp);
2502 			mutex_exit(&connfp->connf_lock);
2503 			return (tconnp);
2504 		}
2505 	}
2506 	mutex_exit(&connfp->connf_lock);
2507 	return (NULL);
2508 }
2509 
2510 /*
2511  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2512  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2513  * Only checks for connected entries i.e. no INADDR_ANY checks.
2514  * Match on ifindex in addition to addresses.
2515  */
2516 conn_t *
2517 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2518     uint_t ifindex, ip_stack_t *ipst)
2519 {
2520 	tcp_t	*tcp;
2521 	uint32_t ports;
2522 	uint16_t *pports;
2523 	connf_t	*connfp;
2524 	conn_t	*tconnp;
2525 
2526 	pports = (uint16_t *)&ports;
2527 	pports[0] = tcpha->tha_fport;
2528 	pports[1] = tcpha->tha_lport;
2529 
2530 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2531 	    ports, ipst)];
2532 
2533 	mutex_enter(&connfp->connf_lock);
2534 	for (tconnp = connfp->connf_head; tconnp != NULL;
2535 	    tconnp = tconnp->conn_next) {
2536 
2537 		tcp = tconnp->conn_tcp;
2538 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2539 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2540 		    tcp->tcp_state >= min_state &&
2541 		    (tconnp->conn_bound_if == 0 ||
2542 		    tconnp->conn_bound_if == ifindex)) {
2543 
2544 			CONN_INC_REF(tconnp);
2545 			mutex_exit(&connfp->connf_lock);
2546 			return (tconnp);
2547 		}
2548 	}
2549 	mutex_exit(&connfp->connf_lock);
2550 	return (NULL);
2551 }
2552 
2553 /*
2554  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2555  * a listener when changing state.
2556  */
2557 conn_t *
2558 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2559     ip_stack_t *ipst)
2560 {
2561 	connf_t		*bind_connfp;
2562 	conn_t		*connp;
2563 	tcp_t		*tcp;
2564 
2565 	/*
2566 	 * Avoid false matches for packets sent to an IP destination of
2567 	 * all zeros.
2568 	 */
2569 	if (laddr == 0)
2570 		return (NULL);
2571 
2572 	ASSERT(zoneid != ALL_ZONES);
2573 
2574 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2575 	mutex_enter(&bind_connfp->connf_lock);
2576 	for (connp = bind_connfp->connf_head; connp != NULL;
2577 	    connp = connp->conn_next) {
2578 		tcp = connp->conn_tcp;
2579 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2580 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2581 		    (tcp->tcp_listener == NULL)) {
2582 			CONN_INC_REF(connp);
2583 			mutex_exit(&bind_connfp->connf_lock);
2584 			return (connp);
2585 		}
2586 	}
2587 	mutex_exit(&bind_connfp->connf_lock);
2588 	return (NULL);
2589 }
2590 
2591 /*
2592  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2593  * a listener when changing state.
2594  */
2595 conn_t *
2596 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2597     zoneid_t zoneid, ip_stack_t *ipst)
2598 {
2599 	connf_t		*bind_connfp;
2600 	conn_t		*connp = NULL;
2601 	tcp_t		*tcp;
2602 
2603 	/*
2604 	 * Avoid false matches for packets sent to an IP destination of
2605 	 * all zeros.
2606 	 */
2607 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2608 		return (NULL);
2609 
2610 	ASSERT(zoneid != ALL_ZONES);
2611 
2612 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2613 	mutex_enter(&bind_connfp->connf_lock);
2614 	for (connp = bind_connfp->connf_head; connp != NULL;
2615 	    connp = connp->conn_next) {
2616 		tcp = connp->conn_tcp;
2617 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2618 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2619 		    (connp->conn_bound_if == 0 ||
2620 		    connp->conn_bound_if == ifindex) &&
2621 		    tcp->tcp_listener == NULL) {
2622 			CONN_INC_REF(connp);
2623 			mutex_exit(&bind_connfp->connf_lock);
2624 			return (connp);
2625 		}
2626 	}
2627 	mutex_exit(&bind_connfp->connf_lock);
2628 	return (NULL);
2629 }
2630 
2631 /*
2632  * ipcl_get_next_conn
2633  *	get the next entry in the conn global list
2634  *	and put a reference on the next_conn.
2635  *	decrement the reference on the current conn.
2636  *
2637  * This is an iterator based walker function that also provides for
2638  * some selection by the caller. It walks through the conn_hash bucket
2639  * searching for the next valid connp in the list, and selects connections
2640  * that are neither closed nor condemned. It also REFHOLDS the conn
2641  * thus ensuring that the conn exists when the caller uses the conn.
2642  */
2643 conn_t *
2644 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2645 {
2646 	conn_t	*next_connp;
2647 
2648 	if (connfp == NULL)
2649 		return (NULL);
2650 
2651 	mutex_enter(&connfp->connf_lock);
2652 
2653 	next_connp = (connp == NULL) ?
2654 	    connfp->connf_head : connp->conn_g_next;
2655 
2656 	while (next_connp != NULL) {
2657 		mutex_enter(&next_connp->conn_lock);
2658 		if (!(next_connp->conn_flags & conn_flags) ||
2659 		    (next_connp->conn_state_flags &
2660 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2661 			/*
2662 			 * This conn has been condemned or
2663 			 * is closing, or the flags don't match
2664 			 */
2665 			mutex_exit(&next_connp->conn_lock);
2666 			next_connp = next_connp->conn_g_next;
2667 			continue;
2668 		}
2669 		CONN_INC_REF_LOCKED(next_connp);
2670 		mutex_exit(&next_connp->conn_lock);
2671 		break;
2672 	}
2673 
2674 	mutex_exit(&connfp->connf_lock);
2675 
2676 	if (connp != NULL)
2677 		CONN_DEC_REF(connp);
2678 
2679 	return (next_connp);
2680 }
2681 
2682 #ifdef CONN_DEBUG
2683 /*
2684  * Trace of the last NBUF refhold/refrele
2685  */
2686 int
2687 conn_trace_ref(conn_t *connp)
2688 {
2689 	int	last;
2690 	conn_trace_t	*ctb;
2691 
2692 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2693 	last = connp->conn_trace_last;
2694 	last++;
2695 	if (last == CONN_TRACE_MAX)
2696 		last = 0;
2697 
2698 	ctb = &connp->conn_trace_buf[last];
2699 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2700 	connp->conn_trace_last = last;
2701 	return (1);
2702 }
2703 
2704 int
2705 conn_untrace_ref(conn_t *connp)
2706 {
2707 	int	last;
2708 	conn_trace_t	*ctb;
2709 
2710 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2711 	last = connp->conn_trace_last;
2712 	last++;
2713 	if (last == CONN_TRACE_MAX)
2714 		last = 0;
2715 
2716 	ctb = &connp->conn_trace_buf[last];
2717 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2718 	connp->conn_trace_last = last;
2719 	return (1);
2720 }
2721 #endif
2722