xref: /titanic_50/usr/src/uts/common/inet/ip/ipclassifier.c (revision bf604c6405d5cbc4e94e3d0ecc9e6e074ed4ea67)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP PACKET CLASSIFIER
28  *
29  * The IP packet classifier provides mapping between IP packets and persistent
30  * connection state for connection-oriented protocols. It also provides
31  * interface for managing connection states.
32  *
33  * The connection state is kept in conn_t data structure and contains, among
34  * other things:
35  *
36  *	o local/remote address and ports
37  *	o Transport protocol
38  *	o squeue for the connection (for TCP only)
39  *	o reference counter
40  *	o Connection state
41  *	o hash table linkage
42  *	o interface/ire information
43  *	o credentials
44  *	o ipsec policy
45  *	o send and receive functions.
46  *	o mutex lock.
47  *
48  * Connections use a reference counting scheme. They are freed when the
49  * reference counter drops to zero. A reference is incremented when connection
50  * is placed in a list or table, when incoming packet for the connection arrives
51  * and when connection is processed via squeue (squeue processing may be
52  * asynchronous and the reference protects the connection from being destroyed
53  * before its processing is finished).
54  *
55  * send and receive functions are currently used for TCP only. The send function
56  * determines the IP entry point for the packet once it leaves TCP to be sent to
57  * the destination address. The receive function is used by IP when the packet
58  * should be passed for TCP processing. When a new connection is created these
59  * are set to ip_output() and tcp_input() respectively. During the lifetime of
60  * the connection the send and receive functions may change depending on the
61  * changes in the connection state. For example, Once the connection is bound to
 * an address, the receive function for this connection is set to
63  * tcp_conn_request().  This allows incoming SYNs to go directly into the
64  * listener SYN processing function without going to tcp_input() first.
65  *
66  * Classifier uses several hash tables:
67  *
68  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
69  *	ipcl_bind_fanout:	contains all connections in BOUND state
70  *	ipcl_proto_fanout:	IPv4 protocol fanout
71  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
72  *	ipcl_udp_fanout:	contains all UDP connections
73  *	ipcl_globalhash_fanout:	contains all connections
74  *
75  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
76  * which need to view all existing connections.
77  *
78  * All tables are protected by per-bucket locks. When both per-bucket lock and
79  * connection lock need to be held, the per-bucket lock should be acquired
80  * first, followed by the connection lock.
81  *
82  * All functions doing search in one of these tables increment a reference
83  * counter on the connection found (if any). This reference should be dropped
84  * when the caller has finished processing the connection.
85  *
86  *
87  * INTERFACES:
88  * ===========
89  *
90  * Connection Lookup:
91  * ------------------
92  *
93  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
94  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
95  *
96  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
97  * it can't find any associated connection. If the connection is found, its
98  * reference counter is incremented.
99  *
100  *	mp:	mblock, containing packet header. The full header should fit
101  *		into a single mblock. It should also contain at least full IP
102  *		and TCP or UDP header.
103  *
104  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
105  *
106  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
107  *		 the packet.
108  *
109  * 	zoneid: The zone in which the returned connection must be; the zoneid
110  *		corresponding to the ire_zoneid on the IRE located for the
111  *		packet's destination address.
112  *
113  *	For TCP connections, the lookup order is as follows:
114  *		5-tuple {src, dst, protocol, local port, remote port}
115  *			lookup in ipcl_conn_fanout table.
116  *		3-tuple {dst, remote port, protocol} lookup in
117  *			ipcl_bind_fanout table.
118  *
119  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
120  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
122  *	to multiple UDP clients, which is handled in IP itself.
123  *
124  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125  * determine which actual zone gets the segment.  This is used only in a
126  * labeled environment.  The matching rules are:
127  *
128  *	- If it's not a multilevel port, then the label on the packet selects
129  *	  the zone.  Unlabeled packets are delivered to the global zone.
130  *
131  *	- If it's a multilevel port, then only the zone registered to receive
132  *	  packets on that port matches.
133  *
134  * Also, in a labeled environment, packet labels need to be checked.  For fully
135  * bound TCP connections, we can assume that the packet label was checked
136  * during connection establishment, and doesn't need to be checked on each
137  * packet.  For others, though, we need to check for strict equality or, for
138  * multilevel ports, membership in the range or set.  This part currently does
139  * a tnrh lookup on each packet, but could be optimized to use cached results
140  * if that were necessary.  (SCTP doesn't come through here, but if it did,
141  * we would apply the same rules as TCP.)
142  *
143  * An implication of the above is that fully-bound TCP sockets must always use
144  * distinct 4-tuples; they can't be discriminated by label alone.
145  *
146  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147  * as there's no connection set-up handshake and no shared state.
148  *
149  * Labels on looped-back packets within a single zone do not need to be
150  * checked, as all processes in the same zone have the same label.
151  *
152  * Finally, for unlabeled packets received by a labeled system, special rules
153  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
154  * socket in the zone whose label matches the default label of the sender, if
155  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156  * receiver's label must dominate the sender's default label.
157  *
158  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
159  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160  *					 ip_stack);
161  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164  *	ports are read from the IP and TCP header respectively.
165  *
166  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
167  *					 zoneid, ip_stack);
168  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169  *					 zoneid, ip_stack);
170  *
171  * 	Lookup routine to find a listener with the tuple {lport, laddr,
172  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173  * 	parameter interface index is also compared.
174  *
175  * void ipcl_walk(func, arg, ip_stack)
176  *
177  * 	Apply 'func' to every connection available. The 'func' is called as
178  *	(*func)(connp, arg). The walk is non-atomic so connections may be
179  *	created and destroyed during the walk. The CONN_CONDEMNED and
180  *	CONN_INCIPIENT flags ensure that connections which are newly created
181  *	or being destroyed are not selected by the walker.
182  *
183  * Table Updates
184  * -------------
185  *
186  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
187  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
188  *
189  *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments:
191  *		connp		conn_t to be inserted
192  *		protocol	connection protocol
193  *		src		source address
194  *		dst		destination address
195  *		ports		local and remote port
196  *		ifindex		interface index for IPv6 connections
197  *
198  *	Return value :
199  *		0		if connp was inserted
200  *		EADDRINUSE	if the connection with the same tuple
201  *				already exists.
202  *
203  * int ipcl_bind_insert(connp, protocol, src, lport);
204  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
205  *
206  * 	Insert 'connp' in ipcl_bind_fanout.
 * 	Arguments:
208  * 		connp		conn_t to be inserted
209  * 		protocol	connection protocol
210  * 		src		source address connection wants
211  * 				to bind to
212  * 		lport		local port connection wants to
213  * 				bind to
214  *
215  *
216  * void ipcl_hash_remove(connp);
217  *
218  * 	Removes the 'connp' from the connection fanout table.
219  *
220  * Connection Creation/Destruction
221  * -------------------------------
222  *
223  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
224  *
225  * 	Creates a new conn based on the type flag, inserts it into
226  * 	globalhash table.
227  *
228  *	type:	This flag determines the type of conn_t which needs to be
229  *		created i.e., which kmem_cache it comes from.
230  *		IPCL_TCPCONN	indicates a TCP connection
231  *		IPCL_SCTPCONN	indicates a SCTP connection
232  *		IPCL_UDPCONN	indicates a UDP conn_t.
233  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
234  *		IPCL_RTSCONN	indicates a RTS conn_t.
235  *		IPCL_IPCCONN	indicates all other connections.
236  *
237  * void ipcl_conn_destroy(connp)
238  *
239  * 	Destroys the connection state, removes it from the global
240  * 	connection hash table and frees its memory.
241  */
242 
243 #include <sys/types.h>
244 #include <sys/stream.h>
245 #include <sys/stropts.h>
246 #include <sys/sysmacros.h>
247 #include <sys/strsubr.h>
248 #include <sys/strsun.h>
249 #define	_SUN_TPI_VERSION 2
250 #include <sys/ddi.h>
251 #include <sys/cmn_err.h>
252 #include <sys/debug.h>
253 
254 #include <sys/systm.h>
255 #include <sys/param.h>
256 #include <sys/kmem.h>
257 #include <sys/isa_defs.h>
258 #include <inet/common.h>
259 #include <netinet/ip6.h>
260 #include <netinet/icmp6.h>
261 
262 #include <inet/ip.h>
263 #include <inet/ip6.h>
264 #include <inet/tcp.h>
265 #include <inet/ip_ndp.h>
266 #include <inet/udp_impl.h>
267 #include <inet/sctp_ip.h>
268 #include <inet/sctp/sctp_impl.h>
269 #include <inet/rawip_impl.h>
270 #include <inet/rts_impl.h>
271 
272 #include <sys/cpuvar.h>
273 
274 #include <inet/ipclassifier.h>
275 #include <inet/ipsec_impl.h>
276 
277 #include <sys/tsol/tnet.h>
278 
279 #ifdef DEBUG
280 #define	IPCL_DEBUG
281 #else
282 #undef	IPCL_DEBUG
283 #endif
284 
285 #ifdef	IPCL_DEBUG
286 int	ipcl_debug_level = 0;
287 #define	IPCL_DEBUG_LVL(level, args)	\
288 	if (ipcl_debug_level  & level) { printf args; }
289 #else
290 #define	IPCL_DEBUG_LVL(level, args) {; }
291 #endif
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;	/* bytes of memory per conn bucket */
uint_t ipcl_conn_hash_maxsize = 82500;	/* cap applied before prime rounding */

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;
306 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 * Index i holds the prime for N == i; the leading zeros (no usable
 * prime for small N) and the trailing zero act as sentinels for the
 * table-sizing loop in ipcl_init().
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
316 
317 /*
318  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
319  * are aligned on cache lines.
320  */
typedef union itc_s {
	conn_t	itc_conn;	/* the conn (transport state follows it) */
	char	itcu_filler[CACHE_ALIGN(conn_s)];	/* cache-line pad */
} itc_t;
325 
326 struct kmem_cache  *tcp_conn_cache;
327 struct kmem_cache  *ip_conn_cache;
328 extern struct kmem_cache  *sctp_conn_cache;
329 extern struct kmem_cache  *tcp_sack_info_cache;
330 extern struct kmem_cache  *tcp_iphc_cache;
331 struct kmem_cache  *udp_conn_cache;
332 struct kmem_cache  *rawip_conn_cache;
333 struct kmem_cache  *rts_conn_cache;
334 
335 extern void	tcp_timermp_free(tcp_t *);
336 extern mblk_t	*tcp_timermp_alloc(int);
337 
338 static int	ip_conn_constructor(void *, void *, int);
339 static void	ip_conn_destructor(void *, void *);
340 
341 static int	tcp_conn_constructor(void *, void *, int);
342 static void	tcp_conn_destructor(void *, void *);
343 
344 static int	udp_conn_constructor(void *, void *, int);
345 static void	udp_conn_destructor(void *, void *);
346 
347 static int	rawip_conn_constructor(void *, void *, int);
348 static void	rawip_conn_destructor(void *, void *);
349 
350 static int	rts_conn_constructor(void *, void *, int);
351 static void	rts_conn_destructor(void *, void *);
352 
353 #ifdef	IPCL_DEBUG
354 #define	INET_NTOA_BUFSIZE	18
355 
/*
 * Debug helper: format the IPv4 address 'in' as dotted decimal into the
 * caller-supplied buffer 'b' (at least INET_NTOA_BUFSIZE bytes) and
 * return 'b'.  Octets are emitted in memory (network) order.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char *octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);
	return (b);
}
365 #endif
366 
367 /*
368  * Global (for all stack instances) init routine
369  */
void
ipcl_g_init(void)
{
	/*
	 * Create one kmem cache per conn_t flavor.  The transport-bearing
	 * caches size their entries as an itc_t (for cache-line alignment,
	 * see the itc_t definition above) followed by the transport state
	 * (tcp_t, udp_t, icmp_t, rts_t); plain IP conns are a bare conn_t.
	 */
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    NULL, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}
398 
399 /*
 * ipclassifier initialization routine, sets up hash tables.
401  */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings.
	 * Preference order: new tunable (ipcl_conn_hash_size), legacy
	 * tunable (tcp_conn_hash_size), then autosize from free memory.
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* One bucket per ipcl_conn_hash_memfactor bytes of memory. */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the size up to the nearest prime from the P2Ps() table.
	 * Starting at index 9 enforces a minimum fanout of sizes[9] (383).
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	/*
	 * Allocate every fanout table and initialize each bucket's lock
	 * (per-bucket locking; see the file header for lock ordering).
	 */
	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Protocol fanouts are indexed directly by IP protocol number. */
	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Routing-socket clients hang off a single bucket. */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* The global hash holds every conn, for walkers (snmp, cluster). */
	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
499 
500 void
501 ipcl_g_destroy(void)
502 {
503 	kmem_cache_destroy(ip_conn_cache);
504 	kmem_cache_destroy(tcp_conn_cache);
505 	kmem_cache_destroy(udp_conn_cache);
506 	kmem_cache_destroy(rawip_conn_cache);
507 	kmem_cache_destroy(rts_conn_cache);
508 }
509 
510 /*
511  * All user-level and kernel use of the stack must be gone
512  * by now.
513  */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	/*
	 * Per-stack teardown, mirroring ipcl_init(): every bucket of every
	 * fanout must already be empty (asserted below), then its lock is
	 * destroyed and the table freed.
	 */
	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	/* Finally the single routing-socket client bucket. */
	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}
579 
580 /*
581  * conn creation routine. initialize the conn, sets the reference
582  * and inserts it in the global hash table.
583  */
584 conn_t *
585 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
586 {
587 	conn_t	*connp;
588 	sctp_stack_t *sctps;
589 	struct kmem_cache *conn_cache;
590 
591 	switch (type) {
592 	case IPCL_SCTPCONN:
593 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
594 			return (NULL);
595 		sctp_conn_init(connp);
596 		sctps = ns->netstack_sctp;
597 		SCTP_G_Q_REFHOLD(sctps);
598 		netstack_hold(ns);
599 		connp->conn_netstack = ns;
600 		return (connp);
601 
602 	case IPCL_TCPCONN:
603 		conn_cache = tcp_conn_cache;
604 		break;
605 
606 	case IPCL_UDPCONN:
607 		conn_cache = udp_conn_cache;
608 		break;
609 
610 	case IPCL_RAWIPCONN:
611 		conn_cache = rawip_conn_cache;
612 		break;
613 
614 	case IPCL_RTSCONN:
615 		conn_cache = rts_conn_cache;
616 		break;
617 
618 	case IPCL_IPCCONN:
619 		conn_cache = ip_conn_cache;
620 		break;
621 
622 	default:
623 		connp = NULL;
624 		ASSERT(0);
625 	}
626 
627 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
628 		return (NULL);
629 
630 	connp->conn_ref = 1;
631 	netstack_hold(ns);
632 	connp->conn_netstack = ns;
633 	ipcl_globalhash_insert(connp);
634 	return (connp);
635 }
636 
/*
 * Destroy a conn whose last reference has been dropped: release creds and
 * IPsec state, unhook it from the global hash and its netstack, then
 * return it (scrubbed) to the kmem cache it came from.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	/* Peer cred may alias conn_cred; only free it when distinct. */
	if (connp->conn_peercred != NULL &&
	    connp->conn_peercred != connp->conn_cred)
		crfree(connp->conn_peercred);
	connp->conn_peercred = NULL;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		if (tcps != NULL) {
			/*
			 * IPsec state must be released while the tcp stack
			 * reference (and thus 'ns') is still held.
			 */
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		/*
		 * tcp_free() tears down the tcp_t contents, not the tcp_t
		 * itself (it is co-allocated with the conn).  NOTE(review):
		 * reading tcp_timercache after tcp_free() assumes tcp_free()
		 * leaves that field intact — confirm against tcp.c.
		 */
		tcp_free(tcp);
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		/* Scrub cached SACK info / header buffers before reuse. */
		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		if (tcp->tcp_iphc != NULL) {
			/* Grown headers came from kmem_alloc, not the cache */
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		if (tcp->tcp_ordrel_mp != NULL) {
			freeb(tcp->tcp_ordrel_mp);
			tcp->tcp_ordrel_mp = NULL;
		}

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		/*
		 * Reset the conn/tcp to a pristine state for the cache,
		 * preserving only the timer mblk and the back-pointer.
		 */
		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}
	/* Non-TCP: release IPsec state against the conn's own netstack. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	/* SCTP conns are freed by the SCTP module (keeps its ns reference). */
	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}
	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
777 
778 /*
779  * Running in cluster mode - deregister listener information
780  */
781 
782 static void
783 ipcl_conn_unlisten(conn_t *connp)
784 {
785 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
786 	ASSERT(connp->conn_lport != 0);
787 
788 	if (cl_inet_unlisten != NULL) {
789 		sa_family_t	addr_family;
790 		uint8_t		*laddrp;
791 
792 		if (connp->conn_pkt_isv6) {
793 			addr_family = AF_INET6;
794 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
795 		} else {
796 			addr_family = AF_INET;
797 			laddrp = (uint8_t *)&connp->conn_bound_source;
798 		}
799 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
800 		    connp->conn_lport);
801 	}
802 	connp->conn_flags &= ~IPCL_CL_LISTENER;
803 }
804 
/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * Unlinks (connp) from whatever fanout bucket it currently sits on (no-op
 * if conn_fanout is NULL), under that bucket's connf_lock, and drops the
 * hash table's reference on the conn.  Cluster listeners are deregistered
 * on the way out.  The caller must not hold conn_lock (bucket lock is
 * acquired before conn lock per the ordering in the file header).
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
835 
/*
 * Remove 'connp' from whichever fanout table it is linked on; exported
 * function wrapper around the IPCL_HASH_REMOVE() macro.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
841 
842 /*
843  * The whole purpose of this function is allow removal of
844  * a conn_t from the connected hash for timewait reclaim.
845  * This is essentially a TW reclaim fastpath where timewait
846  * collector checks under fanout lock (so no one else can
847  * get access to the conn_t) that refcnt is 2 i.e. one for
848  * TCP and one for the classifier hash list. If ref count
849  * is indeed 2, we can just remove the conn under lock and
850  * avoid cleaning up the conn under squeue. This gives us
851  * improved performance.
852  */
853 void
854 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
855 {
856 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
857 	ASSERT(MUTEX_HELD(&connp->conn_lock));
858 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
859 
860 	if ((connp)->conn_next != NULL) {
861 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
862 	}
863 	if ((connp)->conn_prev != NULL) {
864 		(connp)->conn_prev->conn_next = (connp)->conn_next;
865 	} else {
866 		connfp->connf_head = (connp)->conn_next;
867 	}
868 	(connp)->conn_fanout = NULL;
869 	(connp)->conn_next = NULL;
870 	(connp)->conn_prev = NULL;
871 	(connp)->conn_flags |= IPCL_REMOVED;
872 	ASSERT((connp)->conn_ref == 2);
873 	(connp)->conn_ref--;
874 }
875 
/*
 * Insert (connp) at the head of bucket (connfp), which the caller has
 * already locked.  The conn must not currently be on any bucket.  Marks
 * it IPCL_CONNECTED (clearing IPCL_REMOVED) and takes the hash table's
 * reference.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
890 
/*
 * Remove (connp) from its current bucket (if any), then insert it into
 * (connfp) as a connected conn under the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
899 
/*
 * Insert (connp) into bind bucket (connfp), marking it IPCL_BOUND.
 * Conns bound to a specific address are kept ahead of "match any"
 * (INADDR_ANY) entries so lookups find the most specific match first:
 * the loop walks to the first wildcard entry and inserts in front of it.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
928 
/*
 * Insert connp into bucket connfp as a wildcard-bound conn.  A v4-mapped
 * wildcard is inserted just before the first unspecified (IPv6 wildcard)
 * entry of the same zone, so an IPv4 wildcard bind is found ahead of an
 * IPv6 one for that zone; otherwise connp is appended at the tail.  Clears
 * IPCL_REMOVED, sets IPCL_BOUND, and takes a reference for the table.
 * NOTE(review): the `prev = next->conn_prev' assignment inside the loop
 * appears redundant — after the walk, prev should already equal
 * next->conn_prev if the list invariant holds.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
960 
/*
 * Function form of the IPCL_HASH_INSERT_WILDCARD() macro: insert connp into
 * bucket connfp as a wildcard-bound conn.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
966 
/*
 * Insert connp into the IPv4 protocol fanout for `protocol', as a wildcard
 * entry.  Also records the protocol in conn_ulp.  MAC-exempt conns are only
 * permitted for AH and ESP (asserted).
 */
void
ipcl_proto_insert(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
983 
/*
 * IPv6 counterpart of ipcl_proto_insert(): insert connp into the IPv6
 * protocol fanout for `protocol' as a wildcard entry and record the
 * protocol in conn_ulp.  MAC-exempt conns are only permitted for AH and
 * ESP (asserted).
 */
void
ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the v6 protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
1000 
1001 /*
1002  * This function is used only for inserting SCTP raw socket now.
1003  * This may change later.
1004  *
1005  * Note that only one raw socket can be bound to a port.  The param
1006  * lport is in network byte order.
1007  */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * A conflict exists when another conn in the same zone and
		 * address family holds this port, and either side is bound
		 * to a wildcard source (unspecified or v4-mapped any) or
		 * both are bound to the same source address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * NOTE(review): the bucket lock is dropped above and re-taken by
	 * the insert macros below, so the conflict check and the insertion
	 * are not atomic.
	 *
	 * Choose the insertion class from how specifically the conn is
	 * bound: connected (remote set), bound (local set), else wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1050 
1051 /*
1052  * Check for a MAC exemption conflict on a labeled system.  Note that for
1053  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1054  * transport layer.  This check is for binding all other protocols.
1055  *
1056  * Returns true if there's a conflict.
1057  */
1058 static boolean_t
1059 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1060 {
1061 	connf_t	*connfp;
1062 	conn_t *tconn;
1063 
1064 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1065 	mutex_enter(&connfp->connf_lock);
1066 	for (tconn = connfp->connf_head; tconn != NULL;
1067 	    tconn = tconn->conn_next) {
1068 		/* We don't allow v4 fallback for v6 raw socket */
1069 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1070 			continue;
1071 		/* If neither is exempt, then there's no conflict */
1072 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1073 			continue;
1074 		/* If both are bound to different specific addrs, ok */
1075 		if (connp->conn_src != INADDR_ANY &&
1076 		    tconn->conn_src != INADDR_ANY &&
1077 		    connp->conn_src != tconn->conn_src)
1078 			continue;
1079 		/* These two conflict; fail */
1080 		break;
1081 	}
1082 	mutex_exit(&connfp->connf_lock);
1083 	return (tconn != NULL);
1084 }
1085 
1086 static boolean_t
1087 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1088 {
1089 	connf_t	*connfp;
1090 	conn_t *tconn;
1091 
1092 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1093 	mutex_enter(&connfp->connf_lock);
1094 	for (tconn = connfp->connf_head; tconn != NULL;
1095 	    tconn = tconn->conn_next) {
1096 		/* We don't allow v4 fallback for v6 raw socket */
1097 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1098 			continue;
1099 		/* If neither is exempt, then there's no conflict */
1100 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1101 			continue;
1102 		/* If both are bound to different addrs, ok */
1103 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1104 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1105 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1106 			continue;
1107 		/* These two conflict; fail */
1108 		break;
1109 	}
1110 	mutex_exit(&connfp->connf_lock);
1111 	return (tconn != NULL);
1112 }
1113 
1114 /*
1115  * (v4, v6) bind hash insertion routines
1116  */
/*
 * Insert connp into the appropriate IPv4 fanout for a bind to local address
 * `src' and port `lport' (network byte order) on transport `protocol'.
 * Records protocol, source (as v4-mapped) and port in the conn.  Returns 0
 * or an errno (EADDRINUSE on a labeled-system MAC conflict, or whatever
 * ipcl_sctp_hash_insert() returns for SCTP).
 */
int
ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
{
	connf_t	*connfp;
#ifdef	IPCL_DEBUG
	char	buf[INET_NTOA_BUFSIZE];
#endif
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));

	connp->conn_ulp = protocol;
	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		/*
		 * Portless protocols: reject MAC-exempt conflicts on labeled
		 * systems, then share the UDP insertion logic below.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/* Insertion class depends on how specific the bind is. */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the clustering hook, if registered, of the listen. */
		if (cl_inet_listen != NULL) {
			ASSERT(!connp->conn_pkt_isv6);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_source, lport);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1191 
/*
 * IPv6 counterpart of ipcl_bind_insert(): insert connp into the appropriate
 * v6 fanout for a bind to *src and port `lport' (network byte order) on
 * transport `protocol'.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    uint16_t lport)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	connp->conn_ulp = protocol;
	connp->conn_srcv6 = *src;
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		/* Portless protocols: labeled-system conflict check first. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion class depends on how specific the bind is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the clustering hook, if registered, of the listen. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			/* Pass the address in the packet's address family. */
			if (connp->conn_pkt_isv6) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_source_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_source;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
			    lport);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1272 
1273 /*
1274  * ipcl_conn_hash insertion routines.
1275  */
/*
 * Insert a (soon to be) fully-bound IPv4 conn into the appropriate fanout.
 * For TCP, refuses duplicate five-tuples with EADDRINUSE and handles an
 * XTI/TLI-style rebind by first removing the conn from its current fanout.
 * For SCTP, re-inserts via the raw fanout.  All other protocols (and UDP)
 * are inserted connected/bound/wildcard according to the addresses set.
 * Returns 0 or an errno.
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for a eager connection, i.e connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Refuse a duplicate five-tuple in this bucket. */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/* Local port is the low 16 bits of the ports word. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/* Insertion class depends on how specific the bind is. */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1381 
/*
 * IPv6 counterpart of ipcl_conn_insert().  For TCP, an existing conn with a
 * matching five-tuple only conflicts when its tcp_bound_if is unset or
 * matches `ifindex'.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Refuse a duplicate five-tuple on the same interface. */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		/* Local port is the low 16 bits of the ports word. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Portless protocols: labeled-system conflict check first. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion class depends on how specific the bind is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1461 
1462 /*
1463  * v4 packet classifying function. looks up the fanout table to
1464  * find the conn, the packet belongs to. returns the conn with
1465  * the reference held, null otherwise.
1466  *
1467  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1468  * Lookup" comment block are applied.  Labels are also checked as described
1469  * above.  If the packet is from the inside (looped back), and is from the same
1470  * zone, then label checks are omitted.
1471  */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* Source/destination ports immediately follow the IP header. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First look for a fully-bound (connected) TCP conn. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected conn; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* Resolve the zone (and MAC exemption) before the lookup. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Labeled systems: the packet must be receivable locally. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1658 
/*
 * IPv6 counterpart of ipcl_classify_v4(): look up the conn a v6 packet
 * belongs to and return it with a reference held, or NULL.  Same zone/label
 * handling as the v4 path, except that the cred may legitimately be null.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* First look for a fully-bound (connected) TCP conn. */
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected conn; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Labeled systems: the packet must be receivable locally. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* Resolve the zone (and MAC exemption) before the lookup. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Labeled systems: the packet must be receivable locally. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1854 
1855 /*
1856  * wrapper around ipcl_classify_(v4,v6) routines.
1857  */
1858 conn_t *
1859 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1860 {
1861 	uint16_t	hdr_len;
1862 	ipha_t		*ipha;
1863 	uint8_t		*nexthdrp;
1864 
1865 	if (MBLKL(mp) < sizeof (ipha_t))
1866 		return (NULL);
1867 
1868 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1869 	case IPV4_VERSION:
1870 		ipha = (ipha_t *)mp->b_rptr;
1871 		hdr_len = IPH_HDR_LENGTH(ipha);
1872 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1873 		    zoneid, ipst));
1874 	case IPV6_VERSION:
1875 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1876 		    &hdr_len, &nexthdrp))
1877 			return (NULL);
1878 
1879 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
1880 	}
1881 
1882 	return (NULL);
1883 }
1884 
/*
 * Classify an inbound packet destined for a raw socket (e.g. ICMP).
 * 'hdr' points to either an ipha_t or an ip6_t; the IP version field in
 * the header selects the address family.  'ports' carries the transport
 * ports where applicable; the local port is taken from its second
 * 16-bit half.  The fanout bucket for the local port is searched first,
 * then the wildcard (port 0) bucket.  On success the matching conn_t is
 * returned with a reference held; the caller must CONN_DEC_REF it.
 * Returns NULL if no conn matches or the label check fails.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		/* Remember whether the packet arrived unlabeled. */
		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	/* Determine the address family from the IP header version. */
	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully bound: match the complete address/port tuple. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Bound only: match local address and port. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * Accept a conn in the resolved zone, or a MAC-exempt conn
		 * in any zone when the packet was unlabeled.
		 */
		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Hand back a held reference, then drop the bucket lock. */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
2019 
2020 /* ARGSUSED */
2021 static int
2022 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2023 {
2024 	itc_t	*itc = (itc_t *)buf;
2025 	conn_t 	*connp = &itc->itc_conn;
2026 	tcp_t	*tcp = (tcp_t *)&itc[1];
2027 
2028 	bzero(connp, sizeof (conn_t));
2029 	bzero(tcp, sizeof (tcp_t));
2030 
2031 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2032 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2033 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2034 	connp->conn_tcp = tcp;
2035 	connp->conn_flags = IPCL_TCPCONN;
2036 	connp->conn_ulp = IPPROTO_TCP;
2037 	tcp->tcp_connp = connp;
2038 	return (0);
2039 }
2040 
2041 /* ARGSUSED */
2042 static void
2043 tcp_conn_destructor(void *buf, void *cdrarg)
2044 {
2045 	itc_t	*itc = (itc_t *)buf;
2046 	conn_t 	*connp = &itc->itc_conn;
2047 	tcp_t	*tcp = (tcp_t *)&itc[1];
2048 
2049 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2050 	ASSERT(tcp->tcp_connp == connp);
2051 	ASSERT(connp->conn_tcp == tcp);
2052 	tcp_timermp_free(tcp);
2053 	mutex_destroy(&connp->conn_lock);
2054 	cv_destroy(&connp->conn_cv);
2055 }
2056 
2057 /* ARGSUSED */
2058 static int
2059 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2060 {
2061 	itc_t	*itc = (itc_t *)buf;
2062 	conn_t 	*connp = &itc->itc_conn;
2063 
2064 	bzero(connp, sizeof (conn_t));
2065 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2066 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2067 	connp->conn_flags = IPCL_IPCCONN;
2068 
2069 	return (0);
2070 }
2071 
2072 /* ARGSUSED */
2073 static void
2074 ip_conn_destructor(void *buf, void *cdrarg)
2075 {
2076 	itc_t	*itc = (itc_t *)buf;
2077 	conn_t 	*connp = &itc->itc_conn;
2078 
2079 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2080 	ASSERT(connp->conn_priv == NULL);
2081 	mutex_destroy(&connp->conn_lock);
2082 	cv_destroy(&connp->conn_cv);
2083 }
2084 
2085 /* ARGSUSED */
2086 static int
2087 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2088 {
2089 	itc_t	*itc = (itc_t *)buf;
2090 	conn_t 	*connp = &itc->itc_conn;
2091 	udp_t	*udp = (udp_t *)&itc[1];
2092 
2093 	bzero(connp, sizeof (conn_t));
2094 	bzero(udp, sizeof (udp_t));
2095 
2096 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2097 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2098 	connp->conn_udp = udp;
2099 	connp->conn_flags = IPCL_UDPCONN;
2100 	connp->conn_ulp = IPPROTO_UDP;
2101 	udp->udp_connp = connp;
2102 	return (0);
2103 }
2104 
2105 /* ARGSUSED */
2106 static void
2107 udp_conn_destructor(void *buf, void *cdrarg)
2108 {
2109 	itc_t	*itc = (itc_t *)buf;
2110 	conn_t 	*connp = &itc->itc_conn;
2111 	udp_t	*udp = (udp_t *)&itc[1];
2112 
2113 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2114 	ASSERT(udp->udp_connp == connp);
2115 	ASSERT(connp->conn_udp == udp);
2116 	mutex_destroy(&connp->conn_lock);
2117 	cv_destroy(&connp->conn_cv);
2118 }
2119 
2120 /* ARGSUSED */
2121 static int
2122 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2123 {
2124 	itc_t	*itc = (itc_t *)buf;
2125 	conn_t 	*connp = &itc->itc_conn;
2126 	icmp_t	*icmp = (icmp_t *)&itc[1];
2127 
2128 	bzero(connp, sizeof (conn_t));
2129 	bzero(icmp, sizeof (icmp_t));
2130 
2131 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2132 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2133 	connp->conn_icmp = icmp;
2134 	connp->conn_flags = IPCL_RAWIPCONN;
2135 	connp->conn_ulp = IPPROTO_ICMP;
2136 	icmp->icmp_connp = connp;
2137 	return (0);
2138 }
2139 
2140 /* ARGSUSED */
2141 static void
2142 rawip_conn_destructor(void *buf, void *cdrarg)
2143 {
2144 	itc_t	*itc = (itc_t *)buf;
2145 	conn_t 	*connp = &itc->itc_conn;
2146 	icmp_t	*icmp = (icmp_t *)&itc[1];
2147 
2148 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2149 	ASSERT(icmp->icmp_connp == connp);
2150 	ASSERT(connp->conn_icmp == icmp);
2151 	mutex_destroy(&connp->conn_lock);
2152 	cv_destroy(&connp->conn_cv);
2153 }
2154 
2155 /* ARGSUSED */
2156 static int
2157 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2158 {
2159 	itc_t	*itc = (itc_t *)buf;
2160 	conn_t 	*connp = &itc->itc_conn;
2161 	rts_t	*rts = (rts_t *)&itc[1];
2162 
2163 	bzero(connp, sizeof (conn_t));
2164 	bzero(rts, sizeof (rts_t));
2165 
2166 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2167 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2168 	connp->conn_rts = rts;
2169 	connp->conn_flags = IPCL_RTSCONN;
2170 	rts->rts_connp = connp;
2171 	return (0);
2172 }
2173 
2174 /* ARGSUSED */
2175 static void
2176 rts_conn_destructor(void *buf, void *cdrarg)
2177 {
2178 	itc_t	*itc = (itc_t *)buf;
2179 	conn_t 	*connp = &itc->itc_conn;
2180 	rts_t	*rts = (rts_t *)&itc[1];
2181 
2182 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2183 	ASSERT(rts->rts_connp == connp);
2184 	ASSERT(connp->conn_rts == rts);
2185 	mutex_destroy(&connp->conn_lock);
2186 	cv_destroy(&connp->conn_cv);
2187 }
2188 
2189 /*
2190  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2191  * in the conn_t.
2192  *
2193  * Below we list all the pointers in the conn_t as a documentation aid.
2194  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2195  * If you add any pointers to the conn_t please add an ASSERT here
2196  * and #ifdef it out if it can't be actually asserted to be NULL.
2197  * In any case, we bzero most of the conn_t at the end of the function.
2198  */
void
ipcl_conn_cleanup(conn_t *connp)
{
	/* Cached IRE and IPsec latch must already have been released. */
	ASSERT(connp->conn_ire_cache == NULL);
	ASSERT(connp->conn_latch == NULL);
#ifdef notdef
	/* These are not cleared */
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	/* Global multi-list and protocol fanout linkage must be gone. */
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
#ifdef notdef
	/*
	 * The ill and ipif pointers are not cleared before the conn_t
	 * goes away since they do not hold a reference on the ill/ipif.
	 * We should replace these pointers with ifindex/ipaddr_t to
	 * make the code less complex.
	 */
	ASSERT(connp->conn_xmit_if_ill == NULL);
	ASSERT(connp->conn_nofailover_ill == NULL);
	ASSERT(connp->conn_outgoing_ill == NULL);
	ASSERT(connp->conn_incoming_ill == NULL);
	ASSERT(connp->conn_outgoing_pill == NULL);
	ASSERT(connp->conn_multicast_ipif == NULL);
	ASSERT(connp->conn_multicast_ill == NULL);
#endif
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	/* Must already be unlinked from any drain list. */
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
	ASSERT(connp->conn_peercred == NULL);
	ASSERT(connp->conn_netstack == NULL);

	/* Clear out the conn_t fields that are not preserved */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));

}
2250 
2251 /*
2252  * All conns are inserted in a global multi-list for the benefit of
2253  * walkers. The walk is guaranteed to walk all open conns at the time
2254  * of the start of the walk exactly once. This property is needed to
2255  * achieve some cleanups during unplumb of interfaces. This is achieved
2256  * as follows.
2257  *
2258  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2259  * call the insert and delete functions below at creation and deletion
2260  * time respectively. The conn never moves or changes its position in this
2261  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2262  * won't increase due to walkers, once the conn deletion has started. Note
2263  * that we can't remove the conn from the global list and then wait for
2264  * the refcnt to drop to zero, since walkers would then see a truncated
2265  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2266  * conns until ip_open is ready to make them globally visible.
2267  * The global round robin multi-list locks are held only to get the
2268  * next member/insertion/deletion and contention should be negligible
2269  * if the multi-list is much greater than the number of cpus.
2270  */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 * This must happen before the conn is linked into the list below.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/*
	 * Remember the fanout bucket this conn was inserted into, so that
	 * ipcl_globalhash_remove() can find the right list at deletion.
	 */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2305 
2306 void
2307 ipcl_globalhash_remove(conn_t *connp)
2308 {
2309 	struct connf_s	*connfp;
2310 
2311 	/*
2312 	 * We were never inserted in the global multi list.
2313 	 * IPCL_NONE variety is never inserted in the global multilist
2314 	 * since it is presumed to not need any cleanup and is transient.
2315 	 */
2316 	if (connp->conn_g_fanout == NULL)
2317 		return;
2318 
2319 	connfp = connp->conn_g_fanout;
2320 	mutex_enter(&connfp->connf_lock);
2321 	if (connp->conn_g_prev != NULL)
2322 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2323 	else
2324 		connfp->connf_head = connp->conn_g_next;
2325 	if (connp->conn_g_next != NULL)
2326 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2327 	mutex_exit(&connfp->connf_lock);
2328 
2329 	/* Better to stumble on a null pointer than to corrupt memory */
2330 	connp->conn_g_next = NULL;
2331 	connp->conn_g_prev = NULL;
2332 	connp->conn_g_fanout = NULL;
2333 }
2334 
2335 /*
2336  * Walk the list of all conn_t's in the system, calling the function provided
2337  * with the specified argument for each.
2338  * Applies to both IPv4 and IPv6.
2339  *
2340  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2341  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2342  * unplumbed or removed. New conn_t's that are created while we are walking
2343  * may be missed by this walk, because they are not necessarily inserted
2344  * at the tail of the list. They are new conn_t's and thus don't have any
2345  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2346  * is created to the struct that is going away.
2347  */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	/* Visit each bucket of the global multi-list in turn. */
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				/* Being torn down or not yet visible; skip. */
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp stays around while we
			 * drop the bucket lock to invoke the callback.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * The previous conn's hold kept our list position
			 * valid while we advanced from it; release it now
			 * that connp itself is held, and do so outside the
			 * bucket lock.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the hold on the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2384 
2385 /*
2386  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2387  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2388  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2389  * (peer tcp in ESTABLISHED state).
2390  */
2391 conn_t *
2392 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2393     ip_stack_t *ipst)
2394 {
2395 	uint32_t ports;
2396 	uint16_t *pports = (uint16_t *)&ports;
2397 	connf_t	*connfp;
2398 	conn_t	*tconnp;
2399 	boolean_t zone_chk;
2400 
2401 	/*
2402 	 * If either the source of destination address is loopback, then
2403 	 * both endpoints must be in the same Zone.  Otherwise, both of
2404 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2405 	 * state) and the endpoints may reside in different Zones.
2406 	 */
2407 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2408 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2409 
2410 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2411 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2412 
2413 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2414 	    ports, ipst)];
2415 
2416 	mutex_enter(&connfp->connf_lock);
2417 	for (tconnp = connfp->connf_head; tconnp != NULL;
2418 	    tconnp = tconnp->conn_next) {
2419 
2420 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2421 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2422 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2423 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2424 
2425 			ASSERT(tconnp != connp);
2426 			CONN_INC_REF(tconnp);
2427 			mutex_exit(&connfp->connf_lock);
2428 			return (tconnp);
2429 		}
2430 	}
2431 	mutex_exit(&connfp->connf_lock);
2432 	return (NULL);
2433 }
2434 
2435 /*
2436  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2437  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2438  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2439  * (peer tcp in ESTABLISHED state).
2440  */
2441 conn_t *
2442 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2443     ip_stack_t *ipst)
2444 {
2445 	uint32_t ports;
2446 	uint16_t *pports = (uint16_t *)&ports;
2447 	connf_t	*connfp;
2448 	conn_t	*tconnp;
2449 	boolean_t zone_chk;
2450 
2451 	/*
2452 	 * If either the source of destination address is loopback, then
2453 	 * both endpoints must be in the same Zone.  Otherwise, both of
2454 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2455 	 * state) and the endpoints may reside in different Zones.  We
2456 	 * don't do Zone check for link local address(es) because the
2457 	 * current Zone implementation treats each link local address as
2458 	 * being unique per system node, i.e. they belong to global Zone.
2459 	 */
2460 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2461 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2462 
2463 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2464 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2465 
2466 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2467 	    ports, ipst)];
2468 
2469 	mutex_enter(&connfp->connf_lock);
2470 	for (tconnp = connfp->connf_head; tconnp != NULL;
2471 	    tconnp = tconnp->conn_next) {
2472 
2473 		/* We skip tcp_bound_if check here as this is loopback tcp */
2474 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2475 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2476 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2477 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2478 
2479 			ASSERT(tconnp != connp);
2480 			CONN_INC_REF(tconnp);
2481 			mutex_exit(&connfp->connf_lock);
2482 			return (tconnp);
2483 		}
2484 	}
2485 	mutex_exit(&connfp->connf_lock);
2486 	return (NULL);
2487 }
2488 
2489 /*
2490  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2491  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2492  * Only checks for connected entries i.e. no INADDR_ANY checks.
2493  */
2494 conn_t *
2495 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2496     ip_stack_t *ipst)
2497 {
2498 	uint32_t ports;
2499 	uint16_t *pports;
2500 	connf_t	*connfp;
2501 	conn_t	*tconnp;
2502 
2503 	pports = (uint16_t *)&ports;
2504 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2505 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2506 
2507 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2508 	    ports, ipst)];
2509 
2510 	mutex_enter(&connfp->connf_lock);
2511 	for (tconnp = connfp->connf_head; tconnp != NULL;
2512 	    tconnp = tconnp->conn_next) {
2513 
2514 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2515 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2516 		    tconnp->conn_tcp->tcp_state >= min_state) {
2517 
2518 			CONN_INC_REF(tconnp);
2519 			mutex_exit(&connfp->connf_lock);
2520 			return (tconnp);
2521 		}
2522 	}
2523 	mutex_exit(&connfp->connf_lock);
2524 	return (NULL);
2525 }
2526 
2527 /*
2528  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2529  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2530  * Only checks for connected entries i.e. no INADDR_ANY checks.
2531  * Match on ifindex in addition to addresses.
2532  */
2533 conn_t *
2534 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2535     uint_t ifindex, ip_stack_t *ipst)
2536 {
2537 	tcp_t	*tcp;
2538 	uint32_t ports;
2539 	uint16_t *pports;
2540 	connf_t	*connfp;
2541 	conn_t	*tconnp;
2542 
2543 	pports = (uint16_t *)&ports;
2544 	pports[0] = tcpha->tha_fport;
2545 	pports[1] = tcpha->tha_lport;
2546 
2547 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2548 	    ports, ipst)];
2549 
2550 	mutex_enter(&connfp->connf_lock);
2551 	for (tconnp = connfp->connf_head; tconnp != NULL;
2552 	    tconnp = tconnp->conn_next) {
2553 
2554 		tcp = tconnp->conn_tcp;
2555 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2556 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2557 		    tcp->tcp_state >= min_state &&
2558 		    (tcp->tcp_bound_if == 0 ||
2559 		    tcp->tcp_bound_if == ifindex)) {
2560 
2561 			CONN_INC_REF(tconnp);
2562 			mutex_exit(&connfp->connf_lock);
2563 			return (tconnp);
2564 		}
2565 	}
2566 	mutex_exit(&connfp->connf_lock);
2567 	return (NULL);
2568 }
2569 
2570 /*
2571  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2572  * a listener when changing state.
2573  */
2574 conn_t *
2575 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2576     ip_stack_t *ipst)
2577 {
2578 	connf_t		*bind_connfp;
2579 	conn_t		*connp;
2580 	tcp_t		*tcp;
2581 
2582 	/*
2583 	 * Avoid false matches for packets sent to an IP destination of
2584 	 * all zeros.
2585 	 */
2586 	if (laddr == 0)
2587 		return (NULL);
2588 
2589 	ASSERT(zoneid != ALL_ZONES);
2590 
2591 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2592 	mutex_enter(&bind_connfp->connf_lock);
2593 	for (connp = bind_connfp->connf_head; connp != NULL;
2594 	    connp = connp->conn_next) {
2595 		tcp = connp->conn_tcp;
2596 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2597 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2598 		    (tcp->tcp_listener == NULL)) {
2599 			CONN_INC_REF(connp);
2600 			mutex_exit(&bind_connfp->connf_lock);
2601 			return (connp);
2602 		}
2603 	}
2604 	mutex_exit(&bind_connfp->connf_lock);
2605 	return (NULL);
2606 }
2607 
2608 /*
2609  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2610  * a listener when changing state.
2611  */
2612 conn_t *
2613 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2614     zoneid_t zoneid, ip_stack_t *ipst)
2615 {
2616 	connf_t		*bind_connfp;
2617 	conn_t		*connp = NULL;
2618 	tcp_t		*tcp;
2619 
2620 	/*
2621 	 * Avoid false matches for packets sent to an IP destination of
2622 	 * all zeros.
2623 	 */
2624 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2625 		return (NULL);
2626 
2627 	ASSERT(zoneid != ALL_ZONES);
2628 
2629 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2630 	mutex_enter(&bind_connfp->connf_lock);
2631 	for (connp = bind_connfp->connf_head; connp != NULL;
2632 	    connp = connp->conn_next) {
2633 		tcp = connp->conn_tcp;
2634 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2635 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2636 		    (tcp->tcp_bound_if == 0 ||
2637 		    tcp->tcp_bound_if == ifindex) &&
2638 		    tcp->tcp_listener == NULL) {
2639 			CONN_INC_REF(connp);
2640 			mutex_exit(&bind_connfp->connf_lock);
2641 			return (connp);
2642 		}
2643 	}
2644 	mutex_exit(&bind_connfp->connf_lock);
2645 	return (NULL);
2646 }
2647 
2648 /*
2649  * ipcl_get_next_conn
2650  *	get the next entry in the conn global list
2651  *	and put a reference on the next_conn.
2652  *	decrement the reference on the current conn.
2653  *
2654  * This is an iterator based walker function that also provides for
2655  * some selection by the caller. It walks through the conn_hash bucket
2656  * searching for the next valid connp in the list, and selects connections
2657  * that are neither closed nor condemned. It also REFHOLDS the conn
2658  * thus ensuring that the conn exists when the caller uses the conn.
2659  */
2660 conn_t *
2661 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2662 {
2663 	conn_t	*next_connp;
2664 
2665 	if (connfp == NULL)
2666 		return (NULL);
2667 
2668 	mutex_enter(&connfp->connf_lock);
2669 
2670 	next_connp = (connp == NULL) ?
2671 	    connfp->connf_head : connp->conn_g_next;
2672 
2673 	while (next_connp != NULL) {
2674 		mutex_enter(&next_connp->conn_lock);
2675 		if (!(next_connp->conn_flags & conn_flags) ||
2676 		    (next_connp->conn_state_flags &
2677 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2678 			/*
2679 			 * This conn has been condemned or
2680 			 * is closing, or the flags don't match
2681 			 */
2682 			mutex_exit(&next_connp->conn_lock);
2683 			next_connp = next_connp->conn_g_next;
2684 			continue;
2685 		}
2686 		CONN_INC_REF_LOCKED(next_connp);
2687 		mutex_exit(&next_connp->conn_lock);
2688 		break;
2689 	}
2690 
2691 	mutex_exit(&connfp->connf_lock);
2692 
2693 	if (connp != NULL)
2694 		CONN_DEC_REF(connp);
2695 
2696 	return (next_connp);
2697 }
2698 
2699 #ifdef CONN_DEBUG
2700 /*
2701  * Trace of the last NBUF refhold/refrele
2702  */
2703 int
2704 conn_trace_ref(conn_t *connp)
2705 {
2706 	int	last;
2707 	conn_trace_t	*ctb;
2708 
2709 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2710 	last = connp->conn_trace_last;
2711 	last++;
2712 	if (last == CONN_TRACE_MAX)
2713 		last = 0;
2714 
2715 	ctb = &connp->conn_trace_buf[last];
2716 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2717 	connp->conn_trace_last = last;
2718 	return (1);
2719 }
2720 
2721 int
2722 conn_untrace_ref(conn_t *connp)
2723 {
2724 	int	last;
2725 	conn_trace_t	*ctb;
2726 
2727 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2728 	last = connp->conn_trace_last;
2729 	last++;
2730 	if (last == CONN_TRACE_MAX)
2731 		last = 0;
2732 
2733 	ctb = &connp->conn_trace_buf[last];
2734 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2735 	connp->conn_trace_last = last;
2736 	return (1);
2737 }
2738 #endif
2739