xref: /titanic_41/usr/src/uts/common/inet/ip/ipclassifier.c (revision 160abee025ef30c34521b981edd40ffcaab560aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
29 
30 /*
31  * IP PACKET CLASSIFIER
32  *
33  * The IP packet classifier provides mapping between IP packets and persistent
34  * connection state for connection-oriented protocols. It also provides
35  * interface for managing connection states.
36  *
37  * The connection state is kept in conn_t data structure and contains, among
38  * other things:
39  *
40  *	o local/remote address and ports
41  *	o Transport protocol
42  *	o squeue for the connection (for TCP only)
43  *	o reference counter
44  *	o Connection state
45  *	o hash table linkage
46  *	o interface/ire information
47  *	o credentials
48  *	o ipsec policy
49  *	o send and receive functions.
50  *	o mutex lock.
51  *
52  * Connections use a reference counting scheme. They are freed when the
53  * reference counter drops to zero. A reference is incremented when connection
54  * is placed in a list or table, when incoming packet for the connection arrives
55  * and when connection is processed via squeue (squeue processing may be
56  * asynchronous and the reference protects the connection from being destroyed
57  * before its processing is finished).
58  *
59  * send and receive functions are currently used for TCP only. The send function
60  * determines the IP entry point for the packet once it leaves TCP to be sent to
61  * the destination address. The receive function is used by IP when the packet
62  * should be passed for TCP processing. When a new connection is created these
63  * are set to ip_output() and tcp_input() respectively. During the lifetime of
64  * the connection the send and receive functions may change depending on the
65  * changes in the connection state. For example, Once the connection is bound to
66  * an addresse, the receive function for this connection is set to
67  * tcp_conn_request().  This allows incoming SYNs to go directly into the
68  * listener SYN processing function without going to tcp_input() first.
69  *
70  * Classifier uses several hash tables:
71  *
72  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73  *	ipcl_bind_fanout:	contains all connections in BOUND state
74  *	ipcl_proto_fanout:	IPv4 protocol fanout
75  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76  *	ipcl_udp_fanout:	contains all UDP connections
77  *	ipcl_globalhash_fanout:	contains all connections
78  *
79  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80  * which need to view all existing connections.
81  *
82  * All tables are protected by per-bucket locks. When both per-bucket lock and
83  * connection lock need to be held, the per-bucket lock should be acquired
84  * first, followed by the connection lock.
85  *
86  * All functions doing search in one of these tables increment a reference
87  * counter on the connection found (if any). This reference should be dropped
88  * when the caller has finished processing the connection.
89  *
90  *
91  * INTERFACES:
92  * ===========
93  *
94  * Connection Lookup:
95  * ------------------
96  *
97  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
98  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
99  *
100  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101  * it can't find any associated connection. If the connection is found, its
102  * reference counter is incremented.
103  *
104  *	mp:	mblock, containing packet header. The full header should fit
105  *		into a single mblock. It should also contain at least full IP
106  *		and TCP or UDP header.
107  *
108  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109  *
110  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111  *		 the packet.
112  *
113  * 	zoneid: The zone in which the returned connection must be; the zoneid
114  *		corresponding to the ire_zoneid on the IRE located for the
115  *		packet's destination address.
116  *
117  *	For TCP connections, the lookup order is as follows:
118  *		5-tuple {src, dst, protocol, local port, remote port}
119  *			lookup in ipcl_conn_fanout table.
120  *		3-tuple {dst, remote port, protocol} lookup in
121  *			ipcl_bind_fanout table.
122  *
123  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
126  *	to multiple UDP clients, which is handled in IP itself.
127  *
128  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
129  * determine which actual zone gets the segment.  This is used only in a
130  * labeled environment.  The matching rules are:
131  *
132  *	- If it's not a multilevel port, then the label on the packet selects
133  *	  the zone.  Unlabeled packets are delivered to the global zone.
134  *
135  *	- If it's a multilevel port, then only the zone registered to receive
136  *	  packets on that port matches.
137  *
138  * Also, in a labeled environment, packet labels need to be checked.  For fully
139  * bound TCP connections, we can assume that the packet label was checked
140  * during connection establishment, and doesn't need to be checked on each
141  * packet.  For others, though, we need to check for strict equality or, for
142  * multilevel ports, membership in the range or set.  This part currently does
143  * a tnrh lookup on each packet, but could be optimized to use cached results
144  * if that were necessary.  (SCTP doesn't come through here, but if it did,
145  * we would apply the same rules as TCP.)
146  *
147  * An implication of the above is that fully-bound TCP sockets must always use
148  * distinct 4-tuples; they can't be discriminated by label alone.
149  *
150  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
151  * as there's no connection set-up handshake and no shared state.
152  *
153  * Labels on looped-back packets within a single zone do not need to be
154  * checked, as all processes in the same zone have the same label.
155  *
156  * Finally, for unlabeled packets received by a labeled system, special rules
157  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
158  * socket in the zone whose label matches the default label of the sender, if
159  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
160  * receiver's label must dominate the sender's default label.
161  *
162  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
163  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
164  *					 ip_stack);
165  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
168  *	ports are read from the IP and TCP header respectively.
169  *
170  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
171  *					 zoneid, ip_stack);
172  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
173  *					 zoneid, ip_stack);
174  *
175  * 	Lookup routine to find a listener with the tuple {lport, laddr,
176  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
177  * 	parameter interface index is also compared.
178  *
179  * void ipcl_walk(func, arg, ip_stack)
180  *
181  * 	Apply 'func' to every connection available. The 'func' is called as
182  *	(*func)(connp, arg). The walk is non-atomic so connections may be
183  *	created and destroyed during the walk. The CONN_CONDEMNED and
184  *	CONN_INCIPIENT flags ensure that connections which are newly created
185  *	or being destroyed are not selected by the walker.
186  *
187  * Table Updates
188  * -------------
189  *
190  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
191  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
192  *
193  *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
195  *		connp		conn_t to be inserted
196  *		protocol	connection protocol
197  *		src		source address
198  *		dst		destination address
199  *		ports		local and remote port
200  *		ifindex		interface index for IPv6 connections
201  *
202  *	Return value :
203  *		0		if connp was inserted
204  *		EADDRINUSE	if the connection with the same tuple
205  *				already exists.
206  *
207  * int ipcl_bind_insert(connp, protocol, src, lport);
208  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
209  *
210  * 	Insert 'connp' in ipcl_bind_fanout.
 * 	Arguments :
212  * 		connp		conn_t to be inserted
213  * 		protocol	connection protocol
214  * 		src		source address connection wants
215  * 				to bind to
216  * 		lport		local port connection wants to
217  * 				bind to
218  *
219  *
220  * void ipcl_hash_remove(connp);
221  *
222  * 	Removes the 'connp' from the connection fanout table.
223  *
224  * Connection Creation/Destruction
225  * -------------------------------
226  *
227  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
228  *
229  * 	Creates a new conn based on the type flag, inserts it into
230  * 	globalhash table.
231  *
232  *	type:	This flag determines the type of conn_t which needs to be
233  *		created i.e., which kmem_cache it comes from.
234  *		IPCL_TCPCONN	indicates a TCP connection
235  *		IPCL_SCTPCONN	indicates a SCTP connection
236  *		IPCL_UDPCONN	indicates a UDP conn_t.
237  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
238  *		IPCL_RTSCONN	indicates a RTS conn_t.
239  *		IPCL_IPCCONN	indicates all other connections.
240  *
241  * void ipcl_conn_destroy(connp)
242  *
243  * 	Destroys the connection state, removes it from the global
244  * 	connection hash table and frees its memory.
245  */
246 
247 #include <sys/types.h>
248 #include <sys/stream.h>
249 #include <sys/stropts.h>
250 #include <sys/sysmacros.h>
251 #include <sys/strsubr.h>
252 #include <sys/strsun.h>
253 #define	_SUN_TPI_VERSION 2
254 #include <sys/ddi.h>
255 #include <sys/cmn_err.h>
256 #include <sys/debug.h>
257 
258 #include <sys/systm.h>
259 #include <sys/param.h>
260 #include <sys/kmem.h>
261 #include <sys/isa_defs.h>
262 #include <inet/common.h>
263 #include <netinet/ip6.h>
264 #include <netinet/icmp6.h>
265 
266 #include <inet/ip.h>
267 #include <inet/ip6.h>
268 #include <inet/tcp.h>
269 #include <inet/ip_ndp.h>
270 #include <inet/udp_impl.h>
271 #include <inet/sctp_ip.h>
272 #include <inet/sctp/sctp_impl.h>
273 #include <inet/rawip_impl.h>
274 #include <inet/rts_impl.h>
275 
276 #include <sys/cpuvar.h>
277 
278 #include <inet/ipclassifier.h>
279 #include <inet/ipsec_impl.h>
280 
281 #include <sys/tsol/tnet.h>
282 
/* Classifier debug printfs are compiled in only on DEBUG kernels. */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

#ifdef	IPCL_DEBUG
/* Bitmask of enabled debug levels; zero disables all output. */
int	ipcl_debug_level = 0;
/*
 * NOTE(review): expansion is a bare if-statement, not do { } while (0),
 * so avoid using it as the body of an unbraced if/else.
 */
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level  & level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
296 /* Old value for compatibility. Setable in /etc/system */
297 uint_t tcp_conn_hash_size = 0;
298 
299 /* New value. Zero means choose automatically.  Setable in /etc/system */
300 uint_t ipcl_conn_hash_size = 0;
301 uint_t ipcl_conn_hash_memfactor = 8192;
302 uint_t ipcl_conn_hash_maxsize = 82500;
303 
304 /* bind/udp fanout table size */
305 uint_t ipcl_bind_fanout_size = 512;
306 uint_t ipcl_udp_fanout_size = 16384;
307 
308 /* Raw socket fanout size.  Must be a power of 2. */
309 uint_t ipcl_raw_fanout_size = 256;
310 
311 /*
312  * Power of 2^N Primes useful for hashing for N of 0-28,
313  * these primes are the nearest prime <= 2^N - 2^(N-2).
314  */
315 
/*
 * Consumed by ipcl_init() to round the conn fanout size up to a prime;
 * the trailing 0 marks the end of the table (out-of-range sentinel).
 */
#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
320 
321 /*
322  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
323  * are aligned on cache lines.
324  */
typedef union itc_s {
	conn_t	itc_conn;	/* the conn itself; transport state follows */
	/* pads the union so consecutive conns start on cache-line boundaries */
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
329 
/*
 * kmem caches for the various conn_t flavors; the non-extern ones are
 * created in ipcl_g_init() and destroyed in ipcl_g_destroy().
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
/* owned by other modules; referenced here when allocating/freeing conns */
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk alloc/free; implemented elsewhere (extern) */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);
341 
342 static int	ip_conn_constructor(void *, void *, int);
343 static void	ip_conn_destructor(void *, void *);
344 
345 static int	tcp_conn_constructor(void *, void *, int);
346 static void	tcp_conn_destructor(void *, void *);
347 
348 static int	udp_conn_constructor(void *, void *, int);
349 static void	udp_conn_destructor(void *, void *);
350 
351 static int	rawip_conn_constructor(void *, void *, int);
352 static void	rawip_conn_destructor(void *, void *);
353 
354 static int	rts_conn_constructor(void *, void *, int);
355 static void	rts_conn_destructor(void *, void *);
356 
357 #ifdef	IPCL_DEBUG
358 #define	INET_NTOA_BUFSIZE	18
359 
/*
 * Render the IPv4 address 'in' (bytes taken in memory order, i.e.
 * network order as stored) into caller-supplied buffer 'b' of at least
 * INET_NTOA_BUFSIZE bytes, in dotted-decimal form.  Returns 'b'.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char *octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);
	return (b);
}
369 #endif
370 
371 /*
372  * Global (for all stack instances) init routine
373  */
374 void
375 ipcl_g_init(void)
376 {
377 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
378 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
379 	    ip_conn_constructor, ip_conn_destructor,
380 	    NULL, NULL, NULL, 0);
381 
382 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
383 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
384 	    tcp_conn_constructor, tcp_conn_destructor,
385 	    NULL, NULL, NULL, 0);
386 
387 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
388 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
389 	    udp_conn_constructor, udp_conn_destructor,
390 	    NULL, NULL, NULL, 0);
391 
392 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
393 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
394 	    rawip_conn_constructor, rawip_conn_destructor,
395 	    NULL, NULL, NULL, 0);
396 
397 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
398 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
399 	    rts_conn_constructor, rts_conn_destructor,
400 	    NULL, NULL, NULL, 0);
401 }
402 
403 /*
404  * ipclassifier intialization routine, sets up hash tables.
405  */
406 void
407 ipcl_init(ip_stack_t *ipst)
408 {
409 	int i;
410 	int sizes[] = P2Ps();
411 
412 	/*
413 	 * Calculate size of conn fanout table from /etc/system settings
414 	 */
415 	if (ipcl_conn_hash_size != 0) {
416 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
417 	} else if (tcp_conn_hash_size != 0) {
418 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
419 	} else {
420 		extern pgcnt_t freemem;
421 
422 		ipst->ips_ipcl_conn_fanout_size =
423 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
424 
425 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
426 			ipst->ips_ipcl_conn_fanout_size =
427 			    ipcl_conn_hash_maxsize;
428 		}
429 	}
430 
431 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
432 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
433 			break;
434 		}
435 	}
436 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
437 		/* Out of range, use the 2^16 value */
438 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
439 	}
440 
441 	/* Take values from /etc/system */
442 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
443 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
444 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
445 
446 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
447 
448 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
449 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
450 
451 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
452 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
453 		    MUTEX_DEFAULT, NULL);
454 	}
455 
456 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
457 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
458 
459 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
460 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
461 		    MUTEX_DEFAULT, NULL);
462 	}
463 
464 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
465 	    sizeof (connf_t), KM_SLEEP);
466 	for (i = 0; i < IPPROTO_MAX; i++) {
467 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
468 		    MUTEX_DEFAULT, NULL);
469 	}
470 
471 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
472 	    sizeof (connf_t), KM_SLEEP);
473 	for (i = 0; i < IPPROTO_MAX; i++) {
474 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
475 		    MUTEX_DEFAULT, NULL);
476 	}
477 
478 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
479 	mutex_init(&ipst->ips_rts_clients->connf_lock,
480 	    NULL, MUTEX_DEFAULT, NULL);
481 
482 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
483 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
484 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
485 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
486 		    MUTEX_DEFAULT, NULL);
487 	}
488 
489 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
490 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
491 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
492 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
493 		    MUTEX_DEFAULT, NULL);
494 	}
495 
496 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
497 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
498 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
499 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
500 		    NULL, MUTEX_DEFAULT, NULL);
501 	}
502 }
503 
504 void
505 ipcl_g_destroy(void)
506 {
507 	kmem_cache_destroy(ip_conn_cache);
508 	kmem_cache_destroy(tcp_conn_cache);
509 	kmem_cache_destroy(udp_conn_cache);
510 	kmem_cache_destroy(rawip_conn_cache);
511 	kmem_cache_destroy(rts_conn_cache);
512 }
513 
514 /*
515  * All user-level and kernel use of the stack must be gone
516  * by now.
517  */
518 void
519 ipcl_destroy(ip_stack_t *ipst)
520 {
521 	int i;
522 
523 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
524 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
525 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
526 	}
527 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
528 	    sizeof (connf_t));
529 	ipst->ips_ipcl_conn_fanout = NULL;
530 
531 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
532 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
533 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
534 	}
535 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
536 	    sizeof (connf_t));
537 	ipst->ips_ipcl_bind_fanout = NULL;
538 
539 	for (i = 0; i < IPPROTO_MAX; i++) {
540 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
541 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
542 	}
543 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
544 	ipst->ips_ipcl_proto_fanout = NULL;
545 
546 	for (i = 0; i < IPPROTO_MAX; i++) {
547 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
548 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
549 	}
550 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
551 	    IPPROTO_MAX * sizeof (connf_t));
552 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
553 
554 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
555 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
556 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
557 	}
558 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
559 	    sizeof (connf_t));
560 	ipst->ips_ipcl_udp_fanout = NULL;
561 
562 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
563 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
564 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
565 	}
566 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
567 	    sizeof (connf_t));
568 	ipst->ips_ipcl_raw_fanout = NULL;
569 
570 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
571 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
572 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
573 	}
574 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
575 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
576 	ipst->ips_ipcl_globalhash_fanout = NULL;
577 
578 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
579 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
580 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
581 	ipst->ips_rts_clients = NULL;
582 }
583 
584 /*
 * conn creation routine. Initializes the conn, sets the reference count
586  * and inserts it in the global hash table.
587  */
588 conn_t *
589 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
590 {
591 	conn_t	*connp;
592 	sctp_stack_t *sctps;
593 	struct kmem_cache *conn_cache;
594 
595 	switch (type) {
596 	case IPCL_SCTPCONN:
597 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
598 			return (NULL);
599 		sctp_conn_init(connp);
600 		sctps = ns->netstack_sctp;
601 		SCTP_G_Q_REFHOLD(sctps);
602 		netstack_hold(ns);
603 		connp->conn_netstack = ns;
604 		return (connp);
605 
606 	case IPCL_TCPCONN:
607 		conn_cache = tcp_conn_cache;
608 		break;
609 
610 	case IPCL_UDPCONN:
611 		conn_cache = udp_conn_cache;
612 		break;
613 
614 	case IPCL_RAWIPCONN:
615 		conn_cache = rawip_conn_cache;
616 		break;
617 
618 	case IPCL_RTSCONN:
619 		conn_cache = rts_conn_cache;
620 		break;
621 
622 	case IPCL_IPCCONN:
623 		conn_cache = ip_conn_cache;
624 		break;
625 
626 	default:
627 		connp = NULL;
628 		ASSERT(0);
629 	}
630 
631 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
632 		return (NULL);
633 
634 	connp->conn_ref = 1;
635 	netstack_hold(ns);
636 	connp->conn_netstack = ns;
637 	ipcl_globalhash_insert(connp);
638 	return (connp);
639 }
640 
/*
 * Final teardown of a conn whose reference count has dropped to zero:
 * releases credentials, IPsec state and the netstack hold, then returns
 * the conn to its type-specific kmem cache (SCTP conns are handed to
 * sctp_free() instead).  Caller must not hold conn_lock.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	/* conn_peercred may alias conn_cred; only free it when distinct */
	if (connp->conn_peercred != NULL &&
	    connp->conn_peercred != connp->conn_cred)
		crfree(connp->conn_peercred);
	connp->conn_peercred = NULL;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		if (tcps != NULL) {
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		tcp_free(tcp);
		/*
		 * Save the timer mblk so it survives the bzero of the
		 * tcp_t below; it is reattached before the cache free.
		 */
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		if (tcp->tcp_iphc != NULL) {
			/*
			 * A grown header buffer was kmem_alloc'd; only
			 * the original-size buffer goes back to the cache.
			 */
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		/* reset the conn/tcp pair to its constructed (cached) state */
		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}
	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
764 
765 /*
766  * Running in cluster mode - deregister listener information
767  */
768 
769 static void
770 ipcl_conn_unlisten(conn_t *connp)
771 {
772 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
773 	ASSERT(connp->conn_lport != 0);
774 
775 	if (cl_inet_unlisten != NULL) {
776 		sa_family_t	addr_family;
777 		uint8_t		*laddrp;
778 
779 		if (connp->conn_pkt_isv6) {
780 			addr_family = AF_INET6;
781 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
782 		} else {
783 			addr_family = AF_INET;
784 			laddrp = (uint8_t *)&connp->conn_bound_source;
785 		}
786 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
787 		    connp->conn_lport);
788 	}
789 	connp->conn_flags &= ~IPCL_CL_LISTENER;
790 }
791 
792 /*
793  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
794  * which table the conn belonged to). So for debugging we can see which hash
795  * table this connection was in.
796  */
/*
 * Unlink (connp) from its fanout bucket, if any (no-op when conn_fanout
 * is NULL).  Caller must not hold conn_lock; the bucket's connf_lock is
 * taken and released here, and the hash table's reference on the conn
 * is dropped via CONN_DEC_REF.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
822 
/*
 * Exported wrapper around IPCL_HASH_REMOVE; see the macro above for
 * locking and reference-count behavior.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
828 
829 /*
830  * The whole purpose of this function is allow removal of
831  * a conn_t from the connected hash for timewait reclaim.
832  * This is essentially a TW reclaim fastpath where timewait
833  * collector checks under fanout lock (so no one else can
834  * get access to the conn_t) that refcnt is 2 i.e. one for
835  * TCP and one for the classifier hash list. If ref count
836  * is indeed 2, we can just remove the conn under lock and
837  * avoid cleaning up the conn under squeue. This gives us
838  * improved performance.
839  */
840 void
841 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
842 {
843 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
844 	ASSERT(MUTEX_HELD(&connp->conn_lock));
845 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
846 
847 	if ((connp)->conn_next != NULL) {
848 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
849 	}
850 	if ((connp)->conn_prev != NULL) {
851 		(connp)->conn_prev->conn_next = (connp)->conn_next;
852 	} else {
853 		connfp->connf_head = (connp)->conn_next;
854 	}
855 	(connp)->conn_fanout = NULL;
856 	(connp)->conn_next = NULL;
857 	(connp)->conn_prev = NULL;
858 	(connp)->conn_flags |= IPCL_REMOVED;
859 	ASSERT((connp)->conn_ref == 2);
860 	(connp)->conn_ref--;
861 }
862 
/*
 * Insert (connp) at the head of (connfp)'s bucket; caller already holds
 * connf_lock.  Marks the conn IPCL_CONNECTED and takes the hash table's
 * reference.  The conn must not currently be on any fanout (asserted).
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
877 
/*
 * Remove (connp) from whatever bucket it is on, then insert it into
 * (connfp) under that bucket's connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
886 
/*
 * Insert (connp) into the bind fanout bucket just before the first
 * wildcard-source entry, so that conns bound to a specific address are
 * found ahead of INADDR_ANY binds on lookup.  Marks the conn IPCL_BOUND
 * and takes the hash table's reference.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
915 
/*
 * Insert a wildcard-bound conn into bucket connfp and mark it
 * IPCL_BOUND.  Normally the conn is appended at the tail of the chain,
 * but an IPv4-mapped wildcard is spliced in just before a same-zone
 * IPv6 unspecified-address entry, so the v4-mapped wildcard is found
 * first by lookups.  Removes connp from any current fanout first;
 * acquires connf_lock and takes a conn reference.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
947 
/*
 * Function form of the IPCL_HASH_INSERT_WILDCARD() macro, for callers
 * that cannot use the macro directly.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
953 
954 void
955 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
956 {
957 	connf_t	*connfp;
958 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
959 
960 	ASSERT(connp != NULL);
961 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
962 	    protocol == IPPROTO_ESP);
963 
964 	connp->conn_ulp = protocol;
965 
966 	/* Insert it in the protocol hash */
967 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
968 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
969 }
970 
971 void
972 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
973 {
974 	connf_t	*connfp;
975 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
976 
977 	ASSERT(connp != NULL);
978 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
979 	    protocol == IPPROTO_ESP);
980 
981 	connp->conn_ulp = protocol;
982 
983 	/* Insert it in the Bind Hash */
984 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
985 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
986 }
987 
988 /*
989  * This function is used only for inserting SCTP raw socket now.
990  * This may change later.
991  *
992  * Note that only one raw socket can be bound to a port.  The param
993  * lport is in network byte order.
994  */
995 static int
996 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
997 {
998 	connf_t	*connfp;
999 	conn_t	*oconnp;
1000 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1001 
1002 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1003 
1004 	/* Check for existing raw socket already bound to the port. */
1005 	mutex_enter(&connfp->connf_lock);
1006 	for (oconnp = connfp->connf_head; oconnp != NULL;
1007 	    oconnp = oconnp->conn_next) {
1008 		if (oconnp->conn_lport == lport &&
1009 		    oconnp->conn_zoneid == connp->conn_zoneid &&
1010 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
1011 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
1012 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
1013 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
1014 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
1015 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
1016 		    &connp->conn_srcv6))) {
1017 			break;
1018 		}
1019 	}
1020 	mutex_exit(&connfp->connf_lock);
1021 	if (oconnp != NULL)
1022 		return (EADDRNOTAVAIL);
1023 
1024 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
1025 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
1026 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
1027 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
1028 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1029 		} else {
1030 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1031 		}
1032 	} else {
1033 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1034 	}
1035 	return (0);
1036 }
1037 
1038 /*
1039  * Check for a MAC exemption conflict on a labeled system.  Note that for
1040  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1041  * transport layer.  This check is for binding all other protocols.
1042  *
1043  * Returns true if there's a conflict.
1044  */
1045 static boolean_t
1046 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1047 {
1048 	connf_t	*connfp;
1049 	conn_t *tconn;
1050 
1051 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1052 	mutex_enter(&connfp->connf_lock);
1053 	for (tconn = connfp->connf_head; tconn != NULL;
1054 	    tconn = tconn->conn_next) {
1055 		/* We don't allow v4 fallback for v6 raw socket */
1056 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1057 			continue;
1058 		/* If neither is exempt, then there's no conflict */
1059 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1060 			continue;
1061 		/* If both are bound to different specific addrs, ok */
1062 		if (connp->conn_src != INADDR_ANY &&
1063 		    tconn->conn_src != INADDR_ANY &&
1064 		    connp->conn_src != tconn->conn_src)
1065 			continue;
1066 		/* These two conflict; fail */
1067 		break;
1068 	}
1069 	mutex_exit(&connfp->connf_lock);
1070 	return (tconn != NULL);
1071 }
1072 
1073 static boolean_t
1074 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1075 {
1076 	connf_t	*connfp;
1077 	conn_t *tconn;
1078 
1079 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1080 	mutex_enter(&connfp->connf_lock);
1081 	for (tconn = connfp->connf_head; tconn != NULL;
1082 	    tconn = tconn->conn_next) {
1083 		/* We don't allow v4 fallback for v6 raw socket */
1084 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1085 			continue;
1086 		/* If neither is exempt, then there's no conflict */
1087 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1088 			continue;
1089 		/* If both are bound to different addrs, ok */
1090 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1091 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1092 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1093 			continue;
1094 		/* These two conflict; fail */
1095 		break;
1096 	}
1097 	mutex_exit(&connfp->connf_lock);
1098 	return (tconn != NULL);
1099 }
1100 
1101 /*
1102  * (v4, v6) bind hash insertion routines
1103  */
/*
 * Insert a conn into the (v4) bind fanout for the given protocol and
 * local port.  Records the protocol, v4-mapped source address and lport
 * on the conn, then hashes it into the table appropriate for the
 * protocol: the UDP fanout, the TCP bind fanout, the SCTP raw fanout,
 * or the per-protocol fanout for everything else.  Returns 0 or an
 * errno value (EADDRINUSE on a MAC-exempt conflict).
 */
int
ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
{
	connf_t	*connfp;
#ifdef	IPCL_DEBUG
	char	buf[INET_NTOA_BUFSIZE];
#endif
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));

	connp->conn_ulp = protocol;
	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		/*
		 * Other protocols: on a labeled system, first make sure
		 * this bind does not create a MAC-exempt conflict.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/*
		 * Hash position depends on how much of the address tuple
		 * is specified: connected > bound > wildcard.
		 */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * If a cluster listen hook is registered, flag the conn
		 * as a cluster listener and notify the hook.
		 */
		if (cl_inet_listen != NULL) {
			ASSERT(!connp->conn_pkt_isv6);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_source, lport);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets use their own per-port raw fanout. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1178 
/*
 * Insert a conn into the (v6) bind fanout for the given protocol and
 * local port.  Records the protocol, source address and lport on the
 * conn, then hashes it into the UDP fanout, the TCP bind fanout, the
 * SCTP raw fanout, or the v6 per-protocol fanout as appropriate.
 * Returns 0 or an errno value (EADDRINUSE on a MAC-exempt conflict).
 */
int
ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    uint16_t lport)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	connp->conn_ulp = protocol;
	connp->conn_srcv6 = *src;
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		/*
		 * Other protocols: on a labeled system, first make sure
		 * this bind does not create a MAC-exempt conflict.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Hash position depends on how much of the address tuple
		 * is specified: connected > bound > wildcard.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * If a cluster listen hook is registered, flag the conn
		 * as a cluster listener and notify the hook, passing the
		 * bound address in the family the socket actually uses.
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_pkt_isv6) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_source_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_source;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
			    lport);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets use their own per-port raw fanout. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1259 
1260 /*
1261  * ipcl_conn_hash insertion routines.
1262  */
/*
 * Insert a fully-specified (v4) conn into the connection fanout.  For
 * TCP the conn goes into the connected-conn hash after a duplicate
 * check; for SCTP the raw-socket bind hash is reused; for UDP and raw
 * protocols the conn lands in the UDP or per-protocol fanout depending
 * on how much of the four-tuple is specified.  Returns 0 or an errno
 * value (EADDRINUSE on duplicate or MAC-exempt conflict).
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for a eager connection, i.e connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Refuse to insert a duplicate of an existing connection. */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/* Local port is the low 16 bits of the host-order ports. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* up[0]/up[1] are the two 16-bit halves of ports. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/*
		 * Hash position depends on how much of the address tuple
		 * is specified: connected > bound > wildcard.
		 */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1368 
/*
 * Insert a fully-specified (v6) conn into the connection fanout.  Like
 * ipcl_conn_insert(), but the TCP duplicate check also requires the
 * existing conn to be unbound to an interface (tcp_bound_if == 0) or
 * bound to the same ifindex before it counts as a conflict.  Returns 0
 * or an errno value.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/* Eager conns were already initialized at creation. */
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Refuse to insert a duplicate of an existing connection. */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may be bound already; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		/* Local port is the low 16 bits of the host-order ports. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Raw/other protocols: check MAC-exempt conflicts first. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* up[0]/up[1] are the two 16-bit halves of ports. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Hash position depends on how much of the address tuple
		 * is specified: connected > bound > wildcard.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1448 
1449 /*
1450  * v4 packet classifying function. looks up the fanout table to
1451  * find the conn, the packet belongs to. returns the conn with
1452  * the reference held, null otherwise.
1453  *
1454  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1455  * Lookup" comment block are applied.  Labels are also checked as described
1456  * above.  If the packet is from the inside (looped back), and is from the same
1457  * zone, then label checks are omitted.
1458  */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* The port pair immediately follows the IP header. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		/* First try for a fully-bound (connected) TCP conn. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected conn; fall back to a listener lookup. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* up[1] is the local (dest) port, up[0] the remote port. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Same label check as for TCP above. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1645 
/*
 * v6 packet classifying function: the IPv6 counterpart of
 * ipcl_classify_v4().  Returns the matching conn with a reference held,
 * or NULL.  Same ALL_ZONES and label-checking rules as the v4 routine.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Transport header starts after hdr_len bytes of IP headers. */
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		/* First try for a fully-bound (connected) TCP conn. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected conn; fall back to a listener lookup. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Label check; see ipcl_classify_v4() for details. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* up[1] is the local (dest) port, up[0] the remote port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Label check; see ipcl_classify_v4() for details. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1841 
1842 /*
1843  * wrapper around ipcl_classify_(v4,v6) routines.
1844  */
1845 conn_t *
1846 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1847 {
1848 	uint16_t	hdr_len;
1849 	ipha_t		*ipha;
1850 	uint8_t		*nexthdrp;
1851 
1852 	if (MBLKL(mp) < sizeof (ipha_t))
1853 		return (NULL);
1854 
1855 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1856 	case IPV4_VERSION:
1857 		ipha = (ipha_t *)mp->b_rptr;
1858 		hdr_len = IPH_HDR_LENGTH(ipha);
1859 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1860 		    zoneid, ipst));
1861 	case IPV6_VERSION:
1862 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1863 		    &hdr_len, &nexthdrp))
1864 			return (NULL);
1865 
1866 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
1867 	}
1868 
1869 	return (NULL);
1870 }
1871 
/*
 * Classify a packet for the raw-socket (SOCK_RAW, e.g. SCTP-over-raw)
 * fanout.  Two passes over ips_ipcl_raw_fanout: first the bucket hashed
 * on the local port, preferring fully-bound (connected) matches, then the
 * port-0 bucket for wildcard-bound endpoints.  On Trusted Extensions
 * systems the zone/label checks mirror ipcl_classify_v4/v6.  Returns a
 * conn_t with a reference held (caller must CONN_DEC_REF), or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	/* 'ports' packs both ports; the second uint16_t is the local port. */
	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	/* 'hdr' may be either an IPv4 or IPv6 header; pick dst accordingly. */
	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* First pass: the bucket hashed on the packet's local port. */
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Connected endpoint: match the full 4-tuple. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Bound-only endpoint: match local address/port. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* The matching bucket's lock is still held here. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
2006 
2007 /* ARGSUSED */
2008 static int
2009 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2010 {
2011 	itc_t	*itc = (itc_t *)buf;
2012 	conn_t 	*connp = &itc->itc_conn;
2013 	tcp_t	*tcp = (tcp_t *)&itc[1];
2014 
2015 	bzero(connp, sizeof (conn_t));
2016 	bzero(tcp, sizeof (tcp_t));
2017 
2018 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2019 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2020 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2021 	connp->conn_tcp = tcp;
2022 	connp->conn_flags = IPCL_TCPCONN;
2023 	connp->conn_ulp = IPPROTO_TCP;
2024 	tcp->tcp_connp = connp;
2025 	return (0);
2026 }
2027 
2028 /* ARGSUSED */
2029 static void
2030 tcp_conn_destructor(void *buf, void *cdrarg)
2031 {
2032 	itc_t	*itc = (itc_t *)buf;
2033 	conn_t 	*connp = &itc->itc_conn;
2034 	tcp_t	*tcp = (tcp_t *)&itc[1];
2035 
2036 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2037 	ASSERT(tcp->tcp_connp == connp);
2038 	ASSERT(connp->conn_tcp == tcp);
2039 	tcp_timermp_free(tcp);
2040 	mutex_destroy(&connp->conn_lock);
2041 	cv_destroy(&connp->conn_cv);
2042 }
2043 
2044 /* ARGSUSED */
2045 static int
2046 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2047 {
2048 	itc_t	*itc = (itc_t *)buf;
2049 	conn_t 	*connp = &itc->itc_conn;
2050 
2051 	bzero(connp, sizeof (conn_t));
2052 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2053 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2054 	connp->conn_flags = IPCL_IPCCONN;
2055 
2056 	return (0);
2057 }
2058 
2059 /* ARGSUSED */
2060 static void
2061 ip_conn_destructor(void *buf, void *cdrarg)
2062 {
2063 	itc_t	*itc = (itc_t *)buf;
2064 	conn_t 	*connp = &itc->itc_conn;
2065 
2066 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2067 	ASSERT(connp->conn_priv == NULL);
2068 	mutex_destroy(&connp->conn_lock);
2069 	cv_destroy(&connp->conn_cv);
2070 }
2071 
2072 /* ARGSUSED */
2073 static int
2074 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2075 {
2076 	itc_t	*itc = (itc_t *)buf;
2077 	conn_t 	*connp = &itc->itc_conn;
2078 	udp_t	*udp = (udp_t *)&itc[1];
2079 
2080 	bzero(connp, sizeof (conn_t));
2081 	bzero(udp, sizeof (udp_t));
2082 
2083 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2084 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2085 	connp->conn_udp = udp;
2086 	connp->conn_flags = IPCL_UDPCONN;
2087 	connp->conn_ulp = IPPROTO_UDP;
2088 	udp->udp_connp = connp;
2089 	return (0);
2090 }
2091 
2092 /* ARGSUSED */
2093 static void
2094 udp_conn_destructor(void *buf, void *cdrarg)
2095 {
2096 	itc_t	*itc = (itc_t *)buf;
2097 	conn_t 	*connp = &itc->itc_conn;
2098 	udp_t	*udp = (udp_t *)&itc[1];
2099 
2100 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2101 	ASSERT(udp->udp_connp == connp);
2102 	ASSERT(connp->conn_udp == udp);
2103 	mutex_destroy(&connp->conn_lock);
2104 	cv_destroy(&connp->conn_cv);
2105 }
2106 
2107 /* ARGSUSED */
2108 static int
2109 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2110 {
2111 	itc_t	*itc = (itc_t *)buf;
2112 	conn_t 	*connp = &itc->itc_conn;
2113 	icmp_t	*icmp = (icmp_t *)&itc[1];
2114 
2115 	bzero(connp, sizeof (conn_t));
2116 	bzero(icmp, sizeof (icmp_t));
2117 
2118 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2119 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2120 	connp->conn_icmp = icmp;
2121 	connp->conn_flags = IPCL_RAWIPCONN;
2122 	connp->conn_ulp = IPPROTO_ICMP;
2123 	icmp->icmp_connp = connp;
2124 	return (0);
2125 }
2126 
2127 /* ARGSUSED */
2128 static void
2129 rawip_conn_destructor(void *buf, void *cdrarg)
2130 {
2131 	itc_t	*itc = (itc_t *)buf;
2132 	conn_t 	*connp = &itc->itc_conn;
2133 	icmp_t	*icmp = (icmp_t *)&itc[1];
2134 
2135 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2136 	ASSERT(icmp->icmp_connp == connp);
2137 	ASSERT(connp->conn_icmp == icmp);
2138 	mutex_destroy(&connp->conn_lock);
2139 	cv_destroy(&connp->conn_cv);
2140 }
2141 
2142 /* ARGSUSED */
2143 static int
2144 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2145 {
2146 	itc_t	*itc = (itc_t *)buf;
2147 	conn_t 	*connp = &itc->itc_conn;
2148 	rts_t	*rts = (rts_t *)&itc[1];
2149 
2150 	bzero(connp, sizeof (conn_t));
2151 	bzero(rts, sizeof (rts_t));
2152 
2153 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2154 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2155 	connp->conn_rts = rts;
2156 	connp->conn_flags = IPCL_RTSCONN;
2157 	rts->rts_connp = connp;
2158 	return (0);
2159 }
2160 
2161 /* ARGSUSED */
2162 static void
2163 rts_conn_destructor(void *buf, void *cdrarg)
2164 {
2165 	itc_t	*itc = (itc_t *)buf;
2166 	conn_t 	*connp = &itc->itc_conn;
2167 	rts_t	*rts = (rts_t *)&itc[1];
2168 
2169 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2170 	ASSERT(rts->rts_connp == connp);
2171 	ASSERT(connp->conn_rts == rts);
2172 	mutex_destroy(&connp->conn_lock);
2173 	cv_destroy(&connp->conn_cv);
2174 }
2175 
2176 /*
2177  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2178  * in the conn_t.
2179  *
2180  * Below we list all the pointers in the conn_t as a documentation aid.
2181  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2182  * If you add any pointers to the conn_t please add an ASSERT here
2183  * and #ifdef it out if it can't be actually asserted to be NULL.
2184  * In any case, we bzero most of the conn_t at the end of the function.
2185  */
void
ipcl_conn_cleanup(conn_t *connp)
{
	/* Routing cache and IPsec latch must already be released. */
	ASSERT(connp->conn_ire_cache == NULL);
	ASSERT(connp->conn_latch == NULL);
#ifdef notdef
	/* These are not cleared */
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	/* Global multi-list linkage (see ipcl_globalhash_remove()). */
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	/* Classifier fanout linkage. */
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
#ifdef notdef
	/*
	 * The ill and ipif pointers are not cleared before the conn_t
	 * goes away since they do not hold a reference on the ill/ipif.
	 * We should replace these pointers with ifindex/ipaddr_t to
	 * make the code less complex.
	 */
	ASSERT(connp->conn_xmit_if_ill == NULL);
	ASSERT(connp->conn_nofailover_ill == NULL);
	ASSERT(connp->conn_outgoing_ill == NULL);
	ASSERT(connp->conn_incoming_ill == NULL);
	ASSERT(connp->conn_outgoing_pill == NULL);
	ASSERT(connp->conn_multicast_ipif == NULL);
	ASSERT(connp->conn_multicast_ill == NULL);
#endif
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	/* Flow-control drain list linkage. */
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
	ASSERT(connp->conn_peercred == NULL);
	ASSERT(connp->conn_netstack == NULL);

	/* Clear out the conn_t fields that are not preserved */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));

}
2237 
2238 /*
2239  * All conns are inserted in a global multi-list for the benefit of
2240  * walkers. The walk is guaranteed to walk all open conns at the time
2241  * of the start of the walk exactly once. This property is needed to
2242  * achieve some cleanups during unplumb of interfaces. This is achieved
2243  * as follows.
2244  *
2245  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2246  * call the insert and delete functions below at creation and deletion
2247  * time respectively. The conn never moves or changes its position in this
2248  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2249  * won't increase due to walkers, once the conn deletion has started. Note
2250  * that we can't remove the conn from the global list and then wait for
2251  * the refcnt to drop to zero, since walkers would then see a truncated
2252  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2253  * conns until ip_open is ready to make them globally visible.
2254  * The global round robin multi-list locks are held only to get the
2255  * next member/insertion/deletion and contention should be negligible
2256  * if the multi-list is much greater than the number of cpus.
2257  */
2258 void
2259 ipcl_globalhash_insert(conn_t *connp)
2260 {
2261 	int	index;
2262 	struct connf_s	*connfp;
2263 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2264 
2265 	/*
2266 	 * No need for atomic here. Approximate even distribution
2267 	 * in the global lists is sufficient.
2268 	 */
2269 	ipst->ips_conn_g_index++;
2270 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2271 
2272 	connp->conn_g_prev = NULL;
2273 	/*
2274 	 * Mark as INCIPIENT, so that walkers will ignore this
2275 	 * for now, till ip_open is ready to make it visible globally.
2276 	 */
2277 	connp->conn_state_flags |= CONN_INCIPIENT;
2278 
2279 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2280 	/* Insert at the head of the list */
2281 	mutex_enter(&connfp->connf_lock);
2282 	connp->conn_g_next = connfp->connf_head;
2283 	if (connp->conn_g_next != NULL)
2284 		connp->conn_g_next->conn_g_prev = connp;
2285 	connfp->connf_head = connp;
2286 
2287 	/* The fanout bucket this conn points to */
2288 	connp->conn_g_fanout = connfp;
2289 
2290 	mutex_exit(&connfp->connf_lock);
2291 }
2292 
2293 void
2294 ipcl_globalhash_remove(conn_t *connp)
2295 {
2296 	struct connf_s	*connfp;
2297 
2298 	/*
2299 	 * We were never inserted in the global multi list.
2300 	 * IPCL_NONE variety is never inserted in the global multilist
2301 	 * since it is presumed to not need any cleanup and is transient.
2302 	 */
2303 	if (connp->conn_g_fanout == NULL)
2304 		return;
2305 
2306 	connfp = connp->conn_g_fanout;
2307 	mutex_enter(&connfp->connf_lock);
2308 	if (connp->conn_g_prev != NULL)
2309 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2310 	else
2311 		connfp->connf_head = connp->conn_g_next;
2312 	if (connp->conn_g_next != NULL)
2313 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2314 	mutex_exit(&connfp->connf_lock);
2315 
2316 	/* Better to stumble on a null pointer than to corrupt memory */
2317 	connp->conn_g_next = NULL;
2318 	connp->conn_g_prev = NULL;
2319 	connp->conn_g_fanout = NULL;
2320 }
2321 
2322 /*
2323  * Walk the list of all conn_t's in the system, calling the function provided
2324  * with the specified argument for each.
2325  * Applies to both IPv4 and IPv6.
2326  *
2327  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2328  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2329  * unplumbed or removed. New conn_t's that are created while we are walking
2330  * may be missed by this walk, because they are not necessarily inserted
2331  * at the tail of the list. They are new conn_t's and thus don't have any
2332  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2333  * is created to the struct that is going away.
2334  */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	/* Visit every bucket of the global multi-list in turn. */
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				/* Going away or not yet visible; skip. */
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp can't be freed, then
			 * drop the bucket lock before calling func (which
			 * may block).  The hold keeps connp's conn_g_next
			 * safe to follow once the lock is retaken below.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * The hold on the previous conn is only released
			 * now; connp (still held) anchors our position in
			 * the list from here on.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2371 
2372 /*
2373  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2374  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2375  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2376  * (peer tcp in ESTABLISHED state).
2377  */
2378 conn_t *
2379 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2380     ip_stack_t *ipst)
2381 {
2382 	uint32_t ports;
2383 	uint16_t *pports = (uint16_t *)&ports;
2384 	connf_t	*connfp;
2385 	conn_t	*tconnp;
2386 	boolean_t zone_chk;
2387 
2388 	/*
2389 	 * If either the source of destination address is loopback, then
2390 	 * both endpoints must be in the same Zone.  Otherwise, both of
2391 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2392 	 * state) and the endpoints may reside in different Zones.
2393 	 */
2394 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2395 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2396 
2397 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2398 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2399 
2400 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2401 	    ports, ipst)];
2402 
2403 	mutex_enter(&connfp->connf_lock);
2404 	for (tconnp = connfp->connf_head; tconnp != NULL;
2405 	    tconnp = tconnp->conn_next) {
2406 
2407 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2408 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2409 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2410 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2411 
2412 			ASSERT(tconnp != connp);
2413 			CONN_INC_REF(tconnp);
2414 			mutex_exit(&connfp->connf_lock);
2415 			return (tconnp);
2416 		}
2417 	}
2418 	mutex_exit(&connfp->connf_lock);
2419 	return (NULL);
2420 }
2421 
2422 /*
2423  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2424  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2425  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2426  * (peer tcp in ESTABLISHED state).
2427  */
2428 conn_t *
2429 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2430     ip_stack_t *ipst)
2431 {
2432 	uint32_t ports;
2433 	uint16_t *pports = (uint16_t *)&ports;
2434 	connf_t	*connfp;
2435 	conn_t	*tconnp;
2436 	boolean_t zone_chk;
2437 
2438 	/*
2439 	 * If either the source of destination address is loopback, then
2440 	 * both endpoints must be in the same Zone.  Otherwise, both of
2441 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2442 	 * state) and the endpoints may reside in different Zones.  We
2443 	 * don't do Zone check for link local address(es) because the
2444 	 * current Zone implementation treats each link local address as
2445 	 * being unique per system node, i.e. they belong to global Zone.
2446 	 */
2447 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2448 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2449 
2450 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2451 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2452 
2453 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2454 	    ports, ipst)];
2455 
2456 	mutex_enter(&connfp->connf_lock);
2457 	for (tconnp = connfp->connf_head; tconnp != NULL;
2458 	    tconnp = tconnp->conn_next) {
2459 
2460 		/* We skip tcp_bound_if check here as this is loopback tcp */
2461 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2462 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2463 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2464 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2465 
2466 			ASSERT(tconnp != connp);
2467 			CONN_INC_REF(tconnp);
2468 			mutex_exit(&connfp->connf_lock);
2469 			return (tconnp);
2470 		}
2471 	}
2472 	mutex_exit(&connfp->connf_lock);
2473 	return (NULL);
2474 }
2475 
2476 /*
2477  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2478  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2479  * Only checks for connected entries i.e. no INADDR_ANY checks.
2480  */
2481 conn_t *
2482 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2483     ip_stack_t *ipst)
2484 {
2485 	uint32_t ports;
2486 	uint16_t *pports;
2487 	connf_t	*connfp;
2488 	conn_t	*tconnp;
2489 
2490 	pports = (uint16_t *)&ports;
2491 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2492 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2493 
2494 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2495 	    ports, ipst)];
2496 
2497 	mutex_enter(&connfp->connf_lock);
2498 	for (tconnp = connfp->connf_head; tconnp != NULL;
2499 	    tconnp = tconnp->conn_next) {
2500 
2501 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2502 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2503 		    tconnp->conn_tcp->tcp_state >= min_state) {
2504 
2505 			CONN_INC_REF(tconnp);
2506 			mutex_exit(&connfp->connf_lock);
2507 			return (tconnp);
2508 		}
2509 	}
2510 	mutex_exit(&connfp->connf_lock);
2511 	return (NULL);
2512 }
2513 
2514 /*
2515  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2516  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2517  * Only checks for connected entries i.e. no INADDR_ANY checks.
2518  * Match on ifindex in addition to addresses.
2519  */
2520 conn_t *
2521 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2522     uint_t ifindex, ip_stack_t *ipst)
2523 {
2524 	tcp_t	*tcp;
2525 	uint32_t ports;
2526 	uint16_t *pports;
2527 	connf_t	*connfp;
2528 	conn_t	*tconnp;
2529 
2530 	pports = (uint16_t *)&ports;
2531 	pports[0] = tcpha->tha_fport;
2532 	pports[1] = tcpha->tha_lport;
2533 
2534 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2535 	    ports, ipst)];
2536 
2537 	mutex_enter(&connfp->connf_lock);
2538 	for (tconnp = connfp->connf_head; tconnp != NULL;
2539 	    tconnp = tconnp->conn_next) {
2540 
2541 		tcp = tconnp->conn_tcp;
2542 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2543 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2544 		    tcp->tcp_state >= min_state &&
2545 		    (tcp->tcp_bound_if == 0 ||
2546 		    tcp->tcp_bound_if == ifindex)) {
2547 
2548 			CONN_INC_REF(tconnp);
2549 			mutex_exit(&connfp->connf_lock);
2550 			return (tconnp);
2551 		}
2552 	}
2553 	mutex_exit(&connfp->connf_lock);
2554 	return (NULL);
2555 }
2556 
2557 /*
2558  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2559  * a listener when changing state.
2560  */
2561 conn_t *
2562 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2563     ip_stack_t *ipst)
2564 {
2565 	connf_t		*bind_connfp;
2566 	conn_t		*connp;
2567 	tcp_t		*tcp;
2568 
2569 	/*
2570 	 * Avoid false matches for packets sent to an IP destination of
2571 	 * all zeros.
2572 	 */
2573 	if (laddr == 0)
2574 		return (NULL);
2575 
2576 	ASSERT(zoneid != ALL_ZONES);
2577 
2578 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2579 	mutex_enter(&bind_connfp->connf_lock);
2580 	for (connp = bind_connfp->connf_head; connp != NULL;
2581 	    connp = connp->conn_next) {
2582 		tcp = connp->conn_tcp;
2583 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2584 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2585 		    (tcp->tcp_listener == NULL)) {
2586 			CONN_INC_REF(connp);
2587 			mutex_exit(&bind_connfp->connf_lock);
2588 			return (connp);
2589 		}
2590 	}
2591 	mutex_exit(&bind_connfp->connf_lock);
2592 	return (NULL);
2593 }
2594 
2595 /*
2596  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2597  * a listener when changing state.
2598  */
2599 conn_t *
2600 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2601     zoneid_t zoneid, ip_stack_t *ipst)
2602 {
2603 	connf_t		*bind_connfp;
2604 	conn_t		*connp = NULL;
2605 	tcp_t		*tcp;
2606 
2607 	/*
2608 	 * Avoid false matches for packets sent to an IP destination of
2609 	 * all zeros.
2610 	 */
2611 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2612 		return (NULL);
2613 
2614 	ASSERT(zoneid != ALL_ZONES);
2615 
2616 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2617 	mutex_enter(&bind_connfp->connf_lock);
2618 	for (connp = bind_connfp->connf_head; connp != NULL;
2619 	    connp = connp->conn_next) {
2620 		tcp = connp->conn_tcp;
2621 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2622 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2623 		    (tcp->tcp_bound_if == 0 ||
2624 		    tcp->tcp_bound_if == ifindex) &&
2625 		    tcp->tcp_listener == NULL) {
2626 			CONN_INC_REF(connp);
2627 			mutex_exit(&bind_connfp->connf_lock);
2628 			return (connp);
2629 		}
2630 	}
2631 	mutex_exit(&bind_connfp->connf_lock);
2632 	return (NULL);
2633 }
2634 
2635 /*
2636  * ipcl_get_next_conn
2637  *	get the next entry in the conn global list
2638  *	and put a reference on the next_conn.
2639  *	decrement the reference on the current conn.
2640  *
2641  * This is an iterator based walker function that also provides for
2642  * some selection by the caller. It walks through the conn_hash bucket
2643  * searching for the next valid connp in the list, and selects connections
2644  * that are neither closed nor condemned. It also REFHOLDS the conn
2645  * thus ensuring that the conn exists when the caller uses the conn.
2646  */
2647 conn_t *
2648 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2649 {
2650 	conn_t	*next_connp;
2651 
2652 	if (connfp == NULL)
2653 		return (NULL);
2654 
2655 	mutex_enter(&connfp->connf_lock);
2656 
2657 	next_connp = (connp == NULL) ?
2658 	    connfp->connf_head : connp->conn_g_next;
2659 
2660 	while (next_connp != NULL) {
2661 		mutex_enter(&next_connp->conn_lock);
2662 		if (!(next_connp->conn_flags & conn_flags) ||
2663 		    (next_connp->conn_state_flags &
2664 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2665 			/*
2666 			 * This conn has been condemned or
2667 			 * is closing, or the flags don't match
2668 			 */
2669 			mutex_exit(&next_connp->conn_lock);
2670 			next_connp = next_connp->conn_g_next;
2671 			continue;
2672 		}
2673 		CONN_INC_REF_LOCKED(next_connp);
2674 		mutex_exit(&next_connp->conn_lock);
2675 		break;
2676 	}
2677 
2678 	mutex_exit(&connfp->connf_lock);
2679 
2680 	if (connp != NULL)
2681 		CONN_DEC_REF(connp);
2682 
2683 	return (next_connp);
2684 }
2685 
2686 #ifdef CONN_DEBUG
2687 /*
2688  * Trace of the last NBUF refhold/refrele
2689  */
2690 int
2691 conn_trace_ref(conn_t *connp)
2692 {
2693 	int	last;
2694 	conn_trace_t	*ctb;
2695 
2696 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2697 	last = connp->conn_trace_last;
2698 	last++;
2699 	if (last == CONN_TRACE_MAX)
2700 		last = 0;
2701 
2702 	ctb = &connp->conn_trace_buf[last];
2703 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2704 	connp->conn_trace_last = last;
2705 	return (1);
2706 }
2707 
2708 int
2709 conn_untrace_ref(conn_t *connp)
2710 {
2711 	int	last;
2712 	conn_trace_t	*ctb;
2713 
2714 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2715 	last = connp->conn_trace_last;
2716 	last++;
2717 	if (last == CONN_TRACE_MAX)
2718 		last = 0;
2719 
2720 	ctb = &connp->conn_trace_buf[last];
2721 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2722 	connp->conn_trace_last = last;
2723 	return (1);
2724 }
2725 #endif
2726