xref: /titanic_51/usr/src/uts/common/inet/ip/ipclassifier.c (revision 72bdce51192b13a20009855f749004480874291b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
29 
30 /*
31  * IP PACKET CLASSIFIER
32  *
33  * The IP packet classifier provides mapping between IP packets and persistent
34  * connection state for connection-oriented protocols. It also provides
35  * interface for managing connection states.
36  *
37  * The connection state is kept in conn_t data structure and contains, among
38  * other things:
39  *
40  *	o local/remote address and ports
41  *	o Transport protocol
42  *	o squeue for the connection (for TCP only)
43  *	o reference counter
44  *	o Connection state
45  *	o hash table linkage
46  *	o interface/ire information
47  *	o credentials
48  *	o ipsec policy
49  *	o send and receive functions.
50  *	o mutex lock.
51  *
52  * Connections use a reference counting scheme. They are freed when the
53  * reference counter drops to zero. A reference is incremented when connection
54  * is placed in a list or table, when incoming packet for the connection arrives
55  * and when connection is processed via squeue (squeue processing may be
56  * asynchronous and the reference protects the connection from being destroyed
57  * before its processing is finished).
58  *
59  * send and receive functions are currently used for TCP only. The send function
60  * determines the IP entry point for the packet once it leaves TCP to be sent to
61  * the destination address. The receive function is used by IP when the packet
62  * should be passed for TCP processing. When a new connection is created these
63  * are set to ip_output() and tcp_input() respectively. During the lifetime of
64  * the connection the send and receive functions may change depending on the
65  * changes in the connection state. For example, Once the connection is bound to
66  * an addresse, the receive function for this connection is set to
67  * tcp_conn_request().  This allows incoming SYNs to go directly into the
68  * listener SYN processing function without going to tcp_input() first.
69  *
70  * Classifier uses several hash tables:
71  *
72  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73  *	ipcl_bind_fanout:	contains all connections in BOUND state
74  *	ipcl_proto_fanout:	IPv4 protocol fanout
75  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76  *	ipcl_udp_fanout:	contains all UDP connections
77  *	ipcl_globalhash_fanout:	contains all connections
78  *
79  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80  * which need to view all existing connections.
81  *
82  * All tables are protected by per-bucket locks. When both per-bucket lock and
83  * connection lock need to be held, the per-bucket lock should be acquired
84  * first, followed by the connection lock.
85  *
86  * All functions doing search in one of these tables increment a reference
87  * counter on the connection found (if any). This reference should be dropped
88  * when the caller has finished processing the connection.
89  *
90  *
91  * INTERFACES:
92  * ===========
93  *
94  * Connection Lookup:
95  * ------------------
96  *
97  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
98  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
99  *
100  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101  * it can't find any associated connection. If the connection is found, its
102  * reference counter is incremented.
103  *
104  *	mp:	mblock, containing packet header. The full header should fit
105  *		into a single mblock. It should also contain at least full IP
106  *		and TCP or UDP header.
107  *
108  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109  *
110  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111  *		 the packet.
112  *
113  * 	zoneid: The zone in which the returned connection must be; the zoneid
114  *		corresponding to the ire_zoneid on the IRE located for the
115  *		packet's destination address.
116  *
117  *	For TCP connections, the lookup order is as follows:
118  *		5-tuple {src, dst, protocol, local port, remote port}
119  *			lookup in ipcl_conn_fanout table.
120  *		3-tuple {dst, remote port, protocol} lookup in
121  *			ipcl_bind_fanout table.
122  *
123  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
124  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
125  *	remote port} lookup is done on ipcl_udp_fanout. Note that
126  *	these interfaces do not handle cases where a packet belongs
127  *
128  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
129  * determine which actual zone gets the segment.  This is used only in a
130  * labeled environment.  The matching rules are:
131  *
132  *	- If it's not a multilevel port, then the label on the packet selects
133  *	  the zone.  Unlabeled packets are delivered to the global zone.
134  *
135  *	- If it's a multilevel port, then only the zone registered to receive
136  *	  packets on that port matches.
137  *
138  * Also, in a labeled environment, packet labels need to be checked.  For fully
139  * bound TCP connections, we can assume that the packet label was checked
140  * during connection establishment, and doesn't need to be checked on each
141  * packet.  For others, though, we need to check for strict equality or, for
142  * multilevel ports, membership in the range or set.  This part currently does
143  * a tnrh lookup on each packet, but could be optimized to use cached results
144  * if that were necessary.  (SCTP doesn't come through here, but if it did,
145  * we would apply the same rules as TCP.)
146  *
147  * An implication of the above is that fully-bound TCP sockets must always use
148  * distinct 4-tuples; they can't be discriminated by label alone.
149  *
150  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
151  * as there's no connection set-up handshake and no shared state.
152  *
153  * Labels on looped-back packets within a single zone do not need to be
154  * checked, as all processes in the same zone have the same label.
155  *
156  * Finally, for unlabeled packets received by a labeled system, special rules
157  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
158  * socket in the zone whose label matches the default label of the sender, if
159  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
160  * receiver's label must dominate the sender's default label.
161  *
162  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
163  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
164  *					 ip_stack);
165  *
166  *	Lookup routine to find an exact match for {src, dst, local port,
167  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
168  *	ports are read from the IP and TCP header respectively.
169  *
170  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
171  *					 zoneid, ip_stack);
172  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
173  *					 zoneid, ip_stack);
174  *
175  * 	Lookup routine to find a listener with the tuple {lport, laddr,
176  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
177  * 	parameter interface index is also compared.
178  *
179  * void ipcl_walk(func, arg, ip_stack)
180  *
181  * 	Apply 'func' to every connection available. The 'func' is called as
182  *	(*func)(connp, arg). The walk is non-atomic so connections may be
183  *	created and destroyed during the walk. The CONN_CONDEMNED and
184  *	CONN_INCIPIENT flags ensure that connections which are newly created
185  *	or being destroyed are not selected by the walker.
186  *
187  * Table Updates
188  * -------------
189  *
190  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
191  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
192  *
193  *	Insert 'connp' in the ipcl_conn_fanout.
194  *	Arguments :
195  *		connp		conn_t to be inserted
196  *		protocol	connection protocol
197  *		src		source address
198  *		dst		destination address
199  *		ports		local and remote port
200  *		ifindex		interface index for IPv6 connections
201  *
202  *	Return value :
203  *		0		if connp was inserted
204  *		EADDRINUSE	if the connection with the same tuple
205  *				already exists.
206  *
207  * int ipcl_bind_insert(connp, protocol, src, lport);
208  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
209  *
210  * 	Insert 'connp' in ipcl_bind_fanout.
211  * 	Arguments :
212  * 		connp		conn_t to be inserted
213  * 		protocol	connection protocol
214  * 		src		source address connection wants
215  * 				to bind to
216  * 		lport		local port connection wants to
217  * 				bind to
218  *
219  *
220  * void ipcl_hash_remove(connp);
221  *
222  * 	Removes the 'connp' from the connection fanout table.
223  *
224  * Connection Creation/Destruction
225  * -------------------------------
226  *
227  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
228  *
229  * 	Creates a new conn based on the type flag, inserts it into
230  * 	globalhash table.
231  *
232  *	type:	This flag determines the type of conn_t which needs to be
233  *		created.
234  *		IPCL_TCPCONN	indicates a TCP connection
235  *		IPCL_IPCCONN	indicates all non-TCP connections.
236  *
237  * void ipcl_conn_destroy(connp)
238  *
239  * 	Destroys the connection state, removes it from the global
240  * 	connection hash table and frees its memory.
241  */
242 
243 #include <sys/types.h>
244 #include <sys/stream.h>
245 #include <sys/stropts.h>
246 #include <sys/sysmacros.h>
247 #include <sys/strsubr.h>
248 #include <sys/strsun.h>
249 #define	_SUN_TPI_VERSION 2
250 #include <sys/ddi.h>
251 #include <sys/cmn_err.h>
252 #include <sys/debug.h>
253 
254 #include <sys/systm.h>
255 #include <sys/param.h>
256 #include <sys/kmem.h>
257 #include <sys/isa_defs.h>
258 #include <inet/common.h>
259 #include <netinet/ip6.h>
260 #include <netinet/icmp6.h>
261 
262 #include <inet/ip.h>
263 #include <inet/ip6.h>
264 #include <inet/tcp.h>
265 #include <inet/ip_ndp.h>
266 #include <inet/udp_impl.h>
267 #include <inet/sctp_ip.h>
268 #include <inet/sctp/sctp_impl.h>
269 
270 #include <sys/cpuvar.h>
271 
272 #include <inet/ipclassifier.h>
273 #include <inet/ipsec_impl.h>
274 
275 #include <sys/tsol/tnet.h>
276 
277 #ifdef DEBUG
278 #define	IPCL_DEBUG
279 #else
280 #undef	IPCL_DEBUG
281 #endif
282 
/*
 * Debug tracing.
 *
 * IPCL_DEBUG_LVL(level, args) prints 'args' (a parenthesized printf
 * argument list) when any bit of 'level' is set in ipcl_debug_level.
 * In non-debug builds it expands to nothing.
 *
 * Both forms are wrapped in do { } while (0) so that the macro is a
 * single statement: it must be followed by a semicolon and is safe to
 * use as the body of an unbraced if/else.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)					\
	do {								\
		if (ipcl_debug_level & (level)) {			\
			printf args;					\
		}							\
	/* CONSTCOND */							\
	} while (0)
#else
#define	IPCL_DEBUG_LVL(level, args)					\
	do {								\
	/* CONSTCOND */							\
	} while (0)
#endif
290 /* Old value for compatibility. Setable in /etc/system */
291 uint_t tcp_conn_hash_size = 0;
292 
293 /* New value. Zero means choose automatically.  Setable in /etc/system */
294 uint_t ipcl_conn_hash_size = 0;
295 uint_t ipcl_conn_hash_memfactor = 8192;
296 uint_t ipcl_conn_hash_maxsize = 82500;
297 
298 /* bind/udp fanout table size */
299 uint_t ipcl_bind_fanout_size = 512;
300 uint_t ipcl_udp_fanout_size = 16384;
301 
302 /* Raw socket fanout size.  Must be a power of 2. */
303 uint_t ipcl_raw_fanout_size = 256;
304 
/*
 * Primes useful for hashing, indexed by N for N of 0-28: each non-zero
 * entry is the nearest prime <= 2^N - 2^(N-2).  The first three entries
 * are 0, and the list is terminated by a final 0.
 */

#define	P2Ps() {							\
	0, 0, 0, 5, 11, 23, 47, 89,					\
	191, 383, 761, 1531, 3067, 6143, 12281, 24571,			\
	49139, 98299, 196597, 393209, 786431, 1572853,			\
	3145721, 6291449, 12582893, 25165813, 50331599,			\
	100663291, 201326557, 0						\
}
314 
315 /*
316  * wrapper structure to ensure that conn+tcpb are aligned
317  * on cache lines.
318  */
319 typedef struct itc_s {
320 	union {
321 		conn_t	itcu_conn;
322 		char	itcu_filler[CACHE_ALIGN(conn_s)];
323 	}	itc_u;
324 	tcp_t	itc_tcp;
325 } itc_t;
326 
327 #define	itc_conn	itc_u.itcu_conn
328 
329 struct kmem_cache  *ipcl_tcpconn_cache;
330 struct kmem_cache  *ipcl_conn_cache;
331 extern struct kmem_cache  *sctp_conn_cache;
332 extern struct kmem_cache  *tcp_sack_info_cache;
333 extern struct kmem_cache  *tcp_iphc_cache;
334 
335 extern void	tcp_timermp_free(tcp_t *);
336 extern mblk_t	*tcp_timermp_alloc(int);
337 
338 static int	ipcl_tcpconn_constructor(void *, void *, int);
339 static void	ipcl_tcpconn_destructor(void *, void *);
340 
#ifdef	IPCL_DEBUG
#define	INET_NTOA_BUFSIZE	18

/*
 * Format the four bytes of 'in', taken in memory order, as dotted-decimal
 * text in the caller-supplied buffer 'b'.  Returns 'b'.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);
	return (b);
}
#endif
354 
355 /*
356  * Global (for all stack instances) init routine
357  */
358 void
359 ipcl_g_init(void)
360 {
361 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
362 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
363 	    NULL, NULL, NULL, NULL, NULL, 0);
364 
365 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
366 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
367 	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
368 	    NULL, NULL, NULL, 0);
369 }
370 
371 /*
372  * ipclassifier intialization routine, sets up hash tables.
373  */
374 void
375 ipcl_init(ip_stack_t *ipst)
376 {
377 	int i;
378 	int sizes[] = P2Ps();
379 
380 	/*
381 	 * Calculate size of conn fanout table from /etc/system settings
382 	 */
383 	if (ipcl_conn_hash_size != 0) {
384 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
385 	} else if (tcp_conn_hash_size != 0) {
386 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
387 	} else {
388 		extern pgcnt_t freemem;
389 
390 		ipst->ips_ipcl_conn_fanout_size =
391 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
392 
393 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
394 			ipst->ips_ipcl_conn_fanout_size =
395 			    ipcl_conn_hash_maxsize;
396 		}
397 	}
398 
399 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
400 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
401 			break;
402 		}
403 	}
404 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
405 		/* Out of range, use the 2^16 value */
406 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
407 	}
408 
409 	/* Take values from /etc/system */
410 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
411 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
412 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
413 
414 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
415 
416 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
417 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
418 
419 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
420 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
421 		    MUTEX_DEFAULT, NULL);
422 	}
423 
424 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
425 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
426 
427 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
428 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
429 		    MUTEX_DEFAULT, NULL);
430 	}
431 
432 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
433 	    sizeof (connf_t), KM_SLEEP);
434 	for (i = 0; i < IPPROTO_MAX; i++) {
435 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
436 		    MUTEX_DEFAULT, NULL);
437 	}
438 
439 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
440 	    sizeof (connf_t), KM_SLEEP);
441 	for (i = 0; i < IPPROTO_MAX; i++) {
442 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
443 		    MUTEX_DEFAULT, NULL);
444 	}
445 
446 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
447 	mutex_init(&ipst->ips_rts_clients->connf_lock,
448 	    NULL, MUTEX_DEFAULT, NULL);
449 
450 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
451 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
452 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
453 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
454 		    MUTEX_DEFAULT, NULL);
455 	}
456 
457 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
458 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
459 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
460 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
461 		    MUTEX_DEFAULT, NULL);
462 	}
463 
464 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
465 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
466 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
467 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
468 		    NULL, MUTEX_DEFAULT, NULL);
469 	}
470 }
471 
472 void
473 ipcl_g_destroy(void)
474 {
475 	kmem_cache_destroy(ipcl_conn_cache);
476 	kmem_cache_destroy(ipcl_tcpconn_cache);
477 }
478 
479 /*
480  * All user-level and kernel use of the stack must be gone
481  * by now.
482  */
483 void
484 ipcl_destroy(ip_stack_t *ipst)
485 {
486 	int i;
487 
488 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
489 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
490 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
491 	}
492 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
493 	    sizeof (connf_t));
494 	ipst->ips_ipcl_conn_fanout = NULL;
495 
496 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
497 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
498 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
499 	}
500 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
501 	    sizeof (connf_t));
502 	ipst->ips_ipcl_bind_fanout = NULL;
503 
504 	for (i = 0; i < IPPROTO_MAX; i++) {
505 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
506 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
507 	}
508 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
509 	ipst->ips_ipcl_proto_fanout = NULL;
510 
511 	for (i = 0; i < IPPROTO_MAX; i++) {
512 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
513 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
514 	}
515 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
516 	    IPPROTO_MAX * sizeof (connf_t));
517 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
518 
519 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
520 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
521 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
522 	}
523 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
524 	    sizeof (connf_t));
525 	ipst->ips_ipcl_udp_fanout = NULL;
526 
527 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
528 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
529 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
530 	}
531 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
532 	    sizeof (connf_t));
533 	ipst->ips_ipcl_raw_fanout = NULL;
534 
535 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
536 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
537 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
538 	}
539 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
540 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
541 	ipst->ips_ipcl_globalhash_fanout = NULL;
542 
543 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
544 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
545 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
546 	ipst->ips_rts_clients = NULL;
547 }
548 
549 /*
550  * conn creation routine. initialize the conn, sets the reference
551  * and inserts it in the global hash table.
552  */
553 conn_t *
554 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
555 {
556 	itc_t	*itc;
557 	conn_t	*connp;
558 	sctp_stack_t *sctps;
559 
560 	switch (type) {
561 	case IPCL_TCPCONN:
562 		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
563 		    sleep)) == NULL)
564 			return (NULL);
565 		connp = &itc->itc_conn;
566 		connp->conn_ref = 1;
567 		netstack_hold(ns);
568 		connp->conn_netstack = ns;
569 		IPCL_DEBUG_LVL(1,
570 		    ("ipcl_conn_create: connp = %p tcp (%p)",
571 		    (void *)connp, (void *)connp->conn_tcp));
572 		ipcl_globalhash_insert(connp);
573 		break;
574 	case IPCL_SCTPCONN:
575 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
576 			return (NULL);
577 		connp->conn_flags = IPCL_SCTPCONN;
578 		sctps = ns->netstack_sctp;
579 		SCTP_G_Q_REFHOLD(sctps);
580 		netstack_hold(ns);
581 		connp->conn_netstack = ns;
582 		break;
583 	case IPCL_IPCCONN:
584 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
585 		if (connp == NULL)
586 			return (NULL);
587 		bzero(connp, sizeof (conn_t));
588 		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
589 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
590 		connp->conn_flags = IPCL_IPCCONN;
591 		connp->conn_ref = 1;
592 		netstack_hold(ns);
593 		connp->conn_netstack = ns;
594 		IPCL_DEBUG_LVL(1,
595 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
596 		ipcl_globalhash_insert(connp);
597 		break;
598 	default:
599 		connp = NULL;
600 		ASSERT(0);
601 	}
602 
603 	return (connp);
604 }
605 
606 void
607 ipcl_conn_destroy(conn_t *connp)
608 {
609 	mblk_t	*mp;
610 	netstack_t	*ns = connp->conn_netstack;
611 
612 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
613 	ASSERT(connp->conn_ref == 0);
614 	ASSERT(connp->conn_ire_cache == NULL);
615 
616 	if (connp->conn_peercred != NULL &&
617 	    connp->conn_peercred != connp->conn_cred)
618 		crfree(connp->conn_peercred);
619 	connp->conn_peercred = NULL;
620 
621 	if (connp->conn_cred != NULL) {
622 		crfree(connp->conn_cred);
623 		connp->conn_cred = NULL;
624 	}
625 
626 	ipcl_globalhash_remove(connp);
627 
628 	cv_destroy(&connp->conn_cv);
629 	if (connp->conn_flags & IPCL_TCPCONN) {
630 		tcp_t	*tcp = connp->conn_tcp;
631 		tcp_stack_t *tcps;
632 
633 		ASSERT(tcp != NULL);
634 		tcps = tcp->tcp_tcps;
635 		if (tcps != NULL) {
636 			if (connp->conn_latch != NULL) {
637 				IPLATCH_REFRELE(connp->conn_latch, ns);
638 				connp->conn_latch = NULL;
639 			}
640 			if (connp->conn_policy != NULL) {
641 				IPPH_REFRELE(connp->conn_policy, ns);
642 				connp->conn_policy = NULL;
643 			}
644 			tcp->tcp_tcps = NULL;
645 			TCPS_REFRELE(tcps);
646 		}
647 
648 		mutex_destroy(&connp->conn_lock);
649 		tcp_free(tcp);
650 		mp = tcp->tcp_timercache;
651 		tcp->tcp_cred = NULL;
652 
653 		if (tcp->tcp_sack_info != NULL) {
654 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
655 			kmem_cache_free(tcp_sack_info_cache,
656 			    tcp->tcp_sack_info);
657 		}
658 		if (tcp->tcp_iphc != NULL) {
659 			if (tcp->tcp_hdr_grown) {
660 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
661 			} else {
662 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
663 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
664 			}
665 			tcp->tcp_iphc_len = 0;
666 		}
667 		ASSERT(tcp->tcp_iphc_len == 0);
668 
669 		ASSERT(connp->conn_latch == NULL);
670 		ASSERT(connp->conn_policy == NULL);
671 
672 		bzero(connp, sizeof (itc_t));
673 
674 		tcp->tcp_timercache = mp;
675 		connp->conn_tcp = tcp;
676 		connp->conn_flags = IPCL_TCPCONN;
677 		connp->conn_ulp = IPPROTO_TCP;
678 		tcp->tcp_connp = connp;
679 		if (ns != NULL) {
680 			ASSERT(tcp->tcp_tcps == NULL);
681 			connp->conn_netstack = NULL;
682 			netstack_rele(ns);
683 		}
684 		kmem_cache_free(ipcl_tcpconn_cache, connp);
685 	} else if (connp->conn_flags & IPCL_SCTPCONN) {
686 		ASSERT(ns != NULL);
687 		sctp_free(connp);
688 	} else {
689 		ASSERT(connp->conn_udp == NULL);
690 		mutex_destroy(&connp->conn_lock);
691 		if (ns != NULL) {
692 			connp->conn_netstack = NULL;
693 			netstack_rele(ns);
694 		}
695 		kmem_cache_free(ipcl_conn_cache, connp);
696 	}
697 }
698 
699 /*
700  * Running in cluster mode - deregister listener information
701  */
702 
703 static void
704 ipcl_conn_unlisten(conn_t *connp)
705 {
706 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
707 	ASSERT(connp->conn_lport != 0);
708 
709 	if (cl_inet_unlisten != NULL) {
710 		sa_family_t	addr_family;
711 		uint8_t		*laddrp;
712 
713 		if (connp->conn_pkt_isv6) {
714 			addr_family = AF_INET6;
715 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
716 		} else {
717 			addr_family = AF_INET;
718 			laddrp = (uint8_t *)&connp->conn_bound_source;
719 		}
720 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
721 		    connp->conn_lport);
722 	}
723 	connp->conn_flags &= ~IPCL_CL_LISTENER;
724 }
725 
/*
 * Unlink 'connp' from the fanout bucket recorded in its conn_fanout, if
 * any, and drop the reference taken when it was inserted.  The bucket lock
 * is taken here; the caller must not hold conn_lock.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*rm_connfp = (connp)->conn_fanout;			\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (rm_connfp != NULL) {					\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&rm_connfp->connf_lock);			\
		if ((connp)->conn_prev == NULL)				\
			rm_connfp->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&rm_connfp->connf_lock);			\
	}								\
}
756 
757 void
758 ipcl_hash_remove(conn_t *connp)
759 {
760 	IPCL_HASH_REMOVE(connp);
761 }
762 
763 /*
764  * The whole purpose of this function is allow removal of
765  * a conn_t from the connected hash for timewait reclaim.
766  * This is essentially a TW reclaim fastpath where timewait
767  * collector checks under fanout lock (so no one else can
768  * get access to the conn_t) that refcnt is 2 i.e. one for
769  * TCP and one for the classifier hash list. If ref count
770  * is indeed 2, we can just remove the conn under lock and
771  * avoid cleaning up the conn under squeue. This gives us
772  * improved performance.
773  */
774 void
775 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
776 {
777 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
778 	ASSERT(MUTEX_HELD(&connp->conn_lock));
779 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
780 
781 	if ((connp)->conn_next != NULL) {
782 		(connp)->conn_next->conn_prev =
783 			(connp)->conn_prev;
784 	}
785 	if ((connp)->conn_prev != NULL) {
786 		(connp)->conn_prev->conn_next =
787 			(connp)->conn_next;
788 	} else {
789 		connfp->connf_head = (connp)->conn_next;
790 	}
791 	(connp)->conn_fanout = NULL;
792 	(connp)->conn_next = NULL;
793 	(connp)->conn_prev = NULL;
794 	(connp)->conn_flags |= IPCL_REMOVED;
795 	ASSERT((connp)->conn_ref == 2);
796 	(connp)->conn_ref--;
797 }
798 
/*
 * Put 'connp', which must not be on any list, at the head of bucket
 * 'connfp', mark it IPCL_CONNECTED (clearing IPCL_REMOVED) and take a
 * reference for the list.  No locking is done here; see
 * IPCL_HASH_INSERT_CONNECTED for the variant that takes the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connp)->conn_next = (connfp)->connf_head;		\
		(connp)->conn_next->conn_prev = (connp);		\
	}								\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = IPCL_CONNECTED |				\
	    ((connp)->conn_flags & ~IPCL_REMOVED);			\
	CONN_INC_REF(connp);						\
}

/*
 * Move 'connp' into bucket 'connfp' of the connected hash: remove it from
 * the table it is in now (if any), then insert it at the head of 'connfp'
 * while holding that bucket's lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
822 
/*
 * Move 'connp' into bucket 'connfp' as a bound conn.
 *
 * The conn is first removed from the table it is in now (if any).  Then,
 * under the bucket lock, it is linked in immediately before the first
 * entry whose source address satisfies _IPCL_V4_MATCH_ANY, or at the tail
 * if there is no such entry.  The conn is marked IPCL_BOUND (clearing
 * IPCL_REMOVED) and a reference is taken for the list.
 *
 * Both arguments are parenthesized at every use so that any pointer
 * expression may be passed.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
851 
/*
 * Move 'connp' into bucket 'connfp' of a wildcard (bind/protocol) table.
 *
 * The conn is first removed from the table it is in now (if any).  Then,
 * under the bucket lock:
 *	- if the conn's source address is IPv4-mapped, it is linked in
 *	  immediately before the first entry in the same zone whose source
 *	  address is unspecified;
 *	- otherwise, or if there is no such entry, it goes at the tail.
 * The conn is marked IPCL_BOUND (clearing IPCL_REMOVED) and a reference is
 * taken for the list.
 *
 * 'connp' is parenthesized at every use so that any pointer expression
 * may be passed.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
883 
884 void
885 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
886 {
887 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
888 }
889 
890 void
891 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
892 {
893 	connf_t	*connfp;
894 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
895 
896 	ASSERT(connp != NULL);
897 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
898 	    protocol == IPPROTO_ESP);
899 
900 	connp->conn_ulp = protocol;
901 
902 	/* Insert it in the protocol hash */
903 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
904 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
905 }
906 
907 void
908 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
909 {
910 	connf_t	*connfp;
911 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
912 
913 	ASSERT(connp != NULL);
914 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
915 	    protocol == IPPROTO_ESP);
916 
917 	connp->conn_ulp = protocol;
918 
919 	/* Insert it in the Bind Hash */
920 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
921 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
922 }
923 
924 /*
925  * This function is used only for inserting SCTP raw socket now.
926  * This may change later.
927  *
928  * Note that only one raw socket can be bound to a port.  The param
929  * lport is in network byte order.
930  */
931 static int
932 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
933 {
934 	connf_t	*connfp;
935 	conn_t	*oconnp;
936 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
937 
938 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
939 
940 	/* Check for existing raw socket already bound to the port. */
941 	mutex_enter(&connfp->connf_lock);
942 	for (oconnp = connfp->connf_head; oconnp != NULL;
943 	    oconnp = oconnp->conn_next) {
944 		if (oconnp->conn_lport == lport &&
945 		    oconnp->conn_zoneid == connp->conn_zoneid &&
946 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
947 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
948 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
949 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
950 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
951 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
952 		    &connp->conn_srcv6))) {
953 			break;
954 		}
955 	}
956 	mutex_exit(&connfp->connf_lock);
957 	if (oconnp != NULL)
958 		return (EADDRNOTAVAIL);
959 
960 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
961 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
962 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
963 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
964 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
965 		} else {
966 			IPCL_HASH_INSERT_BOUND(connfp, connp);
967 		}
968 	} else {
969 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
970 	}
971 	return (0);
972 }
973 
974 /*
975  * Check for a MAC exemption conflict on a labeled system.  Note that for
976  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
977  * transport layer.  This check is for binding all other protocols.
978  *
979  * Returns true if there's a conflict.
980  */
981 static boolean_t
982 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
983 {
984 	connf_t	*connfp;
985 	conn_t *tconn;
986 
987 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
988 	mutex_enter(&connfp->connf_lock);
989 	for (tconn = connfp->connf_head; tconn != NULL;
990 	    tconn = tconn->conn_next) {
991 		/* We don't allow v4 fallback for v6 raw socket */
992 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
993 			continue;
994 		/* If neither is exempt, then there's no conflict */
995 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
996 			continue;
997 		/* If both are bound to different specific addrs, ok */
998 		if (connp->conn_src != INADDR_ANY &&
999 		    tconn->conn_src != INADDR_ANY &&
1000 		    connp->conn_src != tconn->conn_src)
1001 			continue;
1002 		/* These two conflict; fail */
1003 		break;
1004 	}
1005 	mutex_exit(&connfp->connf_lock);
1006 	return (tconn != NULL);
1007 }
1008 
1009 static boolean_t
1010 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1011 {
1012 	connf_t	*connfp;
1013 	conn_t *tconn;
1014 
1015 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1016 	mutex_enter(&connfp->connf_lock);
1017 	for (tconn = connfp->connf_head; tconn != NULL;
1018 	    tconn = tconn->conn_next) {
1019 		/* We don't allow v4 fallback for v6 raw socket */
1020 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1021 			continue;
1022 		/* If neither is exempt, then there's no conflict */
1023 		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1024 			continue;
1025 		/* If both are bound to different addrs, ok */
1026 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1027 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1028 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1029 			continue;
1030 		/* These two conflict; fail */
1031 		break;
1032 	}
1033 	mutex_exit(&connfp->connf_lock);
1034 	return (tconn != NULL);
1035 }
1036 
1037 /*
1038  * (v4, v6) bind hash insertion routines
1039  */
1040 int
1041 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
1042 {
1043 	connf_t	*connfp;
1044 #ifdef	IPCL_DEBUG
1045 	char	buf[INET_NTOA_BUFSIZE];
1046 #endif
1047 	int	ret = 0;
1048 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1049 
1050 	ASSERT(connp);
1051 
1052 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
1053 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
1054 
1055 	connp->conn_ulp = protocol;
1056 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
1057 	connp->conn_lport = lport;
1058 
1059 	switch (protocol) {
1060 	default:
1061 		if (is_system_labeled() &&
1062 		    check_exempt_conflict_v4(connp, ipst))
1063 			return (EADDRINUSE);
1064 		/* FALLTHROUGH */
1065 	case IPPROTO_UDP:
1066 		if (protocol == IPPROTO_UDP) {
1067 			IPCL_DEBUG_LVL(64,
1068 			    ("ipcl_bind_insert: connp %p - udp\n",
1069 			    (void *)connp));
1070 			connfp = &ipst->ips_ipcl_udp_fanout[
1071 			    IPCL_UDP_HASH(lport, ipst)];
1072 		} else {
1073 			IPCL_DEBUG_LVL(64,
1074 			    ("ipcl_bind_insert: connp %p - protocol\n",
1075 			    (void *)connp));
1076 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1077 		}
1078 
1079 		if (connp->conn_rem != INADDR_ANY) {
1080 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1081 		} else if (connp->conn_src != INADDR_ANY) {
1082 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1083 		} else {
1084 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1085 		}
1086 		break;
1087 
1088 	case IPPROTO_TCP:
1089 
1090 		/* Insert it in the Bind Hash */
1091 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1092 		connfp = &ipst->ips_ipcl_bind_fanout[
1093 		    IPCL_BIND_HASH(lport, ipst)];
1094 		if (connp->conn_src != INADDR_ANY) {
1095 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1096 		} else {
1097 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1098 		}
1099 		if (cl_inet_listen != NULL) {
1100 			ASSERT(!connp->conn_pkt_isv6);
1101 			connp->conn_flags |= IPCL_CL_LISTENER;
1102 			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
1103 			    (uint8_t *)&connp->conn_bound_source, lport);
1104 		}
1105 		break;
1106 
1107 	case IPPROTO_SCTP:
1108 		ret = ipcl_sctp_hash_insert(connp, lport);
1109 		break;
1110 	}
1111 
1112 	return (ret);
1113 }
1114 
1115 int
1116 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1117     uint16_t lport)
1118 {
1119 	connf_t	*connfp;
1120 	int	ret = 0;
1121 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1122 
1123 	ASSERT(connp);
1124 
1125 	connp->conn_ulp = protocol;
1126 	connp->conn_srcv6 = *src;
1127 	connp->conn_lport = lport;
1128 
1129 	switch (protocol) {
1130 	default:
1131 		if (is_system_labeled() &&
1132 		    check_exempt_conflict_v6(connp, ipst))
1133 			return (EADDRINUSE);
1134 		/* FALLTHROUGH */
1135 	case IPPROTO_UDP:
1136 		if (protocol == IPPROTO_UDP) {
1137 			IPCL_DEBUG_LVL(128,
1138 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1139 			    (void *)connp));
1140 			connfp = &ipst->ips_ipcl_udp_fanout[
1141 			    IPCL_UDP_HASH(lport, ipst)];
1142 		} else {
1143 			IPCL_DEBUG_LVL(128,
1144 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1145 			    (void *)connp));
1146 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1147 		}
1148 
1149 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1150 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1151 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1152 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1153 		} else {
1154 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1155 		}
1156 		break;
1157 
1158 	case IPPROTO_TCP:
1159 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1160 
1161 		/* Insert it in the Bind Hash */
1162 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1163 		connfp = &ipst->ips_ipcl_bind_fanout[
1164 		    IPCL_BIND_HASH(lport, ipst)];
1165 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1166 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1167 		} else {
1168 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1169 		}
1170 		if (cl_inet_listen != NULL) {
1171 			sa_family_t	addr_family;
1172 			uint8_t		*laddrp;
1173 
1174 			if (connp->conn_pkt_isv6) {
1175 				addr_family = AF_INET6;
1176 				laddrp =
1177 				    (uint8_t *)&connp->conn_bound_source_v6;
1178 			} else {
1179 				addr_family = AF_INET;
1180 				laddrp = (uint8_t *)&connp->conn_bound_source;
1181 			}
1182 			connp->conn_flags |= IPCL_CL_LISTENER;
1183 			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
1184 			    lport);
1185 		}
1186 		break;
1187 
1188 	case IPPROTO_SCTP:
1189 		ret = ipcl_sctp_hash_insert(connp, lport);
1190 		break;
1191 	}
1192 
1193 	return (ret);
1194 }
1195 
/*
 * ipcl_conn_hash insertion routines.
 *
 * ipcl_conn_insert() inserts an IPv4 conn into the fanout that goes with
 * its protocol:
 *
 *	TCP	the connection fanout, unless an entry matching the same
 *		protocol, addresses and ports is already present;
 *	SCTP	the raw fanout, through ipcl_sctp_hash_insert(), after
 *		removing connp from whatever hash it is currently on;
 *	UDP	the UDP fanout;
 *	others	the IPv4 per-protocol fanout, after a MAC-exempt conflict
 *		check on labeled systems.
 *
 * Returns 0 on success, EADDRINUSE if a matching TCP conn or a MAC-exempt
 * conflict is found, or the result of ipcl_sctp_hash_insert() for SCTP.
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * For an eager connection, i.e. a connection which
			 * has just been created, the initialization is
			 * already done in ip at conn creation time, so
			 * it is skipped; only non-eager conns are
			 * initialized here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Refuse the insert if an identical connection exists. */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The bucket lock is dropped around
			 * IPCL_HASH_REMOVE and then re-taken.
			 * NOTE(review): the duplicate scan above is not
			 * repeated after the lock is re-acquired.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/*
		 * Take the low 16 bits of ports (in host order) and hand
		 * them on in network byte order, which is what
		 * ipcl_sctp_hash_insert() takes.
		 */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 *
		 * NOTE(review): the check reads connp's conn_ulp and
		 * conn_src as they are on entry, i.e. before
		 * IPCL_CONN_INIT below is applied with protocol and src --
		 * confirm that this ordering is intended.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/*
		 * View ports as two 16-bit halves; up[1] picks the UDP
		 * bucket (the classifier hashes on the half it calls the
		 * local port).
		 */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/* Connected, bound or wildcard, by which addresses are set. */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1304 
/*
 * IPv6 counterpart of ipcl_conn_insert(): inserts connp into the fanout
 * that goes with its protocol:
 *
 *	TCP	the connection fanout, unless an entry matching the same
 *		protocol, addresses and ports is already present whose
 *		tcp_bound_if is either 0 or equal to ifindex;
 *	SCTP	the raw fanout, through ipcl_sctp_hash_insert(), after
 *		removing connp from whatever hash it is currently on;
 *	UDP	the UDP fanout;
 *	others	the IPv6 per-protocol fanout, after a MAC-exempt conflict
 *		check on labeled systems.
 *
 * ifindex is consulted only in the TCP case.
 *
 * Returns 0 on success, EADDRINUSE if a matching TCP conn or a MAC-exempt
 * conflict is found, or the result of ipcl_sctp_hash_insert() for SCTP.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		/* Conns flagged IPCL_EAGER skip the initialization. */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		/*
		 * Refuse the insert if an identical connection exists that
		 * is either not tied to an interface or tied to ifindex.
		 */
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The bucket lock is dropped around
			 * IPCL_HASH_REMOVE and then re-taken.
			 * NOTE(review): the duplicate scan above is not
			 * repeated after the lock is re-acquired.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* Take connp off its current hash before re-inserting it. */
		IPCL_HASH_REMOVE(connp);
		/*
		 * Take the low 16 bits of ports (in host order) and hand
		 * them on in network byte order, which is what
		 * ipcl_sctp_hash_insert() takes.
		 */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Protocols without port numbers get their MAC-exempt
		 * conflict check here, on labeled systems only.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* View ports as two 16-bit halves; up[1] picks the bucket. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Connected, bound or wildcard, by which addresses are set. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1384 
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * Only TCP and UDP are looked up here; any other protocol yields NULL.
 * For TCP the connection fanout is searched first and, if nothing matches,
 * the bind fanout is searched for a listener.  For UDP the UDP fanout is
 * searched.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied.  Labels are also checked as described
 * above.  If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * up addresses the port pair that follows the IP header: below,
	 * up[0] is used as the foreign port and up[1] as the local port.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Look for an exact match in the connection fanout first. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No exact match; look for a listener in the bind fanout. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		/*
		 * A candidate must match the binding and either be in the
		 * chosen zone or be MAC-exempt while the packet is unlabeled.
		 */
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		/*
		 * A candidate must match the ports and addresses and either
		 * be in the chosen zone or be MAC-exempt while the packet is
		 * unlabeled.
		 */
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On a labeled system the match must pass the label check. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1581 
/*
 * v6 packet classifying function; the IPv6 counterpart of
 * ipcl_classify_v4().  Looks up the fanout tables to find the conn the
 * packet belongs to and returns it with a reference held, or NULL if there
 * is none.
 *
 * Only TCP and UDP are looked up here; any other protocol yields NULL.
 * For TCP the connection fanout is searched first and, if nothing matches,
 * the bind fanout is searched for a listener.  For UDP the UDP fanout is
 * searched.  hdr_len is the offset of the transport header from b_rptr.
 *
 * When zoneid is ALL_ZONES the zone to search is derived from the local
 * port and, failing that, from the packet label.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Look for an exact match in the connection fanout first. */
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No exact match; look for a listener in the bind fanout. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		/*
		 * A candidate must match the binding and either be in the
		 * chosen zone or be MAC-exempt while the packet is unlabeled.
		 */
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On a labeled system the match must pass the label check. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* up[0] is used as the foreign port, up[1] as the local one. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		/*
		 * A candidate must match the ports and addresses and either
		 * be in the chosen zone or be MAC-exempt while the packet is
		 * unlabeled.
		 */
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On a labeled system the match must pass the label check. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1777 
1778 /*
1779  * wrapper around ipcl_classify_(v4,v6) routines.
1780  */
1781 conn_t *
1782 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1783 {
1784 	uint16_t	hdr_len;
1785 	ipha_t		*ipha;
1786 	uint8_t		*nexthdrp;
1787 
1788 	if (MBLKL(mp) < sizeof (ipha_t))
1789 		return (NULL);
1790 
1791 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1792 	case IPV4_VERSION:
1793 		ipha = (ipha_t *)mp->b_rptr;
1794 		hdr_len = IPH_HDR_LENGTH(ipha);
1795 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1796 		    zoneid, ipst));
1797 	case IPV6_VERSION:
1798 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1799 		    &hdr_len, &nexthdrp))
1800 			return (NULL);
1801 
1802 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
1803 	}
1804 
1805 	return (NULL);
1806 }
1807 
/*
 * Classify a packet against the raw fanout.  Returns the matching conn
 * with a reference held, or NULL if there is none.
 *
 * hdr points at the IP header; its version field decides whether it is
 * treated as an ipha_t or as an ip6_t.  ports carries the port pair, of
 * which the second 16-bit half is used as the local port.
 *
 * The lookup has two stages:
 *
 *	1. The bucket hashed on the local port.  A fully-bound conn must
 *	   match on protocol, both addresses and ports; any other conn must
 *	   match on protocol, destination address and local port.  The conn
 *	   must also be in the chosen zone, or be MAC-exempt while the
 *	   packet is unlabeled.  On a labeled system the candidate must in
 *	   addition pass tsol_receive_local(), except for fully-bound TCP
 *	   or SCTP conns.
 *	2. If stage 1 finds nothing, the bucket hashed on port 0, where a
 *	   conn in the chosen zone matching on protocol and destination
 *	   address is accepted.
 *
 * In both stages a conn of the other address family is skipped.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	/* The second 16-bit half of ports is the local port. */
	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	/* Pick the destination address according to the IP version. */
	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Stage 1: the bucket for this local port. */
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully bound: addresses and ports must all match. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Otherwise match on destination and local port. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	/* The bucket lock is still held when jumping to found. */
	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/*
	 * Reached with the lock of the bucket in connfp held; take the
	 * caller's reference before dropping it.
	 */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1942 
1943 /* ARGSUSED */
1944 static int
1945 ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1946 {
1947 	itc_t	*itc = (itc_t *)buf;
1948 	conn_t 	*connp = &itc->itc_conn;
1949 	tcp_t	*tcp = &itc->itc_tcp;
1950 	bzero(itc, sizeof (itc_t));
1951 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1952 	connp->conn_tcp = tcp;
1953 	connp->conn_flags = IPCL_TCPCONN;
1954 	connp->conn_ulp = IPPROTO_TCP;
1955 	tcp->tcp_connp = connp;
1956 	return (0);
1957 }
1958 
1959 /* ARGSUSED */
1960 static void
1961 ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1962 {
1963 	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1964 }
1965 
1966 /*
1967  * All conns are inserted in a global multi-list for the benefit of
1968  * walkers. The walk is guaranteed to walk all open conns at the time
1969  * of the start of the walk exactly once. This property is needed to
1970  * achieve some cleanups during unplumb of interfaces. This is achieved
1971  * as follows.
1972  *
1973  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1974  * call the insert and delete functions below at creation and deletion
1975  * time respectively. The conn never moves or changes its position in this
1976  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1977  * won't increase due to walkers, once the conn deletion has started. Note
1978  * that we can't remove the conn from the global list and then wait for
1979  * the refcnt to drop to zero, since walkers would then see a truncated
1980  * list. CONN_INCIPIENT ensures that walkers don't start looking at
1981  * conns until ip_open is ready to make them globally visible.
1982  * The global round robin multi-list locks are held only to get the
1983  * next member/insertion/deletion and contention should be negligible
1984  * if the multi-list is much greater than the number of cpus.
1985  */
1986 void
1987 ipcl_globalhash_insert(conn_t *connp)
1988 {
1989 	int	index;
1990 	struct connf_s	*connfp;
1991 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1992 
1993 	/*
1994 	 * No need for atomic here. Approximate even distribution
1995 	 * in the global lists is sufficient.
1996 	 */
1997 	ipst->ips_conn_g_index++;
1998 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
1999 
2000 	connp->conn_g_prev = NULL;
2001 	/*
2002 	 * Mark as INCIPIENT, so that walkers will ignore this
2003 	 * for now, till ip_open is ready to make it visible globally.
2004 	 */
2005 	connp->conn_state_flags |= CONN_INCIPIENT;
2006 
2007 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2008 	/* Insert at the head of the list */
2009 	mutex_enter(&connfp->connf_lock);
2010 	connp->conn_g_next = connfp->connf_head;
2011 	if (connp->conn_g_next != NULL)
2012 		connp->conn_g_next->conn_g_prev = connp;
2013 	connfp->connf_head = connp;
2014 
2015 	/* The fanout bucket this conn points to */
2016 	connp->conn_g_fanout = connfp;
2017 
2018 	mutex_exit(&connfp->connf_lock);
2019 }
2020 
2021 void
2022 ipcl_globalhash_remove(conn_t *connp)
2023 {
2024 	struct connf_s	*connfp;
2025 
2026 	/*
2027 	 * We were never inserted in the global multi list.
2028 	 * IPCL_NONE variety is never inserted in the global multilist
2029 	 * since it is presumed to not need any cleanup and is transient.
2030 	 */
2031 	if (connp->conn_g_fanout == NULL)
2032 		return;
2033 
2034 	connfp = connp->conn_g_fanout;
2035 	mutex_enter(&connfp->connf_lock);
2036 	if (connp->conn_g_prev != NULL)
2037 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2038 	else
2039 		connfp->connf_head = connp->conn_g_next;
2040 	if (connp->conn_g_next != NULL)
2041 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2042 	mutex_exit(&connfp->connf_lock);
2043 
2044 	/* Better to stumble on a null pointer than to corrupt memory */
2045 	connp->conn_g_next = NULL;
2046 	connp->conn_g_prev = NULL;
2047 }
2048 
2049 /*
2050  * Walk the list of all conn_t's in the system, calling the function provided
2051  * with the specified argument for each.
2052  * Applies to both IPv4 and IPv6.
2053  *
2054  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2055  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2056  * unplumbed or removed. New conn_t's that are created while we are walking
2057  * may be missed by this walk, because they are not necessarily inserted
2058  * at the tail of the list. They are new conn_t's and thus don't have any
2059  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2060  * is created to the struct that is going away.
2061  */
2062 void
2063 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2064 {
2065 	int	i;
2066 	conn_t	*connp;
2067 	conn_t	*prev_connp;
2068 
2069 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2070 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2071 		prev_connp = NULL;
2072 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2073 		while (connp != NULL) {
2074 			mutex_enter(&connp->conn_lock);
2075 			if (connp->conn_state_flags &
2076 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2077 				mutex_exit(&connp->conn_lock);
2078 				connp = connp->conn_g_next;
2079 				continue;
2080 			}
2081 			CONN_INC_REF_LOCKED(connp);
2082 			mutex_exit(&connp->conn_lock);
2083 			mutex_exit(
2084 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2085 			(*func)(connp, arg);
2086 			if (prev_connp != NULL)
2087 				CONN_DEC_REF(prev_connp);
2088 			mutex_enter(
2089 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2090 			prev_connp = connp;
2091 			connp = connp->conn_g_next;
2092 		}
2093 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2094 		if (prev_connp != NULL)
2095 			CONN_DEC_REF(prev_connp);
2096 	}
2097 }
2098 
2099 /*
2100  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2101  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2102  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2103  * (peer tcp in ESTABLISHED state).
2104  */
2105 conn_t *
2106 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2107     ip_stack_t *ipst)
2108 {
2109 	uint32_t ports;
2110 	uint16_t *pports = (uint16_t *)&ports;
2111 	connf_t	*connfp;
2112 	conn_t	*tconnp;
2113 	boolean_t zone_chk;
2114 
2115 	/*
2116 	 * If either the source of destination address is loopback, then
2117 	 * both endpoints must be in the same Zone.  Otherwise, both of
2118 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2119 	 * state) and the endpoints may reside in different Zones.
2120 	 */
2121 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2122 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2123 
2124 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2125 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2126 
2127 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2128 	    ports, ipst)];
2129 
2130 	mutex_enter(&connfp->connf_lock);
2131 	for (tconnp = connfp->connf_head; tconnp != NULL;
2132 	    tconnp = tconnp->conn_next) {
2133 
2134 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2135 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2136 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2137 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2138 
2139 			ASSERT(tconnp != connp);
2140 			CONN_INC_REF(tconnp);
2141 			mutex_exit(&connfp->connf_lock);
2142 			return (tconnp);
2143 		}
2144 	}
2145 	mutex_exit(&connfp->connf_lock);
2146 	return (NULL);
2147 }
2148 
2149 /*
2150  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2151  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2152  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2153  * (peer tcp in ESTABLISHED state).
2154  */
2155 conn_t *
2156 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2157     ip_stack_t *ipst)
2158 {
2159 	uint32_t ports;
2160 	uint16_t *pports = (uint16_t *)&ports;
2161 	connf_t	*connfp;
2162 	conn_t	*tconnp;
2163 	boolean_t zone_chk;
2164 
2165 	/*
2166 	 * If either the source of destination address is loopback, then
2167 	 * both endpoints must be in the same Zone.  Otherwise, both of
2168 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2169 	 * state) and the endpoints may reside in different Zones.  We
2170 	 * don't do Zone check for link local address(es) because the
2171 	 * current Zone implementation treats each link local address as
2172 	 * being unique per system node, i.e. they belong to global Zone.
2173 	 */
2174 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2175 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2176 
2177 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2178 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2179 
2180 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2181 	    ports, ipst)];
2182 
2183 	mutex_enter(&connfp->connf_lock);
2184 	for (tconnp = connfp->connf_head; tconnp != NULL;
2185 	    tconnp = tconnp->conn_next) {
2186 
2187 		/* We skip tcp_bound_if check here as this is loopback tcp */
2188 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2189 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2190 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2191 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2192 
2193 			ASSERT(tconnp != connp);
2194 			CONN_INC_REF(tconnp);
2195 			mutex_exit(&connfp->connf_lock);
2196 			return (tconnp);
2197 		}
2198 	}
2199 	mutex_exit(&connfp->connf_lock);
2200 	return (NULL);
2201 }
2202 
2203 /*
2204  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2205  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2206  * Only checks for connected entries i.e. no INADDR_ANY checks.
2207  */
2208 conn_t *
2209 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2210     ip_stack_t *ipst)
2211 {
2212 	uint32_t ports;
2213 	uint16_t *pports;
2214 	connf_t	*connfp;
2215 	conn_t	*tconnp;
2216 
2217 	pports = (uint16_t *)&ports;
2218 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2219 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2220 
2221 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2222 					    ports, ipst)];
2223 
2224 	mutex_enter(&connfp->connf_lock);
2225 	for (tconnp = connfp->connf_head; tconnp != NULL;
2226 	    tconnp = tconnp->conn_next) {
2227 
2228 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2229 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2230 		    tconnp->conn_tcp->tcp_state >= min_state) {
2231 
2232 			CONN_INC_REF(tconnp);
2233 			mutex_exit(&connfp->connf_lock);
2234 			return (tconnp);
2235 		}
2236 	}
2237 	mutex_exit(&connfp->connf_lock);
2238 	return (NULL);
2239 }
2240 
2241 /*
2242  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2243  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2244  * Only checks for connected entries i.e. no INADDR_ANY checks.
2245  * Match on ifindex in addition to addresses.
2246  */
2247 conn_t *
2248 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2249     uint_t ifindex, ip_stack_t *ipst)
2250 {
2251 	tcp_t	*tcp;
2252 	uint32_t ports;
2253 	uint16_t *pports;
2254 	connf_t	*connfp;
2255 	conn_t	*tconnp;
2256 
2257 	pports = (uint16_t *)&ports;
2258 	pports[0] = tcpha->tha_fport;
2259 	pports[1] = tcpha->tha_lport;
2260 
2261 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2262 					    ports, ipst)];
2263 
2264 	mutex_enter(&connfp->connf_lock);
2265 	for (tconnp = connfp->connf_head; tconnp != NULL;
2266 	    tconnp = tconnp->conn_next) {
2267 
2268 		tcp = tconnp->conn_tcp;
2269 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2270 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2271 		    tcp->tcp_state >= min_state &&
2272 		    (tcp->tcp_bound_if == 0 ||
2273 		    tcp->tcp_bound_if == ifindex)) {
2274 
2275 			CONN_INC_REF(tconnp);
2276 			mutex_exit(&connfp->connf_lock);
2277 			return (tconnp);
2278 		}
2279 	}
2280 	mutex_exit(&connfp->connf_lock);
2281 	return (NULL);
2282 }
2283 
2284 /*
2285  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2286  * a listener when changing state.
2287  */
2288 conn_t *
2289 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2290     ip_stack_t *ipst)
2291 {
2292 	connf_t		*bind_connfp;
2293 	conn_t		*connp;
2294 	tcp_t		*tcp;
2295 
2296 	/*
2297 	 * Avoid false matches for packets sent to an IP destination of
2298 	 * all zeros.
2299 	 */
2300 	if (laddr == 0)
2301 		return (NULL);
2302 
2303 	ASSERT(zoneid != ALL_ZONES);
2304 
2305 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2306 	mutex_enter(&bind_connfp->connf_lock);
2307 	for (connp = bind_connfp->connf_head; connp != NULL;
2308 	    connp = connp->conn_next) {
2309 		tcp = connp->conn_tcp;
2310 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2311 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2312 		    (tcp->tcp_listener == NULL)) {
2313 			CONN_INC_REF(connp);
2314 			mutex_exit(&bind_connfp->connf_lock);
2315 			return (connp);
2316 		}
2317 	}
2318 	mutex_exit(&bind_connfp->connf_lock);
2319 	return (NULL);
2320 }
2321 
2322 /*
2323  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2324  * a listener when changing state.
2325  */
2326 conn_t *
2327 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2328     zoneid_t zoneid, ip_stack_t *ipst)
2329 {
2330 	connf_t		*bind_connfp;
2331 	conn_t		*connp = NULL;
2332 	tcp_t		*tcp;
2333 
2334 	/*
2335 	 * Avoid false matches for packets sent to an IP destination of
2336 	 * all zeros.
2337 	 */
2338 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2339 		return (NULL);
2340 
2341 	ASSERT(zoneid != ALL_ZONES);
2342 
2343 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2344 	mutex_enter(&bind_connfp->connf_lock);
2345 	for (connp = bind_connfp->connf_head; connp != NULL;
2346 	    connp = connp->conn_next) {
2347 		tcp = connp->conn_tcp;
2348 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2349 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2350 		    (tcp->tcp_bound_if == 0 ||
2351 		    tcp->tcp_bound_if == ifindex) &&
2352 		    tcp->tcp_listener == NULL) {
2353 			CONN_INC_REF(connp);
2354 			mutex_exit(&bind_connfp->connf_lock);
2355 			return (connp);
2356 		}
2357 	}
2358 	mutex_exit(&bind_connfp->connf_lock);
2359 	return (NULL);
2360 }
2361 
2362 /*
2363  * ipcl_get_next_conn
2364  *	get the next entry in the conn global list
2365  *	and put a reference on the next_conn.
2366  *	decrement the reference on the current conn.
2367  *
2368  * This is an iterator based walker function that also provides for
2369  * some selection by the caller. It walks through the conn_hash bucket
2370  * searching for the next valid connp in the list, and selects connections
2371  * that are neither closed nor condemned. It also REFHOLDS the conn
2372  * thus ensuring that the conn exists when the caller uses the conn.
2373  */
2374 conn_t *
2375 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2376 {
2377 	conn_t	*next_connp;
2378 
2379 	if (connfp == NULL)
2380 		return (NULL);
2381 
2382 	mutex_enter(&connfp->connf_lock);
2383 
2384 	next_connp = (connp == NULL) ?
2385 	    connfp->connf_head : connp->conn_g_next;
2386 
2387 	while (next_connp != NULL) {
2388 		mutex_enter(&next_connp->conn_lock);
2389 		if (!(next_connp->conn_flags & conn_flags) ||
2390 		    (next_connp->conn_state_flags &
2391 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2392 			/*
2393 			 * This conn has been condemned or
2394 			 * is closing, or the flags don't match
2395 			 */
2396 			mutex_exit(&next_connp->conn_lock);
2397 			next_connp = next_connp->conn_g_next;
2398 			continue;
2399 		}
2400 		CONN_INC_REF_LOCKED(next_connp);
2401 		mutex_exit(&next_connp->conn_lock);
2402 		break;
2403 	}
2404 
2405 	mutex_exit(&connfp->connf_lock);
2406 
2407 	if (connp != NULL)
2408 		CONN_DEC_REF(connp);
2409 
2410 	return (next_connp);
2411 }
2412 
2413 #ifdef CONN_DEBUG
2414 /*
2415  * Trace of the last NBUF refhold/refrele
2416  */
2417 int
2418 conn_trace_ref(conn_t *connp)
2419 {
2420 	int	last;
2421 	conn_trace_t	*ctb;
2422 
2423 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2424 	last = connp->conn_trace_last;
2425 	last++;
2426 	if (last == CONN_TRACE_MAX)
2427 		last = 0;
2428 
2429 	ctb = &connp->conn_trace_buf[last];
2430 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2431 	connp->conn_trace_last = last;
2432 	return (1);
2433 }
2434 
2435 int
2436 conn_untrace_ref(conn_t *connp)
2437 {
2438 	int	last;
2439 	conn_trace_t	*ctb;
2440 
2441 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2442 	last = connp->conn_trace_last;
2443 	last++;
2444 	if (last == CONN_TRACE_MAX)
2445 		last = 0;
2446 
2447 	ctb = &connp->conn_trace_buf[last];
2448 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2449 	connp->conn_trace_last = last;
2450 	return (1);
2451 }
2452 #endif
2453