xref: /illumos-gate/usr/src/uts/common/inet/ip/ipclassifier.c (revision 46e5ca4c180bbc8cb48be79bc045e873add461ac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP PACKET CLASSIFIER
28  *
29  * The IP packet classifier provides mapping between IP packets and persistent
30  * connection state for connection-oriented protocols. It also provides
31  * interface for managing connection states.
32  *
33  * The connection state is kept in conn_t data structure and contains, among
34  * other things:
35  *
36  *	o local/remote address and ports
37  *	o Transport protocol
38  *	o squeue for the connection (for TCP only)
39  *	o reference counter
40  *	o Connection state
41  *	o hash table linkage
42  *	o interface/ire information
43  *	o credentials
44  *	o ipsec policy
45  *	o send and receive functions.
46  *	o mutex lock.
47  *
48  * Connections use a reference counting scheme. They are freed when the
49  * reference counter drops to zero. A reference is incremented when connection
50  * is placed in a list or table, when incoming packet for the connection arrives
51  * and when connection is processed via squeue (squeue processing may be
52  * asynchronous and the reference protects the connection from being destroyed
53  * before its processing is finished).
54  *
55  * send and receive functions are currently used for TCP only. The send function
56  * determines the IP entry point for the packet once it leaves TCP to be sent to
57  * the destination address. The receive function is used by IP when the packet
58  * should be passed for TCP processing. When a new connection is created these
59  * are set to ip_output() and tcp_input() respectively. During the lifetime of
60  * the connection the send and receive functions may change depending on the
 61  * changes in the connection state. For example, once the connection is bound to
 62  * an address, the receive function for this connection is set to
63  * tcp_conn_request().  This allows incoming SYNs to go directly into the
64  * listener SYN processing function without going to tcp_input() first.
65  *
66  * Classifier uses several hash tables:
67  *
68  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
69  *	ipcl_bind_fanout:	contains all connections in BOUND state
70  *	ipcl_proto_fanout:	IPv4 protocol fanout
71  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
72  *	ipcl_udp_fanout:	contains all UDP connections
73  *	ipcl_iptun_fanout:	contains all IP tunnel connections
74  *	ipcl_globalhash_fanout:	contains all connections
75  *
76  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
77  * which need to view all existing connections.
78  *
79  * All tables are protected by per-bucket locks. When both per-bucket lock and
80  * connection lock need to be held, the per-bucket lock should be acquired
81  * first, followed by the connection lock.
82  *
83  * All functions doing search in one of these tables increment a reference
84  * counter on the connection found (if any). This reference should be dropped
85  * when the caller has finished processing the connection.
86  *
87  *
88  * INTERFACES:
89  * ===========
90  *
91  * Connection Lookup:
92  * ------------------
93  *
94  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
95  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
96  *
97  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
98  * it can't find any associated connection. If the connection is found, its
99  * reference counter is incremented.
100  *
101  *	mp:	mblock, containing packet header. The full header should fit
102  *		into a single mblock. It should also contain at least full IP
103  *		and TCP or UDP header.
104  *
105  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
106  *
107  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
108  *		 the packet.
109  *
110  * 	zoneid: The zone in which the returned connection must be; the zoneid
111  *		corresponding to the ire_zoneid on the IRE located for the
112  *		packet's destination address.
113  *
114  *	For TCP connections, the lookup order is as follows:
115  *		5-tuple {src, dst, protocol, local port, remote port}
116  *			lookup in ipcl_conn_fanout table.
117  *		3-tuple {dst, remote port, protocol} lookup in
118  *			ipcl_bind_fanout table.
119  *
120  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
121  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 122  *	these interfaces do not handle cases where a packet belongs
123  *	to multiple UDP clients, which is handled in IP itself.
124  *
125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
126  * determine which actual zone gets the segment.  This is used only in a
127  * labeled environment.  The matching rules are:
128  *
129  *	- If it's not a multilevel port, then the label on the packet selects
130  *	  the zone.  Unlabeled packets are delivered to the global zone.
131  *
132  *	- If it's a multilevel port, then only the zone registered to receive
133  *	  packets on that port matches.
134  *
135  * Also, in a labeled environment, packet labels need to be checked.  For fully
136  * bound TCP connections, we can assume that the packet label was checked
137  * during connection establishment, and doesn't need to be checked on each
138  * packet.  For others, though, we need to check for strict equality or, for
139  * multilevel ports, membership in the range or set.  This part currently does
140  * a tnrh lookup on each packet, but could be optimized to use cached results
141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
142  * we would apply the same rules as TCP.)
143  *
144  * An implication of the above is that fully-bound TCP sockets must always use
145  * distinct 4-tuples; they can't be discriminated by label alone.
146  *
147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
148  * as there's no connection set-up handshake and no shared state.
149  *
150  * Labels on looped-back packets within a single zone do not need to be
151  * checked, as all processes in the same zone have the same label.
152  *
153  * Finally, for unlabeled packets received by a labeled system, special rules
154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
155  * socket in the zone whose label matches the default label of the sender, if
156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
157  * receiver's label must dominate the sender's default label.
158  *
159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161  *					 ip_stack);
162  *
 163  *	Lookup routine to find an exact match for {src, dst, local port,
 164  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
165  *	ports are read from the IP and TCP header respectively.
166  *
167  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
168  *					 zoneid, ip_stack);
169  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170  *					 zoneid, ip_stack);
171  *
172  * 	Lookup routine to find a listener with the tuple {lport, laddr,
173  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174  * 	parameter interface index is also compared.
175  *
176  * void ipcl_walk(func, arg, ip_stack)
177  *
178  * 	Apply 'func' to every connection available. The 'func' is called as
179  *	(*func)(connp, arg). The walk is non-atomic so connections may be
180  *	created and destroyed during the walk. The CONN_CONDEMNED and
181  *	CONN_INCIPIENT flags ensure that connections which are newly created
182  *	or being destroyed are not selected by the walker.
183  *
184  * Table Updates
185  * -------------
186  *
187  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
188  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
189  *
190  *	Insert 'connp' in the ipcl_conn_fanout.
 191  *	Arguments :
192  *		connp		conn_t to be inserted
193  *		protocol	connection protocol
194  *		src		source address
195  *		dst		destination address
196  *		ports		local and remote port
197  *		ifindex		interface index for IPv6 connections
198  *
199  *	Return value :
200  *		0		if connp was inserted
201  *		EADDRINUSE	if the connection with the same tuple
202  *				already exists.
203  *
204  * int ipcl_bind_insert(connp, protocol, src, lport);
205  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
206  *
207  * 	Insert 'connp' in ipcl_bind_fanout.
 208  * 	Arguments :
209  * 		connp		conn_t to be inserted
210  * 		protocol	connection protocol
211  * 		src		source address connection wants
212  * 				to bind to
213  * 		lport		local port connection wants to
214  * 				bind to
215  *
216  *
217  * void ipcl_hash_remove(connp);
218  *
219  * 	Removes the 'connp' from the connection fanout table.
220  *
221  * Connection Creation/Destruction
222  * -------------------------------
223  *
224  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
225  *
226  * 	Creates a new conn based on the type flag, inserts it into
227  * 	globalhash table.
228  *
229  *	type:	This flag determines the type of conn_t which needs to be
230  *		created i.e., which kmem_cache it comes from.
231  *		IPCL_TCPCONN	indicates a TCP connection
232  *		IPCL_SCTPCONN	indicates a SCTP connection
233  *		IPCL_UDPCONN	indicates a UDP conn_t.
234  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
235  *		IPCL_RTSCONN	indicates a RTS conn_t.
236  *		IPCL_IPCCONN	indicates all other connections.
237  *
238  * void ipcl_conn_destroy(connp)
239  *
240  * 	Destroys the connection state, removes it from the global
241  * 	connection hash table and frees its memory.
242  */
243 
244 #include <sys/types.h>
245 #include <sys/stream.h>
246 #include <sys/stropts.h>
247 #include <sys/sysmacros.h>
248 #include <sys/strsubr.h>
249 #include <sys/strsun.h>
250 #define	_SUN_TPI_VERSION 2
251 #include <sys/ddi.h>
252 #include <sys/cmn_err.h>
253 #include <sys/debug.h>
254 
255 #include <sys/systm.h>
256 #include <sys/param.h>
257 #include <sys/kmem.h>
258 #include <sys/isa_defs.h>
259 #include <inet/common.h>
260 #include <netinet/ip6.h>
261 #include <netinet/icmp6.h>
262 
263 #include <inet/ip.h>
264 #include <inet/ip6.h>
265 #include <inet/ip_ndp.h>
266 #include <inet/ip_impl.h>
267 #include <inet/udp_impl.h>
268 #include <inet/sctp_ip.h>
269 #include <inet/sctp/sctp_impl.h>
270 #include <inet/rawip_impl.h>
271 #include <inet/rts_impl.h>
272 #include <inet/iptun/iptun_impl.h>
273 
274 #include <sys/cpuvar.h>
275 
276 #include <inet/ipclassifier.h>
277 #include <inet/tcp.h>
278 #include <inet/ipsec_impl.h>
279 
280 #include <sys/tsol/tnet.h>
281 #include <sys/sockio.h>
282 
283 #ifdef DEBUG
284 #define	IPCL_DEBUG
285 #else
286 #undef	IPCL_DEBUG
287 #endif
288 
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
/*
 * Conditionally printf() 'args' (a fully parenthesized printf argument
 * list) when any 'level' bit is set in ipcl_debug_level.  Wrapped in
 * do { } while (0) so the macro expands to exactly one statement; the
 * previous bare if-block could capture a following "else" and was not a
 * single statement in unbraced contexts.
 */
#define	IPCL_DEBUG_LVL(level, args)			\
	do {						\
		if (ipcl_debug_level & (level)) {	\
			printf args;			\
		}					\
	} while (0)
#else
/* Non-debug build: expands to a single empty statement. */
#define	IPCL_DEBUG_LVL(level, args)	do { } while (0)
#endif
/* Old value for compatibility.  Settable in /etc/system. */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Bytes of physical memory per conn-fanout bucket when auto-sizing. */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound on the auto-computed conn-fanout size. */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 * The trailing 0 acts as an out-of-range sentinel for ipcl_init().
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
327 
/*
 * Wrapper union to ensure that a conn_t and what follows it in the same
 * allocation (tcp_t, udp_t, etc. -- see the cache setup in ipcl_g_init())
 * are aligned on cache lines: itcu_filler pads the union out to
 * CACHE_ALIGN(conn_s) bytes.
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
336 
/*
 * kmem caches for the various conn_t flavors; the non-extern ones are
 * created in ipcl_g_init().  The sctp/tcp_sack/tcp_iphc caches are owned
 * by their respective modules.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
struct kmem_cache  *ip_helper_stream_cache;
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk allocation helpers, defined in the TCP module. */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* kmem cache constructor/destructor callbacks for the caches above. */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);

static int	ip_helper_stream_constructor(void *, void *, int);
static void	ip_helper_stream_destructor(void *, void *);

/* When B_TRUE, ipcl_g_init() creates ip_helper_stream_cache. */
boolean_t	ip_use_helper_cache = B_TRUE;

/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
 */
extern void	(*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
		    uint8_t *, in_port_t, void *);
extern void	(*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
		    uint8_t *, in_port_t, void *);
378 
379 #ifdef	IPCL_DEBUG
380 #define	INET_NTOA_BUFSIZE	18
381 
/*
 * Debug helper: format the IPv4 address 'in' (network byte order) as a
 * dotted-quad string into the caller-supplied buffer 'b' (which must hold
 * at least INET_NTOA_BUFSIZE bytes) and return the buffer.  The longest
 * possible result, "255.255.255.255", needs 16 bytes.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	unsigned char *octets = (unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octets[0], octets[1], octets[2], octets[3]);
	return (b);
}
391 #endif
392 
/*
 * Global (for all stack instances) init routine: create the kmem caches
 * for each conn_t flavor.  Except for the bare ip_conn_cache, each entry
 * is an itc_t (cache-aligned conn_t) followed by the protocol-private
 * state (tcp_t, udp_t, icmp_t, rts_t).
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    NULL, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);

	/* The helper stream cache is optional -- see ip_use_helper_cache. */
	if (ip_use_helper_cache) {
		ip_helper_stream_cache = kmem_cache_create
		    ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
		    CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
		    ip_helper_stream_destructor, NULL, NULL, NULL, 0);
	} else {
		ip_helper_stream_cache = NULL;
	}
}
433 
434 /*
435  * ipclassifier intialization routine, sets up hash tables.
436  */
437 void
438 ipcl_init(ip_stack_t *ipst)
439 {
440 	int i;
441 	int sizes[] = P2Ps();
442 
443 	/*
444 	 * Calculate size of conn fanout table from /etc/system settings
445 	 */
446 	if (ipcl_conn_hash_size != 0) {
447 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
448 	} else if (tcp_conn_hash_size != 0) {
449 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
450 	} else {
451 		extern pgcnt_t freemem;
452 
453 		ipst->ips_ipcl_conn_fanout_size =
454 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
455 
456 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
457 			ipst->ips_ipcl_conn_fanout_size =
458 			    ipcl_conn_hash_maxsize;
459 		}
460 	}
461 
462 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
463 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
464 			break;
465 		}
466 	}
467 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
468 		/* Out of range, use the 2^16 value */
469 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
470 	}
471 
472 	/* Take values from /etc/system */
473 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
474 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
475 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
476 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
477 
478 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
479 
480 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
481 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
482 
483 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
484 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
485 		    MUTEX_DEFAULT, NULL);
486 	}
487 
488 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
489 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
490 
491 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
492 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
493 		    MUTEX_DEFAULT, NULL);
494 	}
495 
496 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
497 	    sizeof (connf_t), KM_SLEEP);
498 	for (i = 0; i < IPPROTO_MAX; i++) {
499 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
500 		    MUTEX_DEFAULT, NULL);
501 	}
502 
503 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
504 	    sizeof (connf_t), KM_SLEEP);
505 	for (i = 0; i < IPPROTO_MAX; i++) {
506 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
507 		    MUTEX_DEFAULT, NULL);
508 	}
509 
510 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
511 	mutex_init(&ipst->ips_rts_clients->connf_lock,
512 	    NULL, MUTEX_DEFAULT, NULL);
513 
514 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
515 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
516 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
517 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
518 		    MUTEX_DEFAULT, NULL);
519 	}
520 
521 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
522 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
523 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
524 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
525 		    MUTEX_DEFAULT, NULL);
526 	}
527 
528 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
529 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
530 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
531 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
532 		    MUTEX_DEFAULT, NULL);
533 	}
534 
535 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
536 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
537 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
538 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
539 		    NULL, MUTEX_DEFAULT, NULL);
540 	}
541 }
542 
/*
 * Global (for all stack instances) teardown: destroy the conn_t kmem
 * caches created by ipcl_g_init().
 *
 * NOTE(review): ip_helper_stream_cache (conditionally created in
 * ipcl_g_init()) is not destroyed here -- confirm it is torn down
 * elsewhere or intentionally persists.
 */
void
ipcl_g_destroy(void)
{
	kmem_cache_destroy(ip_conn_cache);
	kmem_cache_destroy(tcp_conn_cache);
	kmem_cache_destroy(udp_conn_cache);
	kmem_cache_destroy(rawip_conn_cache);
	kmem_cache_destroy(rts_conn_cache);
}
552 
553 /*
554  * All user-level and kernel use of the stack must be gone
555  * by now.
556  */
557 void
558 ipcl_destroy(ip_stack_t *ipst)
559 {
560 	int i;
561 
562 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
563 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
564 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
565 	}
566 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
567 	    sizeof (connf_t));
568 	ipst->ips_ipcl_conn_fanout = NULL;
569 
570 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
571 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
572 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
573 	}
574 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
575 	    sizeof (connf_t));
576 	ipst->ips_ipcl_bind_fanout = NULL;
577 
578 	for (i = 0; i < IPPROTO_MAX; i++) {
579 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
580 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
581 	}
582 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
583 	ipst->ips_ipcl_proto_fanout = NULL;
584 
585 	for (i = 0; i < IPPROTO_MAX; i++) {
586 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
587 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
588 	}
589 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
590 	    IPPROTO_MAX * sizeof (connf_t));
591 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
592 
593 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
594 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
595 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
596 	}
597 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
598 	    sizeof (connf_t));
599 	ipst->ips_ipcl_udp_fanout = NULL;
600 
601 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
602 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
603 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
604 	}
605 	kmem_free(ipst->ips_ipcl_iptun_fanout,
606 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
607 	ipst->ips_ipcl_iptun_fanout = NULL;
608 
609 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
610 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
611 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
612 	}
613 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
614 	    sizeof (connf_t));
615 	ipst->ips_ipcl_raw_fanout = NULL;
616 
617 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
618 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
619 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
620 	}
621 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
622 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
623 	ipst->ips_ipcl_globalhash_fanout = NULL;
624 
625 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
626 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
627 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
628 	ipst->ips_rts_clients = NULL;
629 }
630 
631 /*
632  * conn creation routine. initialize the conn, sets the reference
633  * and inserts it in the global hash table.
634  */
635 conn_t *
636 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
637 {
638 	conn_t	*connp;
639 	sctp_stack_t *sctps;
640 	struct kmem_cache *conn_cache;
641 
642 	switch (type) {
643 	case IPCL_SCTPCONN:
644 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
645 			return (NULL);
646 		sctp_conn_init(connp);
647 		sctps = ns->netstack_sctp;
648 		SCTP_G_Q_REFHOLD(sctps);
649 		netstack_hold(ns);
650 		connp->conn_netstack = ns;
651 		return (connp);
652 
653 	case IPCL_TCPCONN:
654 		conn_cache = tcp_conn_cache;
655 		break;
656 
657 	case IPCL_UDPCONN:
658 		conn_cache = udp_conn_cache;
659 		break;
660 
661 	case IPCL_RAWIPCONN:
662 		conn_cache = rawip_conn_cache;
663 		break;
664 
665 	case IPCL_RTSCONN:
666 		conn_cache = rts_conn_cache;
667 		break;
668 
669 	case IPCL_IPCCONN:
670 		conn_cache = ip_conn_cache;
671 		break;
672 
673 	default:
674 		connp = NULL;
675 		ASSERT(0);
676 	}
677 
678 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
679 		return (NULL);
680 
681 	connp->conn_ref = 1;
682 	netstack_hold(ns);
683 	connp->conn_netstack = ns;
684 	ipcl_globalhash_insert(connp);
685 	return (connp);
686 }
687 
/*
 * Final destruction of a conn_t: called once conn_ref has dropped to
 * zero.  Releases credentials, removes the conn from the global hash,
 * tears down protocol-private state and returns the conn to its kmem
 * cache (SCTP conns are handed to sctp_free() instead).
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_effective_cred != NULL) {
		crfree(connp->conn_effective_cred);
		connp->conn_effective_cred = NULL;
	}

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		/* Release IPsec state and the tcp stack hold, if still set. */
		if (tcps != NULL) {
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		/*
		 * NOTE(review): tcp_timercache is read after tcp_free();
		 * this assumes tcp_free() releases tcp-internal resources
		 * but leaves the tcp_t memory (and its timer mblk) intact.
		 * The mblk is re-attached after the bzero() below so it is
		 * reused for the next allocation from the cache.
		 */
		tcp_free(tcp);
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		/*
		 * A grown header buffer was kmem_alloc'ed and is simply
		 * freed; a default-size one goes back to its cache.
		 */
		if (tcp->tcp_iphc != NULL) {
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		/* Reset the conn/tcp to a pristine state for cache reuse. */
		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	/* Non-TCP: release IPsec state and any cached IPsec option mblk. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}

	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {

		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
825 
826 /*
827  * Running in cluster mode - deregister listener information
828  */
829 
830 static void
831 ipcl_conn_unlisten(conn_t *connp)
832 {
833 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
834 	ASSERT(connp->conn_lport != 0);
835 
836 	if (cl_inet_unlisten != NULL) {
837 		sa_family_t	addr_family;
838 		uint8_t		*laddrp;
839 
840 		if (connp->conn_pkt_isv6) {
841 			addr_family = AF_INET6;
842 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
843 		} else {
844 			addr_family = AF_INET;
845 			laddrp = (uint8_t *)&connp->conn_bound_source;
846 		}
847 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
848 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
849 	}
850 	connp->conn_flags &= ~IPCL_CL_LISTENER;
851 }
852 
/*
 * Unlink 'connp' from whichever fanout bucket it currently occupies (no-op
 * if it is in none), dropping the hash list's reference and notifying the
 * cluster framework if the conn was a registered listener.  We set the
 * IPCL_REMOVED flag (instead of clearing the flag indicating which table
 * the conn belonged to), so for debugging we can see which hash table
 * this connection was in.  Caller must not hold conn_lock: the bucket
 * lock is taken here and must come first in the lock order.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
883 
/*
 * Exported (non-static) wrapper around IPCL_HASH_REMOVE() for callers
 * outside this file.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
889 
890 /*
891  * The whole purpose of this function is allow removal of
892  * a conn_t from the connected hash for timewait reclaim.
893  * This is essentially a TW reclaim fastpath where timewait
894  * collector checks under fanout lock (so no one else can
895  * get access to the conn_t) that refcnt is 2 i.e. one for
896  * TCP and one for the classifier hash list. If ref count
897  * is indeed 2, we can just remove the conn under lock and
898  * avoid cleaning up the conn under squeue. This gives us
899  * improved performance.
900  */
901 void
902 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
903 {
904 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
905 	ASSERT(MUTEX_HELD(&connp->conn_lock));
906 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
907 
908 	if ((connp)->conn_next != NULL) {
909 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
910 	}
911 	if ((connp)->conn_prev != NULL) {
912 		(connp)->conn_prev->conn_next = (connp)->conn_next;
913 	} else {
914 		connfp->connf_head = (connp)->conn_next;
915 	}
916 	(connp)->conn_fanout = NULL;
917 	(connp)->conn_next = NULL;
918 	(connp)->conn_prev = NULL;
919 	(connp)->conn_flags |= IPCL_REMOVED;
920 	ASSERT((connp)->conn_ref == 2);
921 	(connp)->conn_ref--;
922 }
923 
/*
 * Insert (connp) at the head of (connfp)'s bucket list, mark it
 * IPCL_CONNECTED (clearing IPCL_REMOVED) and take a reference on
 * behalf of the hash table.  The caller holds connf_lock, and the
 * conn must not currently be on any fanout (asserted).
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
938 
/*
 * Locking wrapper for the above: first take (connp) out of whatever
 * fanout it may currently be on, then insert it at the head of
 * (connfp) as a fully connected conn under connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
947 
/*
 * Insert (connp), which is bound to a specific local address, into
 * (connfp).  Conns with a specific address are kept ahead of wildcard
 * (_IPCL_V4_MATCH_ANY) entries: the walk finds the first wildcard conn
 * and places (connp) just in front of it (or at the tail if there is
 * none).  IPCL_BOUND is set and the hash table takes a reference.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
976 
/*
 * Insert (connp), which is bound to the unspecified address, into
 * (connfp).  A v4mapped wildcard bind is placed just ahead of the
 * first IPv6-unspecified bind in the same zone, so that a v4 wildcard
 * listener is found before a v6 one; otherwise (connp) goes at the
 * tail.  IPCL_BOUND is set and the hash table takes a reference.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
1008 
/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD() for callers
 * outside this file: insert connp into connfp as a wildcard bind.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
1014 
1015 void
1016 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
1017 {
1018 	connf_t	*connfp;
1019 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1020 
1021 	ASSERT(connp != NULL);
1022 	ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
1023 	    protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
1024 
1025 	connp->conn_ulp = protocol;
1026 
1027 	/* Insert it in the protocol hash */
1028 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1029 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1030 }
1031 
1032 void
1033 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
1034 {
1035 	connf_t	*connfp;
1036 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1037 
1038 	ASSERT(connp != NULL);
1039 	ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
1040 	    protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
1041 
1042 	connp->conn_ulp = protocol;
1043 
1044 	/* Insert it in the Bind Hash */
1045 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1046 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1047 }
1048 
1049 /*
1050  * Because the classifier is used to classify inbound packets, the destination
1051  * address is meant to be our local tunnel address (tunnel source), and the
1052  * source the remote tunnel address (tunnel destination).
1053  */
1054 conn_t *
1055 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
1056 {
1057 	connf_t	*connfp;
1058 	conn_t	*connp;
1059 
1060 	/* first look for IPv4 tunnel links */
1061 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
1062 	mutex_enter(&connfp->connf_lock);
1063 	for (connp = connfp->connf_head; connp != NULL;
1064 	    connp = connp->conn_next) {
1065 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
1066 			break;
1067 	}
1068 	if (connp != NULL)
1069 		goto done;
1070 
1071 	mutex_exit(&connfp->connf_lock);
1072 
1073 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
1074 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
1075 	    INADDR_ANY)];
1076 	mutex_enter(&connfp->connf_lock);
1077 	for (connp = connfp->connf_head; connp != NULL;
1078 	    connp = connp->conn_next) {
1079 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
1080 			break;
1081 	}
1082 done:
1083 	if (connp != NULL)
1084 		CONN_INC_REF(connp);
1085 	mutex_exit(&connfp->connf_lock);
1086 	return (connp);
1087 }
1088 
1089 conn_t *
1090 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1091 {
1092 	connf_t	*connfp;
1093 	conn_t	*connp;
1094 
1095 	/* Look for an IPv6 tunnel link */
1096 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1097 	mutex_enter(&connfp->connf_lock);
1098 	for (connp = connfp->connf_head; connp != NULL;
1099 	    connp = connp->conn_next) {
1100 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1101 			CONN_INC_REF(connp);
1102 			break;
1103 		}
1104 	}
1105 	mutex_exit(&connfp->connf_lock);
1106 	return (connp);
1107 }
1108 
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 *
 * Returns EADDRNOTAVAIL if another raw socket is already bound to the
 * same port in the same zone and address family with an overlapping
 * local address; otherwise the conn is inserted into the raw fanout
 * and 0 is returned.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Addresses overlap when either side's local address is
		 * unspecified (in v6 or v4mapped form) or both are bound
		 * to the same address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Bucket position depends on how much of the tuple is known:
	 * fully connected, bound to a local address, or pure wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1158 
1159 static int
1160 ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst,
1161     ip_stack_t *ipst)
1162 {
1163 	connf_t	*connfp;
1164 	conn_t	*tconnp;
1165 
1166 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)];
1167 	mutex_enter(&connfp->connf_lock);
1168 	for (tconnp = connfp->connf_head; tconnp != NULL;
1169 	    tconnp = tconnp->conn_next) {
1170 		if (IPCL_IPTUN_MATCH(tconnp, src, dst)) {
1171 			/* A tunnel is already bound to these addresses. */
1172 			mutex_exit(&connfp->connf_lock);
1173 			return (EADDRINUSE);
1174 		}
1175 	}
1176 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1177 	mutex_exit(&connfp->connf_lock);
1178 	return (0);
1179 }
1180 
1181 static int
1182 ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src,
1183     const in6_addr_t *dst, ip_stack_t *ipst)
1184 {
1185 	connf_t	*connfp;
1186 	conn_t	*tconnp;
1187 
1188 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)];
1189 	mutex_enter(&connfp->connf_lock);
1190 	for (tconnp = connfp->connf_head; tconnp != NULL;
1191 	    tconnp = tconnp->conn_next) {
1192 		if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) {
1193 			/* A tunnel is already bound to these addresses. */
1194 			mutex_exit(&connfp->connf_lock);
1195 			return (EADDRINUSE);
1196 		}
1197 	}
1198 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1199 	mutex_exit(&connfp->connf_lock);
1200 	return (0);
1201 }
1202 
1203 /*
1204  * Check for a MAC exemption conflict on a labeled system.  Note that for
1205  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1206  * transport layer.  This check is for binding all other protocols.
1207  *
1208  * Returns true if there's a conflict.
1209  */
1210 static boolean_t
1211 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1212 {
1213 	connf_t	*connfp;
1214 	conn_t *tconn;
1215 
1216 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1217 	mutex_enter(&connfp->connf_lock);
1218 	for (tconn = connfp->connf_head; tconn != NULL;
1219 	    tconn = tconn->conn_next) {
1220 		/* We don't allow v4 fallback for v6 raw socket */
1221 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1222 			continue;
1223 		/* If neither is exempt, then there's no conflict */
1224 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1225 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1226 			continue;
1227 		/* We are only concerned about sockets for a different zone */
1228 		if (connp->conn_zoneid == tconn->conn_zoneid)
1229 			continue;
1230 		/* If both are bound to different specific addrs, ok */
1231 		if (connp->conn_src != INADDR_ANY &&
1232 		    tconn->conn_src != INADDR_ANY &&
1233 		    connp->conn_src != tconn->conn_src)
1234 			continue;
1235 		/* These two conflict; fail */
1236 		break;
1237 	}
1238 	mutex_exit(&connfp->connf_lock);
1239 	return (tconn != NULL);
1240 }
1241 
1242 static boolean_t
1243 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1244 {
1245 	connf_t	*connfp;
1246 	conn_t *tconn;
1247 
1248 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1249 	mutex_enter(&connfp->connf_lock);
1250 	for (tconn = connfp->connf_head; tconn != NULL;
1251 	    tconn = tconn->conn_next) {
1252 		/* We don't allow v4 fallback for v6 raw socket */
1253 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1254 			continue;
1255 		/* If neither is exempt, then there's no conflict */
1256 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1257 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1258 			continue;
1259 		/* We are only concerned about sockets for a different zone */
1260 		if (connp->conn_zoneid == tconn->conn_zoneid)
1261 			continue;
1262 		/* If both are bound to different addrs, ok */
1263 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1264 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1265 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1266 			continue;
1267 		/* These two conflict; fail */
1268 		break;
1269 	}
1270 	mutex_exit(&connfp->connf_lock);
1271 	return (tconn != NULL);
1272 }
1273 
1274 /*
1275  * (v4, v6) bind hash insertion routines
1276  */
1277 int
1278 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
1279 {
1280 	connf_t	*connfp;
1281 #ifdef	IPCL_DEBUG
1282 	char	buf[INET_NTOA_BUFSIZE];
1283 #endif
1284 	int	ret = 0;
1285 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1286 
1287 	ASSERT(connp);
1288 
1289 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
1290 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
1291 
1292 	connp->conn_ulp = protocol;
1293 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
1294 	connp->conn_lport = lport;
1295 
1296 	if (IPCL_IS_IPTUN(connp))
1297 		return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst));
1298 
1299 	switch (protocol) {
1300 	default:
1301 		if (is_system_labeled() &&
1302 		    check_exempt_conflict_v4(connp, ipst))
1303 			return (EADDRINUSE);
1304 		/* FALLTHROUGH */
1305 	case IPPROTO_UDP:
1306 		if (protocol == IPPROTO_UDP) {
1307 			IPCL_DEBUG_LVL(64,
1308 			    ("ipcl_bind_insert: connp %p - udp\n",
1309 			    (void *)connp));
1310 			connfp = &ipst->ips_ipcl_udp_fanout[
1311 			    IPCL_UDP_HASH(lport, ipst)];
1312 		} else {
1313 			IPCL_DEBUG_LVL(64,
1314 			    ("ipcl_bind_insert: connp %p - protocol\n",
1315 			    (void *)connp));
1316 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
1317 		}
1318 
1319 		if (connp->conn_rem != INADDR_ANY) {
1320 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1321 		} else if (connp->conn_src != INADDR_ANY) {
1322 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1323 		} else {
1324 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1325 		}
1326 		break;
1327 
1328 	case IPPROTO_TCP:
1329 
1330 		/* Insert it in the Bind Hash */
1331 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1332 		connfp = &ipst->ips_ipcl_bind_fanout[
1333 		    IPCL_BIND_HASH(lport, ipst)];
1334 		if (connp->conn_src != INADDR_ANY) {
1335 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1336 		} else {
1337 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1338 		}
1339 		if (cl_inet_listen != NULL) {
1340 			ASSERT(!connp->conn_pkt_isv6);
1341 			connp->conn_flags |= IPCL_CL_LISTENER;
1342 			(*cl_inet_listen)(
1343 			    connp->conn_netstack->netstack_stackid,
1344 			    IPPROTO_TCP, AF_INET,
1345 			    (uint8_t *)&connp->conn_bound_source, lport, NULL);
1346 		}
1347 		break;
1348 
1349 	case IPPROTO_SCTP:
1350 		ret = ipcl_sctp_hash_insert(connp, lport);
1351 		break;
1352 	}
1353 
1354 	return (ret);
1355 }
1356 
1357 int
1358 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1359     uint16_t lport)
1360 {
1361 	connf_t		*connfp;
1362 	int		ret = 0;
1363 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1364 
1365 	ASSERT(connp != NULL);	connp->conn_ulp = protocol;
1366 	connp->conn_srcv6 = *src;
1367 	connp->conn_lport = lport;
1368 
1369 	if (IPCL_IS_IPTUN(connp)) {
1370 		return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros,
1371 		    ipst));
1372 	}
1373 
1374 	switch (protocol) {
1375 	default:
1376 		if (is_system_labeled() &&
1377 		    check_exempt_conflict_v6(connp, ipst))
1378 			return (EADDRINUSE);
1379 		/* FALLTHROUGH */
1380 	case IPPROTO_UDP:
1381 		if (protocol == IPPROTO_UDP) {
1382 			IPCL_DEBUG_LVL(128,
1383 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1384 			    (void *)connp));
1385 			connfp = &ipst->ips_ipcl_udp_fanout[
1386 			    IPCL_UDP_HASH(lport, ipst)];
1387 		} else {
1388 			IPCL_DEBUG_LVL(128,
1389 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1390 			    (void *)connp));
1391 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1392 		}
1393 
1394 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1395 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1396 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1397 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1398 		} else {
1399 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1400 		}
1401 		break;
1402 
1403 	case IPPROTO_TCP:
1404 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1405 
1406 		/* Insert it in the Bind Hash */
1407 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1408 		connfp = &ipst->ips_ipcl_bind_fanout[
1409 		    IPCL_BIND_HASH(lport, ipst)];
1410 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1411 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1412 		} else {
1413 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1414 		}
1415 		if (cl_inet_listen != NULL) {
1416 			sa_family_t	addr_family;
1417 			uint8_t		*laddrp;
1418 
1419 			if (connp->conn_pkt_isv6) {
1420 				addr_family = AF_INET6;
1421 				laddrp =
1422 				    (uint8_t *)&connp->conn_bound_source_v6;
1423 			} else {
1424 				addr_family = AF_INET;
1425 				laddrp = (uint8_t *)&connp->conn_bound_source;
1426 			}
1427 			connp->conn_flags |= IPCL_CL_LISTENER;
1428 			(*cl_inet_listen)(
1429 			    connp->conn_netstack->netstack_stackid,
1430 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1431 		}
1432 		break;
1433 
1434 	case IPPROTO_SCTP:
1435 		ret = ipcl_sctp_hash_insert(connp, lport);
1436 		break;
1437 	}
1438 
1439 	return (ret);
1440 }
1441 
/*
 * ipcl_conn_hash insertion routines.
 *
 * Record the full tuple (protocol, src, rem, ports) in connp and
 * insert it into the fanout appropriate for `protocol'.  For TCP, the
 * conn fanout is first searched for an identical tuple in the same
 * zone, and EADDRINUSE is returned if one exists.  Returns 0 or an
 * errno.
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	/* IP tunnels are kept in their own fanout. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, src, rem, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for a eager connection, i.e connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * IPCL_HASH_REMOVE asserts conn_lock is not held
			 * and takes the old fanout's lock itself, so the
			 * new fanout's lock must be dropped around it.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/* The local port is the low 16 bits of the ports word. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* up[1] is used as the local port half of `ports'. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/* Bucket position depends on how much of the tuple is set. */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1564 
/*
 * IPv6 counterpart of ipcl_conn_insert().  For TCP, an existing conn
 * with the same tuple is only a conflict when its tcp_bound_if is 0 or
 * equals `ifindex', so otherwise-identical connections bound to
 * different interfaces may coexist.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* IP tunnels are kept in their own fanout. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* Interface-bound conns only clash on the same if. */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		/* The local port is the low 16 bits of the ports word. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Raw sockets: check MAC exemption conflicts when labeled. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* up[1] is used as the local port half of `ports'. */
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Bucket position depends on how much of the tuple is set. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1658 
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied.  Labels are also checked as described
 * above.  If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 *
 * For TCP a fully-bound conn in the conn fanout is preferred; failing
 * that, a listener is looked up in the bind fanout.  UDP is matched in
 * the udp fanout on both ports and both addresses.  IPPROTO_ENCAP and
 * IPPROTO_IPV6 packets are handed to the tunnel classifier.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* The port pair sits at the same offset for both TCP and UDP. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No fully-bound conn; fall back to a listener lookup. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && shared_addr &&
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && shared_addr &&
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1870 
/*
 * IPv6 packet classifier.
 *
 * Maps an inbound IPv6 packet in 'mp' (upper-layer header at 'hdr_len'
 * bytes in) to the conn_t that should receive it.  On success the conn_t
 * is returned with a reference held (caller must CONN_DEC_REF); NULL is
 * returned if nothing matches or 'protocol' is not handled here.
 *
 * TCP: the fully-bound connection fanout is searched first, then the
 * bind (listener) fanout.  UDP: only the UDP fanout is searched.
 * IPPROTO_ENCAP/IPPROTO_IPV6 are handed to the IP tunnel classifier.
 * On labeled (Trusted Extensions) systems, shared-address MLP lookups
 * may redirect 'zoneid' and tsol_receive_local() vets the final match.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Load the {lport, fport} pair as one 32-bit hash key. */
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		/*
		 * First pass: look for a fully-bound (connected) TCP
		 * conn in the connection fanout.
		 */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		/* Second pass: look for a listener in the bind fanout. */
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && shared_addr &&
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
				break;
		}

		/*
		 * On labeled systems, verify the candidate conn is
		 * actually permitted to receive this packet.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* UDP header: fport (remote) is up[0], lport is up[1]. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && shared_addr &&
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunnels are classified by outer src/dst addresses only. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
2074 
2075 /*
2076  * wrapper around ipcl_classify_(v4,v6) routines.
2077  */
2078 conn_t *
2079 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
2080 {
2081 	uint16_t	hdr_len;
2082 	ipha_t		*ipha;
2083 	uint8_t		*nexthdrp;
2084 
2085 	if (MBLKL(mp) < sizeof (ipha_t))
2086 		return (NULL);
2087 
2088 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
2089 	case IPV4_VERSION:
2090 		ipha = (ipha_t *)mp->b_rptr;
2091 		hdr_len = IPH_HDR_LENGTH(ipha);
2092 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
2093 		    zoneid, ipst));
2094 	case IPV6_VERSION:
2095 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
2096 		    &hdr_len, &nexthdrp))
2097 			return (NULL);
2098 
2099 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
2100 	}
2101 
2102 	return (NULL);
2103 }
2104 
/*
 * Classify a packet for a raw (non-TCP/UDP fanout) protocol such as SCTP.
 * 'ports' carries the {fport, lport} pair and 'hdr' points at the IPv4 or
 * IPv6 header; the IP version is read from the header itself.  Two passes
 * are made over the raw fanout: first the bucket hashed by lport (exact
 * 4-tuple match for fully-bound conns, laddr/lport for bound conns), then
 * the wildcard (port 0) bucket matching on protocol and laddr only.
 * Returns the conn_t with a reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = msg_getcred(mp, NULL);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully-bound conns must match the whole 4-tuple. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Bound-only conns match on local address/port. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled &&
		    (connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    shared_addr))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Reached with connfp->connf_lock still held. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
2241 
2242 /* ARGSUSED */
2243 static int
2244 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2245 {
2246 	itc_t	*itc = (itc_t *)buf;
2247 	conn_t 	*connp = &itc->itc_conn;
2248 	tcp_t	*tcp = (tcp_t *)&itc[1];
2249 
2250 	bzero(connp, sizeof (conn_t));
2251 	bzero(tcp, sizeof (tcp_t));
2252 
2253 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2254 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2255 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
2256 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2257 	connp->conn_tcp = tcp;
2258 	connp->conn_flags = IPCL_TCPCONN;
2259 	connp->conn_ulp = IPPROTO_TCP;
2260 	tcp->tcp_connp = connp;
2261 	return (0);
2262 }
2263 
2264 /* ARGSUSED */
2265 static void
2266 tcp_conn_destructor(void *buf, void *cdrarg)
2267 {
2268 	itc_t	*itc = (itc_t *)buf;
2269 	conn_t 	*connp = &itc->itc_conn;
2270 	tcp_t	*tcp = (tcp_t *)&itc[1];
2271 
2272 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2273 	ASSERT(tcp->tcp_connp == connp);
2274 	ASSERT(connp->conn_tcp == tcp);
2275 	tcp_timermp_free(tcp);
2276 	mutex_destroy(&connp->conn_lock);
2277 	cv_destroy(&connp->conn_cv);
2278 	cv_destroy(&connp->conn_sq_cv);
2279 }
2280 
2281 /* ARGSUSED */
2282 static int
2283 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2284 {
2285 	itc_t	*itc = (itc_t *)buf;
2286 	conn_t 	*connp = &itc->itc_conn;
2287 
2288 	bzero(connp, sizeof (conn_t));
2289 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2290 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2291 	connp->conn_flags = IPCL_IPCCONN;
2292 
2293 	return (0);
2294 }
2295 
2296 /* ARGSUSED */
2297 static void
2298 ip_conn_destructor(void *buf, void *cdrarg)
2299 {
2300 	itc_t	*itc = (itc_t *)buf;
2301 	conn_t 	*connp = &itc->itc_conn;
2302 
2303 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2304 	ASSERT(connp->conn_priv == NULL);
2305 	mutex_destroy(&connp->conn_lock);
2306 	cv_destroy(&connp->conn_cv);
2307 }
2308 
2309 /* ARGSUSED */
2310 static int
2311 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2312 {
2313 	itc_t	*itc = (itc_t *)buf;
2314 	conn_t 	*connp = &itc->itc_conn;
2315 	udp_t	*udp = (udp_t *)&itc[1];
2316 
2317 	bzero(connp, sizeof (conn_t));
2318 	bzero(udp, sizeof (udp_t));
2319 
2320 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2321 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2322 	connp->conn_udp = udp;
2323 	connp->conn_flags = IPCL_UDPCONN;
2324 	connp->conn_ulp = IPPROTO_UDP;
2325 	udp->udp_connp = connp;
2326 	return (0);
2327 }
2328 
2329 /* ARGSUSED */
2330 static void
2331 udp_conn_destructor(void *buf, void *cdrarg)
2332 {
2333 	itc_t	*itc = (itc_t *)buf;
2334 	conn_t 	*connp = &itc->itc_conn;
2335 	udp_t	*udp = (udp_t *)&itc[1];
2336 
2337 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2338 	ASSERT(udp->udp_connp == connp);
2339 	ASSERT(connp->conn_udp == udp);
2340 	mutex_destroy(&connp->conn_lock);
2341 	cv_destroy(&connp->conn_cv);
2342 }
2343 
2344 /* ARGSUSED */
2345 static int
2346 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2347 {
2348 	itc_t	*itc = (itc_t *)buf;
2349 	conn_t 	*connp = &itc->itc_conn;
2350 	icmp_t	*icmp = (icmp_t *)&itc[1];
2351 
2352 	bzero(connp, sizeof (conn_t));
2353 	bzero(icmp, sizeof (icmp_t));
2354 
2355 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2356 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2357 	connp->conn_icmp = icmp;
2358 	connp->conn_flags = IPCL_RAWIPCONN;
2359 	connp->conn_ulp = IPPROTO_ICMP;
2360 	icmp->icmp_connp = connp;
2361 	return (0);
2362 }
2363 
2364 /* ARGSUSED */
2365 static void
2366 rawip_conn_destructor(void *buf, void *cdrarg)
2367 {
2368 	itc_t	*itc = (itc_t *)buf;
2369 	conn_t 	*connp = &itc->itc_conn;
2370 	icmp_t	*icmp = (icmp_t *)&itc[1];
2371 
2372 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2373 	ASSERT(icmp->icmp_connp == connp);
2374 	ASSERT(connp->conn_icmp == icmp);
2375 	mutex_destroy(&connp->conn_lock);
2376 	cv_destroy(&connp->conn_cv);
2377 }
2378 
2379 /* ARGSUSED */
2380 static int
2381 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2382 {
2383 	itc_t	*itc = (itc_t *)buf;
2384 	conn_t 	*connp = &itc->itc_conn;
2385 	rts_t	*rts = (rts_t *)&itc[1];
2386 
2387 	bzero(connp, sizeof (conn_t));
2388 	bzero(rts, sizeof (rts_t));
2389 
2390 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2391 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2392 	connp->conn_rts = rts;
2393 	connp->conn_flags = IPCL_RTSCONN;
2394 	rts->rts_connp = connp;
2395 	return (0);
2396 }
2397 
2398 /* ARGSUSED */
2399 static void
2400 rts_conn_destructor(void *buf, void *cdrarg)
2401 {
2402 	itc_t	*itc = (itc_t *)buf;
2403 	conn_t 	*connp = &itc->itc_conn;
2404 	rts_t	*rts = (rts_t *)&itc[1];
2405 
2406 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2407 	ASSERT(rts->rts_connp == connp);
2408 	ASSERT(connp->conn_rts == rts);
2409 	mutex_destroy(&connp->conn_lock);
2410 	cv_destroy(&connp->conn_cv);
2411 }
2412 
/* ARGSUSED */
/*
 * kmem cache constructor for IP helper stream entries: opens the IP
 * helper stream device via LDI and issues SIOCSQPTR so the stream's
 * queue private pointers refer back to this buffer.  Both the open and
 * the ioctl are retried on EINTR.  Returns 0 on success or the LDI
 * error code; the handle is closed again if the ioctl fails.
 */
int
ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
{
	int error;
	netstack_t	*ns;
	int		ret;
	tcp_stack_t	*tcps;
	ip_helper_stream_info_t	*ip_helper_str;
	ip_stack_t	*ipst;

	/* Use kcred's netstack for the LDI ident; hold it while open. */
	ns = netstack_find_by_cred(kcred);
	ASSERT(ns != NULL);
	tcps = ns->netstack_tcp;
	ipst = ns->netstack_ip;
	ASSERT(tcps != NULL);
	ip_helper_str = (ip_helper_stream_info_t *)buf;

	do {
		error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
		    &ip_helper_str->iphs_handle, ipst->ips_ldi_ident);
	} while (error == EINTR);

	if (error == 0) {
		/* Point the helper stream's q_ptr at this cache entry. */
		do {
			error = ldi_ioctl(
			    ip_helper_str->iphs_handle, SIOCSQPTR,
			    (intptr_t)buf, FKIOCTL, kcred, &ret);
		} while (error == EINTR);

		if (error != 0) {
			(void) ldi_close(
			    ip_helper_str->iphs_handle, 0, kcred);
		}
	}

	/* ns == ipst->ips_netstack; drop the find_by_cred hold. */
	netstack_rele(ipst->ips_netstack);

	return (error);
}
2453 
2454 /* ARGSUSED */
2455 static void
2456 ip_helper_stream_destructor(void *buf, void *cdrarg)
2457 {
2458 	ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;
2459 
2460 	ip_helper_str->iphs_rq->q_ptr =
2461 	    ip_helper_str->iphs_wq->q_ptr =
2462 	    ip_helper_str->iphs_minfo;
2463 	(void) ldi_close(ip_helper_str->iphs_handle, 0, kcred);
2464 }
2465 
2466 
2467 /*
2468  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2469  * in the conn_t.
2470  */
void
ipcl_conn_cleanup(conn_t *connp)
{
	/* All cached and linked state must already have been released. */
	ASSERT(connp->conn_ire_cache == NULL);
	ASSERT(connp->conn_latch == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
#ifdef notdef
	/*
	 * The ill and ipif pointers are not cleared before the conn_t
	 * goes away since they do not hold a reference on the ill/ipif.
	 * We should replace these pointers with ifindex/ipaddr_t to
	 * make the code less complex.
	 */
	ASSERT(connp->conn_outgoing_ill == NULL);
	ASSERT(connp->conn_incoming_ill == NULL);
	ASSERT(connp->conn_multicast_ipif == NULL);
	ASSERT(connp->conn_multicast_ill == NULL);
#endif
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
	ASSERT(connp->conn_effective_cred == NULL);
	ASSERT(connp->conn_netstack == NULL);

	ASSERT(connp->conn_helper_info == NULL);
	/*
	 * Clear out the conn_t fields that are not preserved: everything
	 * from conn_start_clr to the end of the structure is zeroed,
	 * while the fields before conn_start_clr survive cache reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2518 
2519 /*
2520  * All conns are inserted in a global multi-list for the benefit of
2521  * walkers. The walk is guaranteed to walk all open conns at the time
2522  * of the start of the walk exactly once. This property is needed to
2523  * achieve some cleanups during unplumb of interfaces. This is achieved
2524  * as follows.
2525  *
2526  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2527  * call the insert and delete functions below at creation and deletion
2528  * time respectively. The conn never moves or changes its position in this
2529  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2530  * won't increase due to walkers, once the conn deletion has started. Note
2531  * that we can't remove the conn from the global list and then wait for
2532  * the refcnt to drop to zero, since walkers would then see a truncated
2533  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2534  * conns until ip_open is ready to make them globally visible.
2535  * The global round robin multi-list locks are held only to get the
2536  * next member/insertion/deletion and contention should be negligible
2537  * if the multi-list is much greater than the number of cpus.
2538  */
/*
 * Insert the conn at the head of one of the global multi-list buckets
 * (round-robin across CONN_G_HASH_SIZE lists).  The conn is flagged
 * CONN_INCIPIENT before insertion so walkers skip it until ip_open
 * makes it globally visible.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2573 
/*
 * Unlink the conn from its global multi-list bucket, if it was ever
 * inserted, and null out the linkage to catch stale use.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Standard doubly-linked list unlink, head-aware. */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2602 
2603 /*
2604  * Walk the list of all conn_t's in the system, calling the function provided
2605  * with the specified argument for each.
2606  * Applies to both IPv4 and IPv6.
2607  *
2608  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2609  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2610  * unplumbed or removed. New conn_t's that are created while we are walking
2611  * may be missed by this walk, because they are not necessarily inserted
2612  * at the tail of the list. They are new conn_t's and thus don't have any
2613  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2614  * is created to the struct that is going away.
2615  */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns going away or not yet visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so the conn (and hence its
			 * conn_g_next linkage) stays valid while the
			 * bucket lock is dropped to call 'func'.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/* Release the previously visited conn's hold. */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the hold on the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2652 
2653 /*
2654  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2655  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2656  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2657  * (peer tcp in ESTABLISHED state).
2658  */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Swap local/foreign ports to build the peer's port pair. */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer can never be the conn we started from. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2702 
2703 /*
2704  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2705  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2706  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2707  * (peer tcp in ESTABLISHED state).
2708  */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.  We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Swap local/foreign ports to build the peer's port pair. */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip tcp_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer can never be the conn we started from. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2756 
2757 /*
2758  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2759  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2760  * Only checks for connected entries i.e. no INADDR_ANY checks.
2761  */
2762 conn_t *
2763 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2764     ip_stack_t *ipst)
2765 {
2766 	uint32_t ports;
2767 	uint16_t *pports;
2768 	connf_t	*connfp;
2769 	conn_t	*tconnp;
2770 
2771 	pports = (uint16_t *)&ports;
2772 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2773 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2774 
2775 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2776 	    ports, ipst)];
2777 
2778 	mutex_enter(&connfp->connf_lock);
2779 	for (tconnp = connfp->connf_head; tconnp != NULL;
2780 	    tconnp = tconnp->conn_next) {
2781 
2782 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2783 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2784 		    tconnp->conn_tcp->tcp_state >= min_state) {
2785 
2786 			CONN_INC_REF(tconnp);
2787 			mutex_exit(&connfp->connf_lock);
2788 			return (tconnp);
2789 		}
2790 	}
2791 	mutex_exit(&connfp->connf_lock);
2792 	return (NULL);
2793 }
2794 
2795 /*
2796  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2797  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2798  * Only checks for connected entries i.e. no INADDR_ANY checks.
2799  * Match on ifindex in addition to addresses.
2800  */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Swap local/foreign ports to build the peer's port pair. */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* tcp_bound_if == 0 means not bound to any interface. */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tcp->tcp_bound_if == 0 ||
		    tcp->tcp_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2837 
2838 /*
2839  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2840  * a listener when changing state.
2841  */
2842 conn_t *
2843 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2844     ip_stack_t *ipst)
2845 {
2846 	connf_t		*bind_connfp;
2847 	conn_t		*connp;
2848 	tcp_t		*tcp;
2849 
2850 	/*
2851 	 * Avoid false matches for packets sent to an IP destination of
2852 	 * all zeros.
2853 	 */
2854 	if (laddr == 0)
2855 		return (NULL);
2856 
2857 	ASSERT(zoneid != ALL_ZONES);
2858 
2859 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2860 	mutex_enter(&bind_connfp->connf_lock);
2861 	for (connp = bind_connfp->connf_head; connp != NULL;
2862 	    connp = connp->conn_next) {
2863 		tcp = connp->conn_tcp;
2864 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2865 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2866 		    (tcp->tcp_listener == NULL)) {
2867 			CONN_INC_REF(connp);
2868 			mutex_exit(&bind_connfp->connf_lock);
2869 			return (connp);
2870 		}
2871 	}
2872 	mutex_exit(&bind_connfp->connf_lock);
2873 	return (NULL);
2874 }
2875 
2876 /*
2877  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2878  * a listener when changing state.
2879  */
2880 conn_t *
2881 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2882     zoneid_t zoneid, ip_stack_t *ipst)
2883 {
2884 	connf_t		*bind_connfp;
2885 	conn_t		*connp = NULL;
2886 	tcp_t		*tcp;
2887 
2888 	/*
2889 	 * Avoid false matches for packets sent to an IP destination of
2890 	 * all zeros.
2891 	 */
2892 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2893 		return (NULL);
2894 
2895 	ASSERT(zoneid != ALL_ZONES);
2896 
2897 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2898 	mutex_enter(&bind_connfp->connf_lock);
2899 	for (connp = bind_connfp->connf_head; connp != NULL;
2900 	    connp = connp->conn_next) {
2901 		tcp = connp->conn_tcp;
2902 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2903 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2904 		    (tcp->tcp_bound_if == 0 ||
2905 		    tcp->tcp_bound_if == ifindex) &&
2906 		    tcp->tcp_listener == NULL) {
2907 			CONN_INC_REF(connp);
2908 			mutex_exit(&bind_connfp->connf_lock);
2909 			return (connp);
2910 		}
2911 	}
2912 	mutex_exit(&bind_connfp->connf_lock);
2913 	return (NULL);
2914 }
2915 
2916 /*
2917  * ipcl_get_next_conn
2918  *	get the next entry in the conn global list
2919  *	and put a reference on the next_conn.
2920  *	decrement the reference on the current conn.
2921  *
2922  * This is an iterator based walker function that also provides for
2923  * some selection by the caller. It walks through the conn_hash bucket
2924  * searching for the next valid connp in the list, and selects connections
2925  * that are neither closed nor condemned. It also REFHOLDS the conn
2926  * thus ensuring that the conn exists when the caller uses the conn.
2927  */
2928 conn_t *
2929 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2930 {
2931 	conn_t	*next_connp;
2932 
2933 	if (connfp == NULL)
2934 		return (NULL);
2935 
2936 	mutex_enter(&connfp->connf_lock);
2937 
2938 	next_connp = (connp == NULL) ?
2939 	    connfp->connf_head : connp->conn_g_next;
2940 
2941 	while (next_connp != NULL) {
2942 		mutex_enter(&next_connp->conn_lock);
2943 		if (!(next_connp->conn_flags & conn_flags) ||
2944 		    (next_connp->conn_state_flags &
2945 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2946 			/*
2947 			 * This conn has been condemned or
2948 			 * is closing, or the flags don't match
2949 			 */
2950 			mutex_exit(&next_connp->conn_lock);
2951 			next_connp = next_connp->conn_g_next;
2952 			continue;
2953 		}
2954 		CONN_INC_REF_LOCKED(next_connp);
2955 		mutex_exit(&next_connp->conn_lock);
2956 		break;
2957 	}
2958 
2959 	mutex_exit(&connfp->connf_lock);
2960 
2961 	if (connp != NULL)
2962 		CONN_DEC_REF(connp);
2963 
2964 	return (next_connp);
2965 }
2966 
2967 #ifdef CONN_DEBUG
2968 /*
2969  * Trace of the last NBUF refhold/refrele
2970  */
2971 int
2972 conn_trace_ref(conn_t *connp)
2973 {
2974 	int	last;
2975 	conn_trace_t	*ctb;
2976 
2977 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2978 	last = connp->conn_trace_last;
2979 	last++;
2980 	if (last == CONN_TRACE_MAX)
2981 		last = 0;
2982 
2983 	ctb = &connp->conn_trace_buf[last];
2984 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2985 	connp->conn_trace_last = last;
2986 	return (1);
2987 }
2988 
2989 int
2990 conn_untrace_ref(conn_t *connp)
2991 {
2992 	int	last;
2993 	conn_trace_t	*ctb;
2994 
2995 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2996 	last = connp->conn_trace_last;
2997 	last++;
2998 	if (last == CONN_TRACE_MAX)
2999 		last = 0;
3000 
3001 	ctb = &connp->conn_trace_buf[last];
3002 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
3003 	connp->conn_trace_last = last;
3004 	return (1);
3005 }
3006 #endif
3007