xref: /titanic_51/usr/src/uts/common/inet/ip/ipclassifier.c (revision 749f21d359d8fbd020c974a1a5227316221bfc9c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 const char ipclassifier_version[] = "@(#)ipclassifier.c	1.6	04/03/31 SMI";
30 
31 /*
32  * IP PACKET CLASSIFIER
33  *
34  * The IP packet classifier provides mapping between IP packets and persistent
35  * connection state for connection-oriented protocols. It also provides
36  * interface for managing connection states.
37  *
38  * The connection state is kept in conn_t data structure and contains, among
39  * other things:
40  *
41  *	o local/remote address and ports
42  *	o Transport protocol
43  *	o squeue for the connection (for TCP only)
44  *	o reference counter
45  *	o Connection state
46  *	o hash table linkage
47  *	o interface/ire information
48  *	o credentials
49  *	o ipsec policy
50  *	o send and receive functions.
51  *	o mutex lock.
52  *
53  * Connections use a reference counting scheme. They are freed when the
54  * reference counter drops to zero. A reference is incremented when connection
55  * is placed in a list or table, when incoming packet for the connection arrives
56  * and when connection is processed via squeue (squeue processing may be
57  * asynchronous and the reference protects the connection from being destroyed
58  * before its processing is finished).
59  *
60  * send and receive functions are currently used for TCP only. The send function
61  * determines the IP entry point for the packet once it leaves TCP to be sent to
62  * the destination address. The receive function is used by IP when the packet
63  * should be passed for TCP processing. When a new connection is created these
64  * are set to ip_output() and tcp_input() respectively. During the lifetime of
65  * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
68  * tcp_conn_request().  This allows incoming SYNs to go directly into the
69  * listener SYN processing function without going to tcp_input() first.
70  *
71  * Classifier uses several hash tables:
72  *
73  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
74  *	ipcl_bind_fanout:	contains all connections in BOUND state
75  *	ipcl_proto_fanout:	IPv4 protocol fanout
76  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
77  *	ipcl_udp_fanout:	contains all UDP connections
78  *	ipcl_globalhash_fanout:	contains all connections
79  *
80  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
81  * which need to view all existing connections.
82  *
83  * All tables are protected by per-bucket locks. When both per-bucket lock and
84  * connection lock need to be held, the per-bucket lock should be acquired
85  * first, followed by the connection lock.
86  *
87  * All functions doing search in one of these tables increment a reference
88  * counter on the connection found (if any). This reference should be dropped
89  * when the caller has finished processing the connection.
90  *
91  *
92  * INTERFACES:
93  * ===========
94  *
95  * Connection Lookup:
96  * ------------------
97  *
98  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid)
99  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid)
100  *
101  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
102  * it can't find any associated connection. If the connection is found, its
103  * reference counter is incremented.
104  *
105  *	mp:	mblock, containing packet header. The full header should fit
106  *		into a single mblock. It should also contain at least full IP
107  *		and TCP or UDP header.
108  *
109  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
110  *
111  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
112  *		 the packet.
113  *
114  * 	zoneid: The zone in which the returned connection must be.
115  *
116  *	For TCP connections, the lookup order is as follows:
117  *		5-tuple {src, dst, protocol, local port, remote port}
118  *			lookup in ipcl_conn_fanout table.
119  *		3-tuple {dst, remote port, protocol} lookup in
120  *			ipcl_bind_fanout table.
121  *
122  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients; that case is handled in IP itself.
126  *
127  * conn_t	*ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int);
128  * conn_t	*ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t);
129  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The addresses
 *	and ports are read from the IP and TCP headers respectively.
133  *
134  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol);
135  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex);
136  *
137  * 	Lookup routine to find a listener with the tuple {lport, laddr,
138  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
139  * 	parameter interface index is also compared.
140  *
141  * void ipcl_walk(func, arg)
142  *
143  * 	Apply 'func' to every connection available. The 'func' is called as
144  *	(*func)(connp, arg). The walk is non-atomic so connections may be
145  *	created and destroyed during the walk. The CONN_CONDEMNED and
146  *	CONN_INCIPIENT flags ensure that connections which are newly created
147  *	or being destroyed are not selected by the walker.
148  *
149  * Table Updates
150  * -------------
151  *
152  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
153  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
154  *
155  *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments:
157  *		connp		conn_t to be inserted
158  *		protocol	connection protocol
159  *		src		source address
160  *		dst		destination address
161  *		ports		local and remote port
162  *		ifindex		interface index for IPv6 connections
163  *
164  *	Return value :
165  *		0		if connp was inserted
166  *		EADDRINUSE	if the connection with the same tuple
167  *				already exists.
168  *
169  * int ipcl_bind_insert(connp, protocol, src, lport);
170  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
171  *
172  * 	Insert 'connp' in ipcl_bind_fanout.
 * 	Arguments:
174  * 		connp		conn_t to be inserted
175  * 		protocol	connection protocol
176  * 		src		source address connection wants
177  * 				to bind to
178  * 		lport		local port connection wants to
179  * 				bind to
180  *
181  *
182  * void ipcl_hash_remove(connp);
183  *
184  * 	Removes the 'connp' from the connection fanout table.
185  *
186  * Connection Creation/Destruction
187  * -------------------------------
188  *
189  * conn_t *ipcl_conn_create(type, sleep)
190  *
191  * 	Creates a new conn based on the type flag, inserts it into
192  * 	globalhash table.
193  *
194  *	type:	This flag determines the type of conn_t which needs to be
195  *		created.
 *		IPCL_TCPCONN	indicates a TCP connection
 *		IPCL_SCTPCONN	indicates an SCTP connection
 *		IPCL_IPCCONN	indicates all other connections.
198  *
199  * void ipcl_conn_destroy(connp)
200  *
201  * 	Destroys the connection state, removes it from the global
202  * 	connection hash table and frees its memory.
203  */
204 
205 #include <sys/types.h>
206 #include <sys/stream.h>
207 #include <sys/dlpi.h>
208 #include <sys/stropts.h>
209 #include <sys/sysmacros.h>
210 #include <sys/strsubr.h>
211 #include <sys/strlog.h>
212 #include <sys/strsun.h>
213 #define	_SUN_TPI_VERSION 2
214 #include <sys/ddi.h>
215 #include <sys/cmn_err.h>
216 #include <sys/debug.h>
217 
218 #include <sys/systm.h>
219 #include <sys/param.h>
220 #include <sys/kmem.h>
221 #include <sys/isa_defs.h>
222 #include <inet/common.h>
223 #include <netinet/ip6.h>
224 #include <netinet/icmp6.h>
225 
226 #include <inet/ip.h>
227 #include <inet/ip6.h>
228 #include <inet/tcp.h>
229 #include <inet/tcp_trace.h>
230 #include <inet/ip_multi.h>
231 #include <inet/ip_if.h>
232 #include <inet/ip_ire.h>
233 #include <inet/ip_rts.h>
234 #include <inet/optcom.h>
235 #include <inet/ip_ndp.h>
236 #include <inet/udp_impl.h>
237 #include <inet/sctp_ip.h>
238 
239 #include <sys/ethernet.h>
240 #include <net/if_types.h>
241 #include <sys/cpuvar.h>
242 
243 #include <inet/mi.h>
244 #include <inet/ipclassifier.h>
245 #include <inet/ipsec_impl.h>
246 
247 #ifdef DEBUG
248 #define	IPCL_DEBUG
249 #else
250 #undef	IPCL_DEBUG
251 #endif
252 
/*
 * IPCL_DEBUG_LVL(level, args)
 *
 * Debug trace helper. When IPCL_DEBUG is defined, 'args' (a fully
 * parenthesized printf argument list) is printed if any bit of 'level' is
 * set in ipcl_debug_level. Otherwise the macro expands to an empty block and
 * 'args' is never evaluated.
 *
 * 'level' is parenthesized in the expansion so that an expression argument
 * such as (1 | 2) is masked as a whole rather than being re-associated with
 * the '&' operator.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/*
 * Classifier hash tables. The dynamically sized ones are allocated, and all
 * per-bucket locks are initialized, in ipcl_init().
 */
connf_t	*ipcl_conn_fanout;	/* TCP connections in CONNECTED state */
connf_t	*ipcl_bind_fanout;	/* connections in BOUND state */
connf_t	ipcl_proto_fanout[IPPROTO_MAX + 1];	/* IPv4 protocol fanout */
connf_t	ipcl_proto_fanout_v6[IPPROTO_MAX + 1];	/* IPv6 protocol fanout */
connf_t	*ipcl_udp_fanout;	/* UDP connections */

/* A separate hash list for raw socket. */
connf_t *ipcl_raw_fanout;

/* NOTE(review): presumably the list of routing socket clients - confirm. */
connf_t rts_clients;

/* Old value for compatibility */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. */
uint_t ipcl_conn_hash_size = 0;
/* Automatic sizing: bytes of free memory per hash bucket (see ipcl_init()). */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Automatic sizing: upper bound applied before rounding to a table prime. */
uint_t ipcl_conn_hash_maxsize = 82500;

/* Number of buckets actually used for ipcl_conn_fanout; set by ipcl_init(). */
uint_t ipcl_conn_fanout_size = 0;


/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 256;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;
288 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The initializer is indexed by N. Entries 0-2 and the trailing entry are 0;
 * ipcl_init() starts its search at index 9 and treats a 0 entry as "out of
 * range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
298 
/*
 * wrapper structure to ensure that conn+tcpb are aligned
 * on cache lines.
 *
 * The conn_t is overlaid with a filler array of CACHE_ALIGN(conn_s) bytes so
 * that the tcp_t which follows it starts after that padded size. Objects of
 * this type are what ipcl_tcpconn_cache hands out (see ipcl_init()).
 */
typedef struct itc_s {
	union {
		conn_t	itcu_conn;	/* the connection itself */
		char	itcu_filler[CACHE_ALIGN(conn_s)]; /* padding only */
	}	itc_u;
	tcp_t	itc_tcp;		/* TCP state paired with the conn */
} itc_t;

/* Shorthand for reaching the conn_t inside an itc_t. */
#define	itc_conn	itc_u.itcu_conn
312 
/* kmem caches owned by the classifier. */
struct kmem_cache  *ipcl_tcpconn_cache;	/* itc_t objects (conn_t + tcp_t) */
struct kmem_cache  *ipcl_tcp_cache;
struct kmem_cache  *ipcl_conn_cache;	/* plain conn_t objects */
/* Caches defined elsewhere and used by conn creation/destruction here. */
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* Constructor/destructor registered with ipcl_tcpconn_cache in ipcl_init(). */
static int	ipcl_tcpconn_constructor(void *, void *, int);
static void	ipcl_tcpconn_destructor(void *, void *);

/* NOTE(review): presumably the next bucket index for global hash inserts. */
static int conn_g_index;
/* Hash of all connections, CONN_G_HASH_SIZE buckets; used by walkers. */
connf_t	*ipcl_globalhash_fanout;
328 
329 #ifdef	IPCL_DEBUG
330 #define	INET_NTOA_BUFSIZE	18
331 
/*
 * Format the four bytes of 'in', in memory order, as a dotted-decimal string
 * in the caller-supplied buffer 'b' and return 'b'. The buffer must be able
 * to hold the longest result, "255.255.255.255" plus the terminating NUL.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);

	return (b);
}
341 #endif
342 
343 /*
344  * ipclassifier intialization routine, sets up hash tables and
345  * conn caches.
346  */
347 void
348 ipcl_init(void)
349 {
350 	int i;
351 	int sizes[] = P2Ps();
352 
353 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
354 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
355 	    NULL, NULL, NULL, NULL, NULL, 0);
356 
357 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
358 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
359 	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
360 	    NULL, NULL, NULL, 0);
361 
362 	/*
363 	 * Calculate size of conn fanout table.
364 	 */
365 	if (ipcl_conn_hash_size != 0) {
366 		ipcl_conn_fanout_size = ipcl_conn_hash_size;
367 	} else if (tcp_conn_hash_size != 0) {
368 		ipcl_conn_fanout_size = tcp_conn_hash_size;
369 	} else {
370 		extern pgcnt_t freemem;
371 
372 		ipcl_conn_fanout_size =
373 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
374 
375 		if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize)
376 			ipcl_conn_fanout_size = ipcl_conn_hash_maxsize;
377 	}
378 
379 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
380 		if (sizes[i] >= ipcl_conn_fanout_size) {
381 			break;
382 		}
383 	}
384 	if ((ipcl_conn_fanout_size = sizes[i]) == 0) {
385 		/* Out of range, use the 2^16 value */
386 		ipcl_conn_fanout_size = sizes[16];
387 	}
388 	ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size *
389 	    sizeof (*ipcl_conn_fanout), KM_SLEEP);
390 
391 	for (i = 0; i < ipcl_conn_fanout_size; i++) {
392 		mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL,
393 		    MUTEX_DEFAULT, NULL);
394 	}
395 
396 	ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size *
397 	    sizeof (*ipcl_bind_fanout), KM_SLEEP);
398 
399 	for (i = 0; i < ipcl_bind_fanout_size; i++) {
400 		mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL,
401 		    MUTEX_DEFAULT, NULL);
402 	}
403 
404 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) {
405 		mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL,
406 		    MUTEX_DEFAULT, NULL);
407 	}
408 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) {
409 		mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL,
410 		    MUTEX_DEFAULT, NULL);
411 	}
412 
413 	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
414 
415 	ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size *
416 	    sizeof (*ipcl_udp_fanout), KM_SLEEP);
417 
418 	for (i = 0; i < ipcl_udp_fanout_size; i++) {
419 		mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL,
420 		    MUTEX_DEFAULT, NULL);
421 	}
422 
423 	ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size *
424 	    sizeof (*ipcl_raw_fanout), KM_SLEEP);
425 
426 	for (i = 0; i < ipcl_raw_fanout_size; i++) {
427 		mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL,
428 		    MUTEX_DEFAULT, NULL);
429 	}
430 
431 	ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) *
432 	    CONN_G_HASH_SIZE, KM_SLEEP);
433 
434 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
435 		mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL,
436 		    MUTEX_DEFAULT, NULL);
437 	}
438 }
439 
440 void
441 ipcl_destroy(void)
442 {
443 	int i;
444 	kmem_cache_destroy(ipcl_conn_cache);
445 	kmem_cache_destroy(ipcl_tcpconn_cache);
446 	for (i = 0; i < ipcl_conn_fanout_size; i++)
447 		mutex_destroy(&ipcl_conn_fanout[i].connf_lock);
448 	kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size *
449 	    sizeof (*ipcl_conn_fanout));
450 	for (i = 0; i < ipcl_bind_fanout_size; i++)
451 		mutex_destroy(&ipcl_bind_fanout[i].connf_lock);
452 	kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size *
453 	    sizeof (*ipcl_bind_fanout));
454 
455 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++)
456 		mutex_destroy(&ipcl_proto_fanout[i].connf_lock);
457 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++)
458 		mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock);
459 
460 	for (i = 0; i < ipcl_udp_fanout_size; i++)
461 		mutex_destroy(&ipcl_udp_fanout[i].connf_lock);
462 	kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size *
463 	    sizeof (*ipcl_udp_fanout));
464 
465 	for (i = 0; i < ipcl_raw_fanout_size; i++)
466 		mutex_destroy(&ipcl_raw_fanout[i].connf_lock);
467 	kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size *
468 	    sizeof (*ipcl_raw_fanout));
469 
470 	kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE);
471 	mutex_destroy(&rts_clients.connf_lock);
472 }
473 
474 /*
475  * conn creation routine. initialize the conn, sets the reference
476  * and inserts it in the global hash table.
477  */
478 conn_t *
479 ipcl_conn_create(uint32_t type, int sleep)
480 {
481 	itc_t	*itc;
482 	conn_t	*connp;
483 
484 	switch (type) {
485 	case IPCL_TCPCONN:
486 		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
487 		    sleep)) == NULL)
488 			return (NULL);
489 		connp = &itc->itc_conn;
490 		connp->conn_ref = 1;
491 		IPCL_DEBUG_LVL(1,
492 		    ("ipcl_conn_create: connp = %p tcp (%p)",
493 		    (void *)connp, (void *)connp->conn_tcp));
494 		ipcl_globalhash_insert(connp);
495 		break;
496 	case IPCL_SCTPCONN:
497 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
498 			return (NULL);
499 		connp->conn_flags = IPCL_SCTPCONN;
500 		break;
501 	case IPCL_IPCCONN:
502 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
503 		if (connp == NULL)
504 			return (NULL);
505 		bzero(connp, sizeof (conn_t));
506 		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
507 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
508 		connp->conn_flags = IPCL_IPCCONN;
509 		connp->conn_ref = 1;
510 		IPCL_DEBUG_LVL(1,
511 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
512 		ipcl_globalhash_insert(connp);
513 		break;
514 	default:
515 		connp = NULL;
516 		ASSERT(0);
517 	}
518 
519 	return (connp);
520 }
521 
522 void
523 ipcl_conn_destroy(conn_t *connp)
524 {
525 	mblk_t	*mp;
526 
527 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
528 	ASSERT(connp->conn_ref == 0);
529 	ASSERT(connp->conn_ire_cache == NULL);
530 
531 	ipcl_globalhash_remove(connp);
532 
533 	cv_destroy(&connp->conn_cv);
534 	if (connp->conn_flags & IPCL_TCPCONN) {
535 		tcp_t	*tcp = connp->conn_tcp;
536 
537 		mutex_destroy(&connp->conn_lock);
538 		ASSERT(connp->conn_tcp != NULL);
539 		tcp_free(tcp);
540 		mp = tcp->tcp_timercache;
541 
542 		if (tcp->tcp_sack_info != NULL) {
543 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
544 			kmem_cache_free(tcp_sack_info_cache,
545 			    tcp->tcp_sack_info);
546 		}
547 		if (tcp->tcp_iphc != NULL) {
548 			if (tcp->tcp_hdr_grown) {
549 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
550 			} else {
551 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
552 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
553 			}
554 			tcp->tcp_iphc_len = 0;
555 		}
556 		ASSERT(tcp->tcp_iphc_len == 0);
557 
558 		if (connp->conn_latch != NULL)
559 			IPLATCH_REFRELE(connp->conn_latch);
560 		if (connp->conn_policy != NULL)
561 			IPPH_REFRELE(connp->conn_policy);
562 		bzero(connp, sizeof (itc_t));
563 
564 		tcp->tcp_timercache = mp;
565 		connp->conn_tcp = tcp;
566 		connp->conn_flags = IPCL_TCPCONN;
567 		connp->conn_ulp = IPPROTO_TCP;
568 		tcp->tcp_connp = connp;
569 		kmem_cache_free(ipcl_tcpconn_cache, connp);
570 	} else if (connp->conn_flags & IPCL_SCTPCONN) {
571 		sctp_free(connp);
572 	} else {
573 		ASSERT(connp->conn_udp == NULL);
574 		mutex_destroy(&connp->conn_lock);
575 		kmem_cache_free(ipcl_conn_cache, connp);
576 	}
577 }
578 
579 /*
580  * Running in cluster mode - deregister listener information
581  */
582 
583 static void
584 ipcl_conn_unlisten(conn_t *connp)
585 {
586 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
587 	ASSERT(connp->conn_lport != 0);
588 
589 	if (cl_inet_unlisten != NULL) {
590 		sa_family_t	addr_family;
591 		uint8_t		*laddrp;
592 
593 		if (connp->conn_pkt_isv6) {
594 			addr_family = AF_INET6;
595 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
596 		} else {
597 			addr_family = AF_INET;
598 			laddrp = (uint8_t *)&connp->conn_bound_source;
599 		}
600 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
601 		    connp->conn_lport);
602 	}
603 	connp->conn_flags &= ~IPCL_CL_LISTENER;
604 }
605 
/*
 * IPCL_HASH_REMOVE(connp)
 *
 * Unlink 'connp' from whichever fanout bucket conn_fanout points at; does
 * nothing if conn_fanout is NULL. Under the bucket lock the conn is taken
 * out of the doubly linked list, its linkage fields are cleared, the cluster
 * listener registration (if any) is withdrawn, and the reference held on
 * behalf of the list is dropped. The caller must not hold conn_lock.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * Note that the expansion declares a local named 'connfp'.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
636 
/*
 * Function form of IPCL_HASH_REMOVE(): removes 'connp' from the fanout
 * bucket it is currently linked into, if any.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
642 
643 /*
644  * The whole purpose of this function is allow removal of
645  * a conn_t from the connected hash for timewait reclaim.
646  * This is essentially a TW reclaim fastpath where timewait
647  * collector checks under fanout lock (so no one else can
648  * get access to the conn_t) that refcnt is 2 i.e. one for
649  * TCP and one for the classifier hash list. If ref count
650  * is indeed 2, we can just remove the conn under lock and
651  * avoid cleaning up the conn under squeue. This gives us
652  * improved performance.
653  */
654 void
655 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
656 {
657 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
658 	ASSERT(MUTEX_HELD(&connp->conn_lock));
659 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
660 
661 	if ((connp)->conn_next != NULL) {
662 		(connp)->conn_next->conn_prev =
663 			(connp)->conn_prev;
664 	}
665 	if ((connp)->conn_prev != NULL) {
666 		(connp)->conn_prev->conn_next =
667 			(connp)->conn_next;
668 	} else {
669 		connfp->connf_head = (connp)->conn_next;
670 	}
671 	(connp)->conn_fanout = NULL;
672 	(connp)->conn_next = NULL;
673 	(connp)->conn_prev = NULL;
674 	(connp)->conn_flags |= IPCL_REMOVED;
675 	ASSERT((connp)->conn_ref == 2);
676 	(connp)->conn_ref--;
677 }
678 
/*
 * IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp)
 *
 * Link 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference for the list. The macro does
 * no locking itself; IPCL_HASH_INSERT_CONNECTED() invokes it with the bucket
 * lock held. The conn must not currently be linked into any bucket.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
693 
/*
 * IPCL_HASH_INSERT_CONNECTED(connfp, connp)
 *
 * Move 'connp' into bucket 'connfp' as a connected entry: first remove it
 * from whatever bucket it is in now, then insert it at the head of 'connfp'
 * under that bucket's lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
702 
/*
 * IPCL_HASH_INSERT_BOUND(connfp, connp)
 *
 * Move 'connp' into bucket 'connfp' as a bound entry. The conn is first
 * removed from its current bucket, then linked in immediately before the
 * first entry whose source address satisfies _IPCL_V4_MATCH_ANY(), or at the
 * tail if there is no such entry. It is then marked IPCL_BOUND (clearing
 * IPCL_REMOVED) and a reference is taken for the list.
 *
 * Every use of the 'connfp' argument is parenthesized, including the one in
 * the debug cast, so that expression arguments expand as intended.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	/* Skip ahead to the first match-any entry, if there is one */	\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
731 
/*
 * IPCL_HASH_INSERT_WILDCARD(connfp, connp)
 *
 * Move 'connp' into bucket 'connfp' as a wildcard-bound entry. The conn is
 * first removed from its current bucket. It is normally appended at the
 * tail; however, if its source address is V4-mapped it is placed directly
 * before the first entry in the same zone whose source address is the
 * unspecified address. It is then marked IPCL_BOUND (clearing IPCL_REMOVED)
 * and a reference is taken for the list.
 *
 * Every use of the 'connp' argument is parenthesized, including the zone
 * comparison, so that expression arguments expand as intended.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	/* 'list' trails the walk: it is the link that reaches 'next' */ \
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
763 
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD(): moves 'connp' into bucket
 * 'connfp' as a wildcard-bound entry.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
769 
770 void
771 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
772 {
773 	connf_t	*connfp;
774 
775 	ASSERT(connp != NULL);
776 
777 	connp->conn_ulp = protocol;
778 
779 	/* Insert it in the protocol hash */
780 	connfp = &ipcl_proto_fanout[protocol];
781 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
782 }
783 
784 void
785 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
786 {
787 	connf_t	*connfp;
788 
789 	ASSERT(connp != NULL);
790 
791 	connp->conn_ulp = protocol;
792 
793 	/* Insert it in the Bind Hash */
794 	connfp = &ipcl_proto_fanout_v6[protocol];
795 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
796 }
797 
798 /*
799  * This function is used only for inserting SCTP raw socket now.
800  * This may change later.
801  *
802  * Note that only one raw socket can be bound to a port.  The param
803  * lport is in network byte order.
804  */
805 static int
806 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
807 {
808 	connf_t	*connfp;
809 	conn_t	*oconnp;
810 
811 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
812 
813 	/* Check for existing raw socket already bound to the port. */
814 	mutex_enter(&connfp->connf_lock);
815 	for (oconnp = connfp->connf_head; oconnp != NULL;
816 	    oconnp = oconnp->conn_next) {
817 		if (oconnp->conn_lport == lport &&
818 		    oconnp->conn_zoneid == connp->conn_zoneid &&
819 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
820 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
821 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
822 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
823 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
824 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
825 		    &connp->conn_srcv6))) {
826 			break;
827 		}
828 	}
829 	mutex_exit(&connfp->connf_lock);
830 	if (oconnp != NULL)
831 		return (EADDRNOTAVAIL);
832 
833 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
834 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
835 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
836 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
837 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
838 		} else {
839 			IPCL_HASH_INSERT_BOUND(connfp, connp);
840 		}
841 	} else {
842 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
843 	}
844 	return (0);
845 }
846 
847 /*
848  * (v4, v6) bind hash insertion routines
849  */
850 int
851 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
852 {
853 	connf_t	*connfp;
854 #ifdef	IPCL_DEBUG
855 	char	buf[INET_NTOA_BUFSIZE];
856 #endif
857 	int	ret = 0;
858 
859 	ASSERT(connp);
860 
861 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
862 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
863 
864 	connp->conn_ulp = protocol;
865 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
866 	connp->conn_lport = lport;
867 
868 	switch (protocol) {
869 	case IPPROTO_UDP:
870 	default:
871 		if (protocol == IPPROTO_UDP) {
872 			IPCL_DEBUG_LVL(64,
873 			    ("ipcl_bind_insert: connp %p - udp\n",
874 			    (void *)connp));
875 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
876 		} else {
877 			IPCL_DEBUG_LVL(64,
878 			    ("ipcl_bind_insert: connp %p - protocol\n",
879 			    (void *)connp));
880 			connfp = &ipcl_proto_fanout[protocol];
881 		}
882 
883 		if (connp->conn_rem != INADDR_ANY) {
884 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
885 		} else if (connp->conn_src != INADDR_ANY) {
886 			IPCL_HASH_INSERT_BOUND(connfp, connp);
887 		} else {
888 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
889 		}
890 		break;
891 
892 	case IPPROTO_TCP:
893 
894 		/* Insert it in the Bind Hash */
895 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
896 		if (connp->conn_src != INADDR_ANY) {
897 			IPCL_HASH_INSERT_BOUND(connfp, connp);
898 		} else {
899 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
900 		}
901 		if (cl_inet_listen != NULL) {
902 			ASSERT(!connp->conn_pkt_isv6);
903 			connp->conn_flags |= IPCL_CL_LISTENER;
904 			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
905 			    (uint8_t *)&connp->conn_bound_source, lport);
906 		}
907 		break;
908 
909 	case IPPROTO_SCTP:
910 		ret = ipcl_sctp_hash_insert(connp, lport);
911 		break;
912 	}
913 
914 	return (ret);
915 }
916 
917 int
918 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
919     uint16_t lport)
920 {
921 	connf_t	*connfp;
922 	int	ret = 0;
923 
924 	ASSERT(connp);
925 
926 	connp->conn_ulp = protocol;
927 	connp->conn_srcv6 = *src;
928 	connp->conn_lport = lport;
929 
930 	switch (protocol) {
931 	case IPPROTO_UDP:
932 	default:
933 		if (protocol == IPPROTO_UDP) {
934 			IPCL_DEBUG_LVL(128,
935 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
936 			    (void *)connp));
937 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
938 		} else {
939 			IPCL_DEBUG_LVL(128,
940 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
941 			    (void *)connp));
942 			connfp = &ipcl_proto_fanout_v6[protocol];
943 		}
944 
945 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
946 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
947 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
948 			IPCL_HASH_INSERT_BOUND(connfp, connp);
949 		} else {
950 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
951 		}
952 		break;
953 
954 	case IPPROTO_TCP:
955 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
956 
957 		/* Insert it in the Bind Hash */
958 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
959 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
960 			IPCL_HASH_INSERT_BOUND(connfp, connp);
961 		} else {
962 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
963 		}
964 		if (cl_inet_listen != NULL) {
965 			sa_family_t	addr_family;
966 			uint8_t		*laddrp;
967 
968 			if (connp->conn_pkt_isv6) {
969 				addr_family = AF_INET6;
970 				laddrp =
971 				    (uint8_t *)&connp->conn_bound_source_v6;
972 			} else {
973 				addr_family = AF_INET;
974 				laddrp = (uint8_t *)&connp->conn_bound_source;
975 			}
976 			connp->conn_flags |= IPCL_CL_LISTENER;
977 			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
978 			    lport);
979 		}
980 		break;
981 
982 	case IPPROTO_SCTP:
983 		ret = ipcl_sctp_hash_insert(connp, lport);
984 		break;
985 	}
986 
987 	return (ret);
988 }
989 
990 /*
991  * ipcl_conn_hash insertion routines.
992  */
993 int
994 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
995     ipaddr_t rem, uint32_t ports)
996 {
997 	connf_t		*connfp;
998 	uint16_t	*up;
999 	conn_t		*tconnp;
1000 #ifdef	IPCL_DEBUG
1001 	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1002 #endif
1003 	in_port_t	lport;
1004 	int		ret = 0;
1005 
1006 	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1007 	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1008 	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1009 	    ports, protocol));
1010 
1011 	switch (protocol) {
1012 	case IPPROTO_TCP:
1013 		if (!(connp->conn_flags & IPCL_EAGER)) {
1014 			/*
1015 			 * for a eager connection, i.e connections which
1016 			 * have just been created, the initialization is
1017 			 * already done in ip at conn_creation time, so
1018 			 * we can skip the checks here.
1019 			 */
1020 			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1021 		}
1022 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem,
1023 		    connp->conn_ports)];
1024 		mutex_enter(&connfp->connf_lock);
1025 		for (tconnp = connfp->connf_head; tconnp != NULL;
1026 		    tconnp = tconnp->conn_next) {
1027 			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1028 			    connp->conn_rem, connp->conn_src,
1029 			    connp->conn_ports)) {
1030 
1031 				/* Already have a conn. bail out */
1032 				mutex_exit(&connfp->connf_lock);
1033 				return (EADDRINUSE);
1034 			}
1035 		}
1036 		if (connp->conn_fanout != NULL) {
1037 			/*
1038 			 * Probably a XTI/TLI application trying to do a
1039 			 * rebind. Let it happen.
1040 			 */
1041 			mutex_exit(&connfp->connf_lock);
1042 			IPCL_HASH_REMOVE(connp);
1043 			mutex_enter(&connfp->connf_lock);
1044 		}
1045 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1046 		mutex_exit(&connfp->connf_lock);
1047 		break;
1048 
1049 	case IPPROTO_SCTP:
1050 		/*
1051 		 * The raw socket may have already been bound, remove it
1052 		 * from the hash first.
1053 		 */
1054 		IPCL_HASH_REMOVE(connp);
1055 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1056 		ret = ipcl_sctp_hash_insert(connp, lport);
1057 		break;
1058 
1059 	case IPPROTO_UDP:
1060 	default:
1061 		up = (uint16_t *)&ports;
1062 		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1063 		if (protocol == IPPROTO_UDP) {
1064 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1065 		} else {
1066 			connfp = &ipcl_proto_fanout[protocol];
1067 		}
1068 
1069 		if (connp->conn_rem != INADDR_ANY) {
1070 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1071 		} else if (connp->conn_src != INADDR_ANY) {
1072 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1073 		} else {
1074 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1075 		}
1076 		break;
1077 	}
1078 
1079 	return (ret);
1080 }
1081 
1082 int
1083 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1084     const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1085 {
1086 	connf_t		*connfp;
1087 	uint16_t	*up;
1088 	conn_t		*tconnp;
1089 	in_port_t	lport;
1090 	int		ret = 0;
1091 
1092 	switch (protocol) {
1093 	case IPPROTO_TCP:
1094 		/* Just need to insert a conn struct */
1095 		if (!(connp->conn_flags & IPCL_EAGER)) {
1096 			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1097 		}
1098 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6,
1099 		    connp->conn_ports)];
1100 		mutex_enter(&connfp->connf_lock);
1101 		for (tconnp = connfp->connf_head; tconnp != NULL;
1102 		    tconnp = tconnp->conn_next) {
1103 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1104 			    connp->conn_remv6, connp->conn_srcv6,
1105 			    connp->conn_ports) &&
1106 			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1107 			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
1108 				/* Already have a conn. bail out */
1109 				mutex_exit(&connfp->connf_lock);
1110 				return (EADDRINUSE);
1111 			}
1112 		}
1113 		if (connp->conn_fanout != NULL) {
1114 			/*
1115 			 * Probably a XTI/TLI application trying to do a
1116 			 * rebind. Let it happen.
1117 			 */
1118 			mutex_exit(&connfp->connf_lock);
1119 			IPCL_HASH_REMOVE(connp);
1120 			mutex_enter(&connfp->connf_lock);
1121 		}
1122 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1123 		mutex_exit(&connfp->connf_lock);
1124 		break;
1125 
1126 	case IPPROTO_SCTP:
1127 		IPCL_HASH_REMOVE(connp);
1128 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1129 		ret = ipcl_sctp_hash_insert(connp, lport);
1130 		break;
1131 
1132 	case IPPROTO_UDP:
1133 	default:
1134 		up = (uint16_t *)&ports;
1135 		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1136 		if (protocol == IPPROTO_UDP) {
1137 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1138 		} else {
1139 			connfp = &ipcl_proto_fanout_v6[protocol];
1140 		}
1141 
1142 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1143 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1144 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1145 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1146 		} else {
1147 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1148 		}
1149 		break;
1150 	}
1151 
1152 	return (ret);
1153 }
1154 
1155 /*
1156  * v4 packet classifying function. looks up the fanout table to
1157  * find the conn, the packet belongs to. returns the conn with
1158  * the reference held, null otherwise.
1159  */
1160 conn_t *
1161 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1162 {
1163 	ipha_t	*ipha;
1164 	connf_t	*connfp, *bind_connfp;
1165 	uint16_t lport;
1166 	uint16_t fport;
1167 	uint32_t ports;
1168 	conn_t	*connp;
1169 	uint16_t  *up;
1170 
1171 	ipha = (ipha_t *)mp->b_rptr;
1172 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1173 
1174 	switch (protocol) {
1175 	case IPPROTO_TCP:
1176 		ports = *(uint32_t *)up;
1177 		connfp =
1178 		    &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)];
1179 		mutex_enter(&connfp->connf_lock);
1180 		for (connp = connfp->connf_head; connp != NULL;
1181 		    connp = connp->conn_next) {
1182 			if (IPCL_CONN_MATCH(connp, protocol,
1183 			    ipha->ipha_src, ipha->ipha_dst, ports))
1184 				break;
1185 		}
1186 
1187 		if (connp != NULL) {
1188 			CONN_INC_REF(connp);
1189 			mutex_exit(&connfp->connf_lock);
1190 			return (connp);
1191 		}
1192 
1193 		mutex_exit(&connfp->connf_lock);
1194 
1195 		lport = up[1];
1196 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1197 		mutex_enter(&bind_connfp->connf_lock);
1198 		for (connp = bind_connfp->connf_head; connp != NULL;
1199 		    connp = connp->conn_next) {
1200 			if (IPCL_BIND_MATCH(connp, protocol,
1201 			    ipha->ipha_dst, lport) &&
1202 			    connp->conn_zoneid == zoneid)
1203 				break;
1204 		}
1205 
1206 		if (connp != NULL) {
1207 			/* Have a listner at least */
1208 			CONN_INC_REF(connp);
1209 			mutex_exit(&bind_connfp->connf_lock);
1210 			return (connp);
1211 		}
1212 
1213 		mutex_exit(&bind_connfp->connf_lock);
1214 
1215 		IPCL_DEBUG_LVL(512,
1216 		    ("ipcl_classify: couldn't classify mp = %p\n",
1217 		    (void *)mp));
1218 		break;
1219 
1220 	case IPPROTO_UDP:
1221 		lport = up[1];
1222 		fport = up[0];
1223 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1224 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1225 		mutex_enter(&connfp->connf_lock);
1226 		for (connp = connfp->connf_head; connp != NULL;
1227 		    connp = connp->conn_next) {
1228 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1229 			    fport, ipha->ipha_src) &&
1230 			    connp->conn_zoneid == zoneid)
1231 				break;
1232 		}
1233 
1234 		if (connp != NULL) {
1235 			CONN_INC_REF(connp);
1236 			mutex_exit(&connfp->connf_lock);
1237 			return (connp);
1238 		}
1239 
1240 		/*
1241 		 * We shouldn't come here for multicast/broadcast packets
1242 		 */
1243 		mutex_exit(&connfp->connf_lock);
1244 		IPCL_DEBUG_LVL(512,
1245 		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1246 		    lport, fport));
1247 		break;
1248 	}
1249 
1250 	return (NULL);
1251 }
1252 
1253 conn_t *
1254 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1255 {
1256 	ip6_t		*ip6h;
1257 	connf_t		*connfp, *bind_connfp;
1258 	uint16_t	lport;
1259 	uint16_t	fport;
1260 	tcph_t		*tcph;
1261 	uint32_t	ports;
1262 	conn_t		*connp;
1263 	uint16_t	*up;
1264 
1265 
1266 	ip6h = (ip6_t *)mp->b_rptr;
1267 
1268 	switch (protocol) {
1269 	case IPPROTO_TCP:
1270 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1271 		up = (uint16_t *)tcph->th_lport;
1272 		ports = *(uint32_t *)up;
1273 
1274 		connfp =
1275 		    &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)];
1276 		mutex_enter(&connfp->connf_lock);
1277 		for (connp = connfp->connf_head; connp != NULL;
1278 		    connp = connp->conn_next) {
1279 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1280 			    ip6h->ip6_src, ip6h->ip6_dst, ports))
1281 				break;
1282 		}
1283 
1284 		if (connp != NULL) {
1285 			CONN_INC_REF(connp);
1286 			mutex_exit(&connfp->connf_lock);
1287 			return (connp);
1288 		}
1289 
1290 		mutex_exit(&connfp->connf_lock);
1291 
1292 		lport = up[1];
1293 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1294 		mutex_enter(&bind_connfp->connf_lock);
1295 		for (connp = bind_connfp->connf_head; connp != NULL;
1296 		    connp = connp->conn_next) {
1297 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1298 			    ip6h->ip6_dst, lport) &&
1299 			    connp->conn_zoneid == zoneid)
1300 				break;
1301 		}
1302 
1303 		if (connp != NULL) {
1304 			/* Have a listner at least */
1305 			CONN_INC_REF(connp);
1306 			mutex_exit(&bind_connfp->connf_lock);
1307 			IPCL_DEBUG_LVL(512,
1308 			    ("ipcl_classify_v6: found listner "
1309 			    "connp = %p\n", (void *)connp));
1310 
1311 			return (connp);
1312 		}
1313 
1314 		mutex_exit(&bind_connfp->connf_lock);
1315 
1316 		IPCL_DEBUG_LVL(512,
1317 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1318 		    (void *)mp));
1319 		break;
1320 
1321 	case IPPROTO_UDP:
1322 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1323 		lport = up[1];
1324 		fport = up[0];
1325 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1326 		    fport));
1327 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1328 		mutex_enter(&connfp->connf_lock);
1329 		for (connp = connfp->connf_head; connp != NULL;
1330 		    connp = connp->conn_next) {
1331 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1332 			    fport, ip6h->ip6_src) &&
1333 			    connp->conn_zoneid == zoneid)
1334 				break;
1335 		}
1336 
1337 		if (connp != NULL) {
1338 			CONN_INC_REF(connp);
1339 			mutex_exit(&connfp->connf_lock);
1340 			return (connp);
1341 		}
1342 
1343 		/*
1344 		 * We shouldn't come here for multicast/broadcast packets
1345 		 */
1346 		mutex_exit(&connfp->connf_lock);
1347 		IPCL_DEBUG_LVL(512,
1348 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1349 		    lport, fport));
1350 		break;
1351 	}
1352 
1353 
1354 	return (NULL);
1355 }
1356 
1357 /*
1358  * wrapper around ipcl_classify_(v4,v6) routines.
1359  */
1360 conn_t *
1361 ipcl_classify(mblk_t *mp, zoneid_t zoneid)
1362 {
1363 	uint16_t	hdr_len;
1364 	ipha_t		*ipha;
1365 	uint8_t		*nexthdrp;
1366 
1367 	if (MBLKL(mp) < sizeof (ipha_t))
1368 		return (NULL);
1369 
1370 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1371 	case IPV4_VERSION:
1372 		ipha = (ipha_t *)mp->b_rptr;
1373 		hdr_len = IPH_HDR_LENGTH(ipha);
1374 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1375 		    zoneid));
1376 	case IPV6_VERSION:
1377 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1378 		    &hdr_len, &nexthdrp))
1379 			return (NULL);
1380 
1381 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid));
1382 	}
1383 
1384 	return (NULL);
1385 }
1386 
1387 conn_t *
1388 ipcl_classify_raw(uint8_t protocol, zoneid_t zoneid, uint32_t ports,
1389     ipha_t *hdr)
1390 {
1391 	struct connf_s	*connfp;
1392 	conn_t		*connp;
1393 	in_port_t	lport;
1394 	int		af;
1395 
1396 	lport = ((uint16_t *)&ports)[1];
1397 	af = IPH_HDR_VERSION(hdr);
1398 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
1399 
1400 	mutex_enter(&connfp->connf_lock);
1401 	for (connp = connfp->connf_head; connp != NULL;
1402 	    connp = connp->conn_next) {
1403 		/* We don't allow v4 fallback for v6 raw socket. */
1404 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1405 		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1406 			continue;
1407 		}
1408 		if (connp->conn_fully_bound) {
1409 			if (af == IPV4_VERSION) {
1410 				if (IPCL_CONN_MATCH(connp, protocol,
1411 				    hdr->ipha_src, hdr->ipha_dst, ports)) {
1412 					break;
1413 				}
1414 			} else {
1415 				if (IPCL_CONN_MATCH_V6(connp, protocol,
1416 				    ((ip6_t *)hdr)->ip6_src,
1417 				    ((ip6_t *)hdr)->ip6_dst, ports)) {
1418 					break;
1419 				}
1420 			}
1421 		} else {
1422 			if (af == IPV4_VERSION) {
1423 				if (IPCL_BIND_MATCH(connp, protocol,
1424 				    hdr->ipha_dst, lport)) {
1425 					break;
1426 				}
1427 			} else {
1428 				if (IPCL_BIND_MATCH_V6(connp, protocol,
1429 				    ((ip6_t *)hdr)->ip6_dst, lport)) {
1430 					break;
1431 				}
1432 			}
1433 		}
1434 	}
1435 
1436 	if (connp != NULL)
1437 		goto found;
1438 	mutex_exit(&connfp->connf_lock);
1439 
1440 	/* Try to look for a wildcard match. */
1441 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)];
1442 	mutex_enter(&connfp->connf_lock);
1443 	for (connp = connfp->connf_head; connp != NULL;
1444 	    connp = connp->conn_next) {
1445 		/* We don't allow v4 fallback for v6 raw socket. */
1446 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1447 		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1448 			continue;
1449 		}
1450 		if (af == IPV4_VERSION) {
1451 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
1452 				break;
1453 		} else {
1454 			if (IPCL_RAW_MATCH_V6(connp, protocol,
1455 			    ((ip6_t *)hdr)->ip6_dst)) {
1456 				break;
1457 			}
1458 		}
1459 	}
1460 
1461 	if (connp != NULL)
1462 		goto found;
1463 
1464 	mutex_exit(&connfp->connf_lock);
1465 	return (NULL);
1466 
1467 found:
1468 	ASSERT(connp != NULL);
1469 	CONN_INC_REF(connp);
1470 	mutex_exit(&connfp->connf_lock);
1471 	return (connp);
1472 }
1473 
1474 /* ARGSUSED */
1475 static int
1476 ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1477 {
1478 	itc_t	*itc = (itc_t *)buf;
1479 	conn_t 	*connp = &itc->itc_conn;
1480 	tcp_t	*tcp = &itc->itc_tcp;
1481 	bzero(itc, sizeof (itc_t));
1482 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1483 	connp->conn_tcp = tcp;
1484 	connp->conn_flags = IPCL_TCPCONN;
1485 	connp->conn_ulp = IPPROTO_TCP;
1486 	tcp->tcp_connp = connp;
1487 	return (0);
1488 }
1489 
1490 /* ARGSUSED */
1491 static void
1492 ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1493 {
1494 	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1495 }
1496 
1497 /*
1498  * All conns are inserted in a global multi-list for the benefit of
1499  * walkers. The walk is guaranteed to walk all open conns at the time
1500  * of the start of the walk exactly once. This property is needed to
1501  * achieve some cleanups during unplumb of interfaces. This is achieved
1502  * as follows.
1503  *
1504  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1505  * call the insert and delete functions below at creation and deletion
1506  * time respectively. The conn never moves or changes its position in this
1507  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1508  * won't increase due to walkers, once the conn deletion has started. Note
1509  * that we can't remove the conn from the global list and then wait for
1510  * the refcnt to drop to zero, since walkers would then see a truncated
1511  * list. CONN_INCIPIENT ensures that walkers don't start looking at
1512  * conns until ip_open is ready to make them globally visible.
1513  * The global round robin multi-list locks are held only to get the
1514  * next member/insertion/deletion and contention should be negligible
1515  * if the multi-list is much greater than the number of cpus.
1516  */
1517 void
1518 ipcl_globalhash_insert(conn_t *connp)
1519 {
1520 	int	index;
1521 
1522 	/*
1523 	 * No need for atomic here. Approximate even distribution
1524 	 * in the global lists is sufficient.
1525 	 */
1526 	conn_g_index++;
1527 	index = conn_g_index & (CONN_G_HASH_SIZE - 1);
1528 
1529 	connp->conn_g_prev = NULL;
1530 	/*
1531 	 * Mark as INCIPIENT, so that walkers will ignore this
1532 	 * for now, till ip_open is ready to make it visible globally.
1533 	 */
1534 	connp->conn_state_flags |= CONN_INCIPIENT;
1535 
1536 	/* Insert at the head of the list */
1537 	mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
1538 	connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
1539 	if (connp->conn_g_next != NULL)
1540 		connp->conn_g_next->conn_g_prev = connp;
1541 	ipcl_globalhash_fanout[index].connf_head = connp;
1542 
1543 	/* The fanout bucket this conn points to */
1544 	connp->conn_g_fanout = &ipcl_globalhash_fanout[index];
1545 
1546 	mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
1547 }
1548 
1549 void
1550 ipcl_globalhash_remove(conn_t *connp)
1551 {
1552 	/*
1553 	 * We were never inserted in the global multi list.
1554 	 * IPCL_NONE variety is never inserted in the global multilist
1555 	 * since it is presumed to not need any cleanup and is transient.
1556 	 */
1557 	if (connp->conn_g_fanout == NULL)
1558 		return;
1559 
1560 	mutex_enter(&connp->conn_g_fanout->connf_lock);
1561 	if (connp->conn_g_prev != NULL)
1562 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
1563 	else
1564 		connp->conn_g_fanout->connf_head = connp->conn_g_next;
1565 	if (connp->conn_g_next != NULL)
1566 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
1567 	mutex_exit(&connp->conn_g_fanout->connf_lock);
1568 
1569 	/* Better to stumble on a null pointer than to corrupt memory */
1570 	connp->conn_g_next = NULL;
1571 	connp->conn_g_prev = NULL;
1572 }
1573 
1574 /*
1575  * Walk the list of all conn_t's in the system, calling the function provided
1576  * with the specified argument for each.
1577  * Applies to both IPv4 and IPv6.
1578  *
1579  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
1580  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
1581  * unplumbed or removed. New conn_t's that are created while we are walking
1582  * may be missed by this walk, because they are not necessarily inserted
1583  * at the tail of the list. They are new conn_t's and thus don't have any
1584  * stale pointers. The CONN_CLOSING flag ensures that no new reference
1585  * is created to the struct that is going away.
1586  */
1587 void
1588 ipcl_walk(pfv_t func, void *arg)
1589 {
1590 	int	i;
1591 	conn_t	*connp;
1592 	conn_t	*prev_connp;
1593 
1594 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
1595 		mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1596 		prev_connp = NULL;
1597 		connp = ipcl_globalhash_fanout[i].connf_head;
1598 		while (connp != NULL) {
1599 			mutex_enter(&connp->conn_lock);
1600 			if (connp->conn_state_flags &
1601 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
1602 				mutex_exit(&connp->conn_lock);
1603 				connp = connp->conn_g_next;
1604 				continue;
1605 			}
1606 			CONN_INC_REF_LOCKED(connp);
1607 			mutex_exit(&connp->conn_lock);
1608 			mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1609 			(*func)(connp, arg);
1610 			if (prev_connp != NULL)
1611 				CONN_DEC_REF(prev_connp);
1612 			mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1613 			prev_connp = connp;
1614 			connp = connp->conn_g_next;
1615 		}
1616 		mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1617 		if (prev_connp != NULL)
1618 			CONN_DEC_REF(prev_connp);
1619 	}
1620 }
1621 
1622 /*
1623  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
1624  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1625  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1626  * (peer tcp in at least ESTABLISHED state).
1627  */
1628 conn_t *
1629 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
1630 {
1631 	uint32_t ports;
1632 	uint16_t *pports = (uint16_t *)&ports;
1633 	connf_t	*connfp;
1634 	conn_t	*tconnp;
1635 	boolean_t zone_chk;
1636 
1637 	/*
1638 	 * If either the source of destination address is loopback, then
1639 	 * both endpoints must be in the same Zone.  Otherwise, both of
1640 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1641 	 * state) and the endpoints may reside in different Zones.
1642 	 */
1643 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
1644 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
1645 
1646 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1647 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1648 
1649 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1650 
1651 	mutex_enter(&connfp->connf_lock);
1652 	for (tconnp = connfp->connf_head; tconnp != NULL;
1653 	    tconnp = tconnp->conn_next) {
1654 
1655 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1656 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1657 		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1658 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1659 
1660 			ASSERT(tconnp != connp);
1661 			CONN_INC_REF(tconnp);
1662 			mutex_exit(&connfp->connf_lock);
1663 			return (tconnp);
1664 		}
1665 	}
1666 	mutex_exit(&connfp->connf_lock);
1667 	return (NULL);
1668 }
1669 
1670 /*
1671  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
1672  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1673  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1674  * (peer tcp in at least ESTABLISHED state).
1675  */
1676 conn_t *
1677 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
1678 {
1679 	uint32_t ports;
1680 	uint16_t *pports = (uint16_t *)&ports;
1681 	connf_t	*connfp;
1682 	conn_t	*tconnp;
1683 	boolean_t zone_chk;
1684 
1685 	/*
1686 	 * If either the source of destination address is loopback, then
1687 	 * both endpoints must be in the same Zone.  Otherwise, both of
1688 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1689 	 * state) and the endpoints may reside in different Zones.  We
1690 	 * don't do Zone check for link local address(es) because the
1691 	 * current Zone implementation treats each link local address as
1692 	 * being unique per system node, i.e. they belong to global Zone.
1693 	 */
1694 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
1695 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
1696 
1697 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1698 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1699 
1700 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1701 
1702 	mutex_enter(&connfp->connf_lock);
1703 	for (tconnp = connfp->connf_head; tconnp != NULL;
1704 	    tconnp = tconnp->conn_next) {
1705 
1706 		/* We skip tcp_bound_if check here as this is loopback tcp */
1707 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1708 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1709 		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1710 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1711 
1712 			ASSERT(tconnp != connp);
1713 			CONN_INC_REF(tconnp);
1714 			mutex_exit(&connfp->connf_lock);
1715 			return (tconnp);
1716 		}
1717 	}
1718 	mutex_exit(&connfp->connf_lock);
1719 	return (NULL);
1720 }
1721 
1722 /*
1723  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1724  * Returns with conn reference held. Caller must call CONN_DEC_REF.
1725  * Only checks for connected entries i.e. no INADDR_ANY checks.
1726  */
1727 conn_t *
1728 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
1729 {
1730 	uint32_t ports;
1731 	uint16_t *pports;
1732 	connf_t	*connfp;
1733 	conn_t	*tconnp;
1734 
1735 	pports = (uint16_t *)&ports;
1736 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1737 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1738 
1739 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1740 
1741 	mutex_enter(&connfp->connf_lock);
1742 	for (tconnp = connfp->connf_head; tconnp != NULL;
1743 	    tconnp = tconnp->conn_next) {
1744 
1745 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1746 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1747 		    tconnp->conn_tcp->tcp_state >= min_state) {
1748 
1749 			CONN_INC_REF(tconnp);
1750 			mutex_exit(&connfp->connf_lock);
1751 			return (tconnp);
1752 		}
1753 	}
1754 	mutex_exit(&connfp->connf_lock);
1755 	return (NULL);
1756 }
1757 
1758 /*
1759  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1760  * Returns with conn reference held. Caller must call CONN_DEC_REF.
1761  * Only checks for connected entries i.e. no INADDR_ANY checks.
1762  * Match on ifindex in addition to addresses.
1763  */
1764 conn_t *
1765 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
1766     uint_t ifindex)
1767 {
1768 	tcp_t	*tcp;
1769 	uint32_t ports;
1770 	uint16_t *pports;
1771 	connf_t	*connfp;
1772 	conn_t	*tconnp;
1773 
1774 	pports = (uint16_t *)&ports;
1775 	pports[0] = tcpha->tha_fport;
1776 	pports[1] = tcpha->tha_lport;
1777 
1778 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1779 
1780 	mutex_enter(&connfp->connf_lock);
1781 	for (tconnp = connfp->connf_head; tconnp != NULL;
1782 	    tconnp = tconnp->conn_next) {
1783 
1784 		tcp = tconnp->conn_tcp;
1785 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1786 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1787 		    tcp->tcp_state >= min_state &&
1788 		    (tcp->tcp_bound_if == 0 ||
1789 		    tcp->tcp_bound_if == ifindex)) {
1790 
1791 			CONN_INC_REF(tconnp);
1792 			mutex_exit(&connfp->connf_lock);
1793 			return (tconnp);
1794 		}
1795 	}
1796 	mutex_exit(&connfp->connf_lock);
1797 	return (NULL);
1798 }
1799 
1800 /*
1801  * To find a TCP listening connection matching the incoming segment.
1802  */
1803 conn_t *
1804 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
1805 {
1806 	connf_t		*bind_connfp;
1807 	conn_t		*connp;
1808 	tcp_t		*tcp;
1809 
1810 	/*
1811 	 * Avoid false matches for packets sent to an IP destination of
1812 	 * all zeros.
1813 	 */
1814 	if (laddr == 0)
1815 		return (NULL);
1816 
1817 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1818 	mutex_enter(&bind_connfp->connf_lock);
1819 	for (connp = bind_connfp->connf_head; connp != NULL;
1820 	    connp = connp->conn_next) {
1821 		tcp = connp->conn_tcp;
1822 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
1823 		    connp->conn_zoneid == zoneid &&
1824 		    (tcp->tcp_listener == NULL)) {
1825 			CONN_INC_REF(connp);
1826 			mutex_exit(&bind_connfp->connf_lock);
1827 			return (connp);
1828 		}
1829 	}
1830 	mutex_exit(&bind_connfp->connf_lock);
1831 	return (NULL);
1832 }
1833 
1834 
1835 conn_t *
1836 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
1837     zoneid_t zoneid)
1838 {
1839 	connf_t		*bind_connfp;
1840 	conn_t		*connp = NULL;
1841 	tcp_t		*tcp;
1842 
1843 	/*
1844 	 * Avoid false matches for packets sent to an IP destination of
1845 	 * all zeros.
1846 	 */
1847 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
1848 		return (NULL);
1849 
1850 
1851 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1852 	mutex_enter(&bind_connfp->connf_lock);
1853 	for (connp = bind_connfp->connf_head; connp != NULL;
1854 	    connp = connp->conn_next) {
1855 		tcp = connp->conn_tcp;
1856 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
1857 		    connp->conn_zoneid == zoneid &&
1858 		    (tcp->tcp_bound_if == 0 ||
1859 		    tcp->tcp_bound_if == ifindex) &&
1860 		    tcp->tcp_listener == NULL) {
1861 			CONN_INC_REF(connp);
1862 			mutex_exit(&bind_connfp->connf_lock);
1863 			return (connp);
1864 		}
1865 	}
1866 	mutex_exit(&bind_connfp->connf_lock);
1867 	return (NULL);
1868 }
1869 
1870 /*
1871  * ipcl_get_next_conn
1872  *	get the next entry in the conn global list
1873  *	and put a reference on the next_conn.
1874  *	decrement the reference on the current conn.
1875  *
1876  * This is an iterator based walker function that also provides for
1877  * some selection by the caller. It walks through the conn_hash bucket
1878  * searching for the next valid connp in the list, and selects connections
1879  * that are neither condemned nor incipient. It also REFHOLDS the conn
1880  * thus ensuring that the conn exists when the caller uses the conn.
1881  */
1882 conn_t *
1883 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
1884 {
1885 	conn_t	*next_connp;
1886 
1887 	if (connfp == NULL)
1888 		return (NULL);
1889 
1890 	mutex_enter(&connfp->connf_lock);
1891 
1892 	next_connp = (connp == NULL) ?
1893 	    connfp->connf_head : connp->conn_g_next;
1894 
1895 	while (next_connp != NULL) {
1896 		mutex_enter(&next_connp->conn_lock);
1897 		if (!(next_connp->conn_flags & conn_flags) ||
1898 		    (next_connp->conn_state_flags &
1899 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
1900 			/*
1901 			 * This conn has been condemned or
1902 			 * is closing, or the flags don't match
1903 			 */
1904 			mutex_exit(&next_connp->conn_lock);
1905 			next_connp = next_connp->conn_g_next;
1906 			continue;
1907 		}
1908 		CONN_INC_REF_LOCKED(next_connp);
1909 		mutex_exit(&next_connp->conn_lock);
1910 		break;
1911 	}
1912 
1913 	mutex_exit(&connfp->connf_lock);
1914 
1915 	if (connp != NULL)
1916 		CONN_DEC_REF(connp);
1917 
1918 	return (next_connp);
1919 }
1920 
1921 #ifdef CONN_DEBUG
1922 /*
1923  * Trace of the last NBUF refhold/refrele
1924  */
1925 int
1926 conn_trace_ref(conn_t *connp)
1927 {
1928 	int	last;
1929 	conn_trace_t	*ctb;
1930 
1931 	ASSERT(MUTEX_HELD(&connp->conn_lock));
1932 	last = connp->conn_trace_last;
1933 	last++;
1934 	if (last == CONN_TRACE_MAX)
1935 		last = 0;
1936 
1937 	ctb = &connp->conn_trace_buf[last];
1938 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1939 	connp->conn_trace_last = last;
1940 	return (1);
1941 }
1942 
1943 int
1944 conn_untrace_ref(conn_t *connp)
1945 {
1946 	int	last;
1947 	conn_trace_t	*ctb;
1948 
1949 	ASSERT(MUTEX_HELD(&connp->conn_lock));
1950 	last = connp->conn_trace_last;
1951 	last++;
1952 	if (last == CONN_TRACE_MAX)
1953 		last = 0;
1954 
1955 	ctb = &connp->conn_trace_buf[last];
1956 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1957 	connp->conn_trace_last = last;
1958 	return (1);
1959 }
1960 #endif
1961