xref: /titanic_50/usr/src/uts/common/inet/ip/ipclassifier.c (revision 141ae8360b129ba4ff145d9c7fd3353cc2a300f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
/*
 * Source identification strings for this module.  The "@(#)" prefix
 * presumably makes the version string discoverable with what(1) --
 * TODO confirm against the build tooling.
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"

const char ipclassifier_version[] = "@(#)ipclassifier.c	1.6	04/03/31 SMI";
29 
30 /*
31  * IP PACKET CLASSIFIER
32  *
33  * The IP packet classifier provides mapping between IP packets and persistent
34  * connection state for connection-oriented protocols. It also provides
35  * interface for managing connection states.
36  *
37  * The connection state is kept in conn_t data structure and contains, among
38  * other things:
39  *
40  *	o local/remote address and ports
41  *	o Transport protocol
42  *	o squeue for the connection (for TCP only)
43  *	o reference counter
44  *	o Connection state
45  *	o hash table linkage
46  *	o interface/ire information
47  *	o credentials
48  *	o ipsec policy
49  *	o send and receive functions.
50  *	o mutex lock.
51  *
52  * Connections use a reference counting scheme. They are freed when the
53  * reference counter drops to zero. A reference is incremented when connection
54  * is placed in a list or table, when incoming packet for the connection arrives
55  * and when connection is processed via squeue (squeue processing may be
56  * asynchronous and the reference protects the connection from being destroyed
57  * before its processing is finished).
58  *
59  * send and receive functions are currently used for TCP only. The send function
60  * determines the IP entry point for the packet once it leaves TCP to be sent to
61  * the destination address. The receive function is used by IP when the packet
62  * should be passed for TCP processing. When a new connection is created these
63  * are set to ip_output() and tcp_input() respectively. During the lifetime of
64  * the connection the send and receive functions may change depending on the
65  * changes in the connection state. For example, once the connection is bound to
66  * an address, the receive function for this connection is set to
67  * tcp_conn_request().  This allows incoming SYNs to go directly into the
68  * listener SYN processing function without going to tcp_input() first.
69  *
70  * Classifier uses several hash tables:
71  *
72  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73  *	ipcl_bind_fanout:	contains all connections in BOUND state
74  *	ipcl_proto_fanout:	IPv4 protocol fanout
75  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76  *	ipcl_udp_fanout:	contains all UDP connections
77  *	ipcl_globalhash_fanout:	contains all connections
78  *
79  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80  * which need to view all existing connections.
81  *
82  * All tables are protected by per-bucket locks. When both per-bucket lock and
83  * connection lock need to be held, the per-bucket lock should be acquired
84  * first, followed by the connection lock.
85  *
86  * All functions doing search in one of these tables increment a reference
87  * counter on the connection found (if any). This reference should be dropped
88  * when the caller has finished processing the connection.
89  *
90  *
91  * INTERFACES:
92  * ===========
93  *
94  * Connection Lookup:
95  * ------------------
96  *
97  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid)
98  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid)
99  *
100  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101  * it can't find any associated connection. If the connection is found, its
102  * reference counter is incremented.
103  *
104  *	mp:	mblock, containing packet header. The full header should fit
105  *		into a single mblock. It should also contain at least full IP
106  *		and TCP or UDP header.
107  *
108  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109  *
110  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111  *		 the packet.
112  *
113  * 	zoneid: The zone in which the returned connection must be.
114  *
115  *	For TCP connections, the lookup order is as follows:
116  *		5-tuple {src, dst, protocol, local port, remote port}
117  *			lookup in ipcl_conn_fanout table.
118  *		3-tuple {dst, remote port, protocol} lookup in
119  *			ipcl_bind_fanout table.
120  *
121  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
122  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
123  *	these interfaces do not handle cases where a packet belongs
124  *	to multiple UDP clients, which is handled in IP itself.
125  *
126  * conn_t	*ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int);
127  * conn_t	*ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t);
128  *
129  *	Lookup routine to find an exact match for {src, dst, local port,
130  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
131  *	ports are read from the IP and TCP header respectively.
132  *
133  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol);
134  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex);
135  *
136  * 	Lookup routine to find a listener with the tuple {lport, laddr,
137  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
138  * 	parameter interface index is also compared.
139  *
140  * void ipcl_walk(func, arg)
141  *
142  * 	Apply 'func' to every connection available. The 'func' is called as
143  *	(*func)(connp, arg). The walk is non-atomic so connections may be
144  *	created and destroyed during the walk. The CONN_CONDEMNED and
145  *	CONN_INCIPIENT flags ensure that connections which are newly created
146  *	or being destroyed are not selected by the walker.
147  *
148  * Table Updates
149  * -------------
150  *
151  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
152  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
153  *
154  *	Insert 'connp' in the ipcl_conn_fanout.
155  *	Arguments :
156  *		connp		conn_t to be inserted
157  *		protocol	connection protocol
158  *		src		source address
159  *		dst		destination address
160  *		ports		local and remote port
161  *		ifindex		interface index for IPv6 connections
162  *
163  *	Return value :
164  *		0		if connp was inserted
165  *		EADDRINUSE	if the connection with the same tuple
166  *				already exists.
167  *
168  * int ipcl_bind_insert(connp, protocol, src, lport);
169  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
170  *
171  * 	Insert 'connp' in ipcl_bind_fanout.
172  * 	Arguments :
173  * 		connp		conn_t to be inserted
174  * 		protocol	connection protocol
175  * 		src		source address connection wants
176  * 				to bind to
177  * 		lport		local port connection wants to
178  * 				bind to
179  *
180  *
181  * void ipcl_hash_remove(connp);
182  *
183  * 	Removes the 'connp' from the connection fanout table.
184  *
185  * Connection Creation/Destruction
186  * -------------------------------
187  *
188  * conn_t *ipcl_conn_create(type, sleep)
189  *
190  * 	Creates a new conn based on the type flag, inserts it into
191  * 	globalhash table.
192  *
193  *	type:	This flag determines the type of conn_t which needs to be
194  *		created.
195  *		IPCL_TCPCONN	indicates a TCP connection
196  *		IPCL_IPCCONN	indicates all non-TCP connections.
197  *
198  * void ipcl_conn_destroy(connp)
199  *
200  * 	Destroys the connection state, removes it from the global
201  * 	connection hash table and frees its memory.
202  */
203 
204 #include <sys/types.h>
205 #include <sys/stream.h>
206 #include <sys/dlpi.h>
207 #include <sys/stropts.h>
208 #include <sys/sysmacros.h>
209 #include <sys/strsubr.h>
210 #include <sys/strlog.h>
211 #include <sys/strsun.h>
212 #define	_SUN_TPI_VERSION 2
213 #include <sys/ddi.h>
214 #include <sys/cmn_err.h>
215 #include <sys/debug.h>
216 
217 #include <sys/systm.h>
218 #include <sys/param.h>
219 #include <sys/kmem.h>
220 #include <sys/isa_defs.h>
221 #include <inet/common.h>
222 #include <netinet/ip6.h>
223 #include <netinet/icmp6.h>
224 
225 #include <inet/ip.h>
226 #include <inet/ip6.h>
227 #include <inet/tcp.h>
228 #include <inet/tcp_trace.h>
229 #include <inet/ip_multi.h>
230 #include <inet/ip_if.h>
231 #include <inet/ip_ire.h>
232 #include <inet/ip_rts.h>
233 #include <inet/optcom.h>
234 #include <inet/ip_ndp.h>
235 #include <inet/udp_impl.h>
236 #include <inet/sctp_ip.h>
237 
238 #include <sys/ethernet.h>
239 #include <net/if_types.h>
240 #include <sys/cpuvar.h>
241 
242 #include <inet/mi.h>
243 #include <inet/ipclassifier.h>
244 #include <inet/ipsec_impl.h>
245 
/*
 * Debug tracing support.  IPCL_DEBUG follows DEBUG: it is defined on DEBUG
 * builds and explicitly undefined otherwise.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args)
 *
 *	level:	bit mask tested against ipcl_debug_level; the message is
 *		printed when any bit is in common.
 *	args:	a complete, parenthesized printf argument list, e.g.
 *		IPCL_DEBUG_LVL(4, ("connp %p", (void *)connp));
 *
 * The 'level' argument is parenthesized in the expansion so that a compound
 * mask such as (1 | 2) is and-ed as a whole instead of being torn apart by
 * operator precedence.  The brace-block form is kept so that every existing
 * use (with or without a trailing semicolon) keeps compiling; as before, do
 * not use it as the unbraced body of an if that has an else.
 *
 * Without IPCL_DEBUG the macro expands to an empty block and 'args' is
 * never evaluated.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
259 connf_t	*ipcl_conn_fanout;
260 connf_t	*ipcl_bind_fanout;
261 connf_t	ipcl_proto_fanout[IPPROTO_MAX + 1];
262 connf_t	ipcl_proto_fanout_v6[IPPROTO_MAX + 1];
263 connf_t	*ipcl_udp_fanout;
264 
265 /* A separate hash list for raw socket. */
266 connf_t *ipcl_raw_fanout;
267 
268 connf_t rts_clients;
269 
270 /* Old value for compatibility */
271 uint_t tcp_conn_hash_size = 0;
272 
273 /* New value. Zero means choose automatically. */
274 uint_t ipcl_conn_hash_size = 0;
275 uint_t ipcl_conn_hash_memfactor = 8192;
276 uint_t ipcl_conn_hash_maxsize = 82500;
277 
278 uint_t ipcl_conn_fanout_size = 0;
279 
280 
281 /* bind/udp fanout table size */
282 uint_t ipcl_bind_fanout_size = 512;
283 uint_t ipcl_udp_fanout_size = 16384;
284 
285 /* Raw socket fanout size.  Must be a power of 2. */
286 uint_t ipcl_raw_fanout_size = 256;
287 
/*
 * Table of primes useful for sizing hash tables, indexed by N.  For N in
 * 3..28 entry N is the nearest prime <= 2^N - 2^(N-2); entries 0..2 and the
 * final (30th) entry are 0, the latter acting as an out-of-range marker.
 */
#define	P2Ps() {							\
	0, 0, 0, 5, 11, 23, 47, 89,					\
	191, 383, 761, 1531, 3067, 6143,				\
	12281, 24571, 49139, 98299,					\
	196597, 393209, 786431,						\
	1572853, 3145721, 6291449,					\
	12582893, 25165813, 50331599,					\
	100663291, 201326557,						\
	0								\
}
297 
298 /*
299  * wrapper structure to ensure that conn+tcpb are aligned
300  * on cache lines.
301  */
302 typedef struct itc_s {
303 	union {
304 		conn_t	itcu_conn;
305 		char	itcu_filler[CACHE_ALIGN(conn_s)];
306 	}	itc_u;
307 	tcp_t	itc_tcp;
308 } itc_t;
309 
310 #define	itc_conn	itc_u.itcu_conn
311 
312 struct kmem_cache  *ipcl_tcpconn_cache;
313 struct kmem_cache  *ipcl_tcp_cache;
314 struct kmem_cache  *ipcl_conn_cache;
315 extern struct kmem_cache  *sctp_conn_cache;
316 extern struct kmem_cache  *tcp_sack_info_cache;
317 extern struct kmem_cache  *tcp_iphc_cache;
318 
319 extern void	tcp_timermp_free(tcp_t *);
320 extern mblk_t	*tcp_timermp_alloc(int);
321 
322 static int	ipcl_tcpconn_constructor(void *, void *, int);
323 static void	ipcl_tcpconn_destructor(void *, void *);
324 
325 static int conn_g_index;
326 connf_t	*ipcl_globalhash_fanout;
327 
#ifdef	IPCL_DEBUG
/* Buffer size callers must provide; "255.255.255.255" plus NUL needs 16. */
#define	INET_NTOA_BUFSIZE	18

/*
 * Debug helper: format an IPv4 address as dotted decimal.
 *
 *	in:	the address; its bytes are printed in memory order, so the
 *		value is expected in network byte order.
 *	b:	output buffer of at least INET_NTOA_BUFSIZE bytes.
 *
 * Returns 'b'.  The write is bounded by INET_NTOA_BUFSIZE so that the
 * buffer contract is enforced here rather than assumed.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*p = (const unsigned char *)&in;

	(void) snprintf(b, INET_NTOA_BUFSIZE, "%d.%d.%d.%d",
	    p[0], p[1], p[2], p[3]);
	return (b);
}
#endif
341 
342 /*
343  * ipclassifier intialization routine, sets up hash tables and
344  * conn caches.
345  */
346 void
347 ipcl_init(void)
348 {
349 	int i;
350 	int sizes[] = P2Ps();
351 
352 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
353 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
354 	    NULL, NULL, NULL, NULL, NULL, 0);
355 
356 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
357 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
358 	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
359 	    NULL, NULL, NULL, 0);
360 
361 	/*
362 	 * Calculate size of conn fanout table.
363 	 */
364 	if (ipcl_conn_hash_size != 0) {
365 		ipcl_conn_fanout_size = ipcl_conn_hash_size;
366 	} else if (tcp_conn_hash_size != 0) {
367 		ipcl_conn_fanout_size = tcp_conn_hash_size;
368 	} else {
369 		extern pgcnt_t freemem;
370 
371 		ipcl_conn_fanout_size =
372 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
373 
374 		if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize)
375 			ipcl_conn_fanout_size = ipcl_conn_hash_maxsize;
376 	}
377 
378 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
379 		if (sizes[i] >= ipcl_conn_fanout_size) {
380 			break;
381 		}
382 	}
383 	if ((ipcl_conn_fanout_size = sizes[i]) == 0) {
384 		/* Out of range, use the 2^16 value */
385 		ipcl_conn_fanout_size = sizes[16];
386 	}
387 	ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size *
388 	    sizeof (*ipcl_conn_fanout), KM_SLEEP);
389 
390 	for (i = 0; i < ipcl_conn_fanout_size; i++) {
391 		mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL,
392 		    MUTEX_DEFAULT, NULL);
393 	}
394 
395 	ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size *
396 	    sizeof (*ipcl_bind_fanout), KM_SLEEP);
397 
398 	for (i = 0; i < ipcl_bind_fanout_size; i++) {
399 		mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL,
400 		    MUTEX_DEFAULT, NULL);
401 	}
402 
403 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) {
404 		mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL,
405 		    MUTEX_DEFAULT, NULL);
406 	}
407 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) {
408 		mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL,
409 		    MUTEX_DEFAULT, NULL);
410 	}
411 
412 	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
413 
414 	ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size *
415 	    sizeof (*ipcl_udp_fanout), KM_SLEEP);
416 
417 	for (i = 0; i < ipcl_udp_fanout_size; i++) {
418 		mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL,
419 		    MUTEX_DEFAULT, NULL);
420 	}
421 
422 	ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size *
423 	    sizeof (*ipcl_raw_fanout), KM_SLEEP);
424 
425 	for (i = 0; i < ipcl_raw_fanout_size; i++) {
426 		mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL,
427 		    MUTEX_DEFAULT, NULL);
428 	}
429 
430 	ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) *
431 	    CONN_G_HASH_SIZE, KM_SLEEP);
432 
433 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
434 		mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL,
435 		    MUTEX_DEFAULT, NULL);
436 	}
437 }
438 
439 void
440 ipcl_destroy(void)
441 {
442 	int i;
443 	kmem_cache_destroy(ipcl_conn_cache);
444 	kmem_cache_destroy(ipcl_tcpconn_cache);
445 	for (i = 0; i < ipcl_conn_fanout_size; i++)
446 		mutex_destroy(&ipcl_conn_fanout[i].connf_lock);
447 	kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size *
448 	    sizeof (*ipcl_conn_fanout));
449 	for (i = 0; i < ipcl_bind_fanout_size; i++)
450 		mutex_destroy(&ipcl_bind_fanout[i].connf_lock);
451 	kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size *
452 	    sizeof (*ipcl_bind_fanout));
453 
454 	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++)
455 		mutex_destroy(&ipcl_proto_fanout[i].connf_lock);
456 	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++)
457 		mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock);
458 
459 	for (i = 0; i < ipcl_udp_fanout_size; i++)
460 		mutex_destroy(&ipcl_udp_fanout[i].connf_lock);
461 	kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size *
462 	    sizeof (*ipcl_udp_fanout));
463 
464 	for (i = 0; i < ipcl_raw_fanout_size; i++)
465 		mutex_destroy(&ipcl_raw_fanout[i].connf_lock);
466 	kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size *
467 	    sizeof (*ipcl_raw_fanout));
468 
469 	kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE);
470 	mutex_destroy(&rts_clients.connf_lock);
471 }
472 
473 /*
474  * conn creation routine. initialize the conn, sets the reference
475  * and inserts it in the global hash table.
476  */
477 conn_t *
478 ipcl_conn_create(uint32_t type, int sleep)
479 {
480 	itc_t	*itc;
481 	conn_t	*connp;
482 
483 	switch (type) {
484 	case IPCL_TCPCONN:
485 		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
486 		    sleep)) == NULL)
487 			return (NULL);
488 		connp = &itc->itc_conn;
489 		connp->conn_ref = 1;
490 		IPCL_DEBUG_LVL(1,
491 		    ("ipcl_conn_create: connp = %p tcp (%p)",
492 		    (void *)connp, (void *)connp->conn_tcp));
493 		ipcl_globalhash_insert(connp);
494 		break;
495 	case IPCL_SCTPCONN:
496 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
497 			return (NULL);
498 		connp->conn_flags = IPCL_SCTPCONN;
499 		break;
500 	case IPCL_IPCCONN:
501 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
502 		if (connp == NULL)
503 			return (NULL);
504 		bzero(connp, sizeof (conn_t));
505 		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
506 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
507 		connp->conn_flags = IPCL_IPCCONN;
508 		connp->conn_ref = 1;
509 		IPCL_DEBUG_LVL(1,
510 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
511 		ipcl_globalhash_insert(connp);
512 		break;
513 	default:
514 		connp = NULL;
515 		ASSERT(0);
516 	}
517 
518 	return (connp);
519 }
520 
521 void
522 ipcl_conn_destroy(conn_t *connp)
523 {
524 	mblk_t	*mp;
525 
526 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
527 	ASSERT(connp->conn_ref == 0);
528 	ASSERT(connp->conn_ire_cache == NULL);
529 
530 	ipcl_globalhash_remove(connp);
531 
532 	cv_destroy(&connp->conn_cv);
533 	if (connp->conn_flags & IPCL_TCPCONN) {
534 		tcp_t	*tcp = connp->conn_tcp;
535 
536 		mutex_destroy(&connp->conn_lock);
537 		ASSERT(connp->conn_tcp != NULL);
538 		tcp_free(tcp);
539 		mp = tcp->tcp_timercache;
540 
541 		if (tcp->tcp_sack_info != NULL) {
542 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
543 			kmem_cache_free(tcp_sack_info_cache,
544 			    tcp->tcp_sack_info);
545 		}
546 		if (tcp->tcp_iphc != NULL) {
547 			if (tcp->tcp_hdr_grown) {
548 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
549 			} else {
550 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
551 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
552 			}
553 			tcp->tcp_iphc_len = 0;
554 		}
555 		ASSERT(tcp->tcp_iphc_len == 0);
556 
557 		if (connp->conn_latch != NULL)
558 			IPLATCH_REFRELE(connp->conn_latch);
559 		if (connp->conn_policy != NULL)
560 			IPPH_REFRELE(connp->conn_policy);
561 		bzero(connp, sizeof (itc_t));
562 
563 		tcp->tcp_timercache = mp;
564 		connp->conn_tcp = tcp;
565 		connp->conn_flags = IPCL_TCPCONN;
566 		connp->conn_ulp = IPPROTO_TCP;
567 		tcp->tcp_connp = connp;
568 		kmem_cache_free(ipcl_tcpconn_cache, connp);
569 	} else if (connp->conn_flags & IPCL_SCTPCONN) {
570 		sctp_free(connp);
571 	} else {
572 		ASSERT(connp->conn_udp == NULL);
573 		mutex_destroy(&connp->conn_lock);
574 		kmem_cache_free(ipcl_conn_cache, connp);
575 	}
576 }
577 
578 /*
579  * Running in cluster mode - deregister listener information
580  */
581 
582 static void
583 ipcl_conn_unlisten(conn_t *connp)
584 {
585 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
586 	ASSERT(connp->conn_lport != 0);
587 
588 	if (cl_inet_unlisten != NULL) {
589 		sa_family_t	addr_family;
590 		uint8_t		*laddrp;
591 
592 		if (connp->conn_pkt_isv6) {
593 			addr_family = AF_INET6;
594 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
595 		} else {
596 			addr_family = AF_INET;
597 			laddrp = (uint8_t *)&connp->conn_bound_source;
598 		}
599 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
600 		    connp->conn_lport);
601 	}
602 	connp->conn_flags &= ~IPCL_CL_LISTENER;
603 }
604 
/*
 * Unlink (connp) from the fanout bucket recorded in its conn_fanout field.
 * A conn that is on no bucket (conn_fanout == NULL) is left untouched.
 * The caller must not hold conn_lock.
 *
 * Under the bucket lock the conn is spliced out of the doubly linked list,
 * its linkage fields are cleared, a cluster listener is deregistered and
 * the reference taken by the insert macros is dropped.
 *
 * IPCL_REMOVED is set rather than clearing the flag that names the table
 * the conn was in, so a debugger can still tell which hash table this
 * connection used to be on.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
									\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		/* First in bucket: the head moves on instead. */	\
		if ((connp)->conn_prev == NULL)				\
			connfp->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
635 
636 void
637 ipcl_hash_remove(conn_t *connp)
638 {
639 	IPCL_HASH_REMOVE(connp);
640 }
641 
642 /*
643  * The whole purpose of this function is allow removal of
644  * a conn_t from the connected hash for timewait reclaim.
645  * This is essentially a TW reclaim fastpath where timewait
646  * collector checks under fanout lock (so no one else can
647  * get access to the conn_t) that refcnt is 2 i.e. one for
648  * TCP and one for the classifier hash list. If ref count
649  * is indeed 2, we can just remove the conn under lock and
650  * avoid cleaning up the conn under squeue. This gives us
651  * improved performance.
652  */
653 void
654 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
655 {
656 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
657 	ASSERT(MUTEX_HELD(&connp->conn_lock));
658 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
659 
660 	if ((connp)->conn_next != NULL) {
661 		(connp)->conn_next->conn_prev =
662 			(connp)->conn_prev;
663 	}
664 	if ((connp)->conn_prev != NULL) {
665 		(connp)->conn_prev->conn_next =
666 			(connp)->conn_next;
667 	} else {
668 		connfp->connf_head = (connp)->conn_next;
669 	}
670 	(connp)->conn_fanout = NULL;
671 	(connp)->conn_next = NULL;
672 	(connp)->conn_prev = NULL;
673 	(connp)->conn_flags |= IPCL_REMOVED;
674 	ASSERT((connp)->conn_ref == 2);
675 	(connp)->conn_ref--;
676 }
677 
/*
 * Push (connp) onto the head of bucket (connfp).  This macro takes no lock
 * itself (IPCL_HASH_INSERT_CONNECTED wraps it in connf_lock), and (connp)
 * must not currently be on any bucket.  The conn is flagged
 * IPCL_CONNECTED, IPCL_REMOVED is cleared, and a reference is taken for
 * the hash list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connp)->conn_next = (connfp)->connf_head;		\
		(connfp)->connf_head->conn_prev = (connp);		\
	}								\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags =						\
	    IPCL_CONNECTED | ((connp)->conn_flags & ~IPCL_REMOVED);	\
	CONN_INC_REF(connp);						\
}
692 
/*
 * Move (connp) to the head of bucket (connfp): it is first removed from
 * any bucket it is on, then inserted under (connfp)'s lock.  The two steps
 * are not covered by a single lock hold.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
									\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
701 
/*
 * Insert (connp), a conn bound to a specific source address, into bucket
 * (connfp).  The conn is first removed from any bucket it is on.  It is
 * then linked in ahead of the first conn whose source address satisfies
 * _IPCL_V4_MATCH_ANY(), or at the tail when the bucket holds no such conn.
 * The conn is flagged IPCL_BOUND, IPCL_REMOVED is cleared, and a reference
 * is taken for the hash list.
 *
 * Every use of the (connfp) and (connp) arguments is parenthesized so that
 * the macro is safe when handed an expression rather than a plain name.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	/* Find the insertion point: pconnp before, nconnp after. */	\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
730 
/*
 * Insert (connp), a conn with a wildcard source address, into bucket
 * (connfp).  The conn is first removed from any bucket it is on.
 *
 * Normally the conn is appended at the tail.  The exception: when its
 * source address is IPv4-mapped, it is placed ahead of the first conn in
 * the same zone whose source address is the IPv6 unspecified address.
 *
 * 'list' always points at the pointer slot that will receive (connp) (the
 * bucket head or some conn's conn_next) and 'prev' at the conn owning that
 * slot (NULL for the head), so one pair of assignments after the loop
 * handles both the insert-before and the append case.
 *
 * The conn is flagged IPCL_BOUND, IPCL_REMOVED is cleared, and a reference
 * is taken for the hash list.  Every use of the (connp) argument is
 * parenthesized so the macro is safe with expression arguments.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			/* 'prev' already equals next->conn_prev. */	\
			(connp)->conn_next = next;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
762 
763 void
764 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
765 {
766 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
767 }
768 
769 void
770 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
771 {
772 	connf_t	*connfp;
773 
774 	ASSERT(connp != NULL);
775 
776 	connp->conn_ulp = protocol;
777 
778 	/* Insert it in the protocol hash */
779 	connfp = &ipcl_proto_fanout[protocol];
780 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
781 }
782 
783 void
784 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
785 {
786 	connf_t	*connfp;
787 
788 	ASSERT(connp != NULL);
789 
790 	connp->conn_ulp = protocol;
791 
792 	/* Insert it in the Bind Hash */
793 	connfp = &ipcl_proto_fanout_v6[protocol];
794 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
795 }
796 
797 /*
798  * This function is used only for inserting SCTP raw socket now.
799  * This may change later.
800  *
801  * Note that only one raw socket can be bound to a port.  The param
802  * lport is in network byte order.
803  */
804 static int
805 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
806 {
807 	connf_t	*connfp;
808 	conn_t	*oconnp;
809 
810 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
811 
812 	/* Check for existing raw socket already bound to the port. */
813 	mutex_enter(&connfp->connf_lock);
814 	for (oconnp = connfp->connf_head; oconnp != NULL;
815 	    oconnp = oconnp->conn_next) {
816 		if (oconnp->conn_lport == lport &&
817 		    oconnp->conn_zoneid == connp->conn_zoneid &&
818 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
819 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
820 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
821 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
822 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
823 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
824 		    &connp->conn_srcv6))) {
825 			break;
826 		}
827 	}
828 	mutex_exit(&connfp->connf_lock);
829 	if (oconnp != NULL)
830 		return (EADDRNOTAVAIL);
831 
832 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
833 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
834 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
835 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
836 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
837 		} else {
838 			IPCL_HASH_INSERT_BOUND(connfp, connp);
839 		}
840 	} else {
841 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
842 	}
843 	return (0);
844 }
845 
846 /*
847  * (v4, v6) bind hash insertion routines
848  */
849 int
850 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
851 {
852 	connf_t	*connfp;
853 #ifdef	IPCL_DEBUG
854 	char	buf[INET_NTOA_BUFSIZE];
855 #endif
856 	int	ret = 0;
857 
858 	ASSERT(connp);
859 
860 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
861 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
862 
863 	connp->conn_ulp = protocol;
864 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
865 	connp->conn_lport = lport;
866 
867 	switch (protocol) {
868 	case IPPROTO_UDP:
869 	default:
870 		if (protocol == IPPROTO_UDP) {
871 			IPCL_DEBUG_LVL(64,
872 			    ("ipcl_bind_insert: connp %p - udp\n",
873 			    (void *)connp));
874 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
875 		} else {
876 			IPCL_DEBUG_LVL(64,
877 			    ("ipcl_bind_insert: connp %p - protocol\n",
878 			    (void *)connp));
879 			connfp = &ipcl_proto_fanout[protocol];
880 		}
881 
882 		if (connp->conn_rem != INADDR_ANY) {
883 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
884 		} else if (connp->conn_src != INADDR_ANY) {
885 			IPCL_HASH_INSERT_BOUND(connfp, connp);
886 		} else {
887 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
888 		}
889 		break;
890 
891 	case IPPROTO_TCP:
892 
893 		/* Insert it in the Bind Hash */
894 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
895 		if (connp->conn_src != INADDR_ANY) {
896 			IPCL_HASH_INSERT_BOUND(connfp, connp);
897 		} else {
898 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
899 		}
900 		if (cl_inet_listen != NULL) {
901 			ASSERT(!connp->conn_pkt_isv6);
902 			connp->conn_flags |= IPCL_CL_LISTENER;
903 			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
904 			    (uint8_t *)&connp->conn_bound_source, lport);
905 		}
906 		break;
907 
908 	case IPPROTO_SCTP:
909 		ret = ipcl_sctp_hash_insert(connp, lport);
910 		break;
911 	}
912 
913 	return (ret);
914 }
915 
916 int
917 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
918     uint16_t lport)
919 {
920 	connf_t	*connfp;
921 	int	ret = 0;
922 
923 	ASSERT(connp);
924 
925 	connp->conn_ulp = protocol;
926 	connp->conn_srcv6 = *src;
927 	connp->conn_lport = lport;
928 
929 	switch (protocol) {
930 	case IPPROTO_UDP:
931 	default:
932 		if (protocol == IPPROTO_UDP) {
933 			IPCL_DEBUG_LVL(128,
934 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
935 			    (void *)connp));
936 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
937 		} else {
938 			IPCL_DEBUG_LVL(128,
939 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
940 			    (void *)connp));
941 			connfp = &ipcl_proto_fanout_v6[protocol];
942 		}
943 
944 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
945 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
946 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
947 			IPCL_HASH_INSERT_BOUND(connfp, connp);
948 		} else {
949 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
950 		}
951 		break;
952 
953 	case IPPROTO_TCP:
954 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
955 
956 		/* Insert it in the Bind Hash */
957 		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
958 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
959 			IPCL_HASH_INSERT_BOUND(connfp, connp);
960 		} else {
961 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
962 		}
963 		if (cl_inet_listen != NULL) {
964 			sa_family_t	addr_family;
965 			uint8_t		*laddrp;
966 
967 			if (connp->conn_pkt_isv6) {
968 				addr_family = AF_INET6;
969 				laddrp =
970 				    (uint8_t *)&connp->conn_bound_source_v6;
971 			} else {
972 				addr_family = AF_INET;
973 				laddrp = (uint8_t *)&connp->conn_bound_source;
974 			}
975 			connp->conn_flags |= IPCL_CL_LISTENER;
976 			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
977 			    lport);
978 		}
979 		break;
980 
981 	case IPPROTO_SCTP:
982 		ret = ipcl_sctp_hash_insert(connp, lport);
983 		break;
984 	}
985 
986 	return (ret);
987 }
988 
989 /*
990  * ipcl_conn_hash insertion routines.
991  */
992 int
993 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
994     ipaddr_t rem, uint32_t ports)
995 {
996 	connf_t		*connfp;
997 	uint16_t	*up;
998 	conn_t		*tconnp;
999 #ifdef	IPCL_DEBUG
1000 	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1001 #endif
1002 	in_port_t	lport;
1003 	int		ret = 0;
1004 
1005 	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1006 	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1007 	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1008 	    ports, protocol));
1009 
1010 	switch (protocol) {
1011 	case IPPROTO_TCP:
1012 		if (!(connp->conn_flags & IPCL_EAGER)) {
1013 			/*
1014 			 * for a eager connection, i.e connections which
1015 			 * have just been created, the initialization is
1016 			 * already done in ip at conn_creation time, so
1017 			 * we can skip the checks here.
1018 			 */
1019 			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1020 		}
1021 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem,
1022 		    connp->conn_ports)];
1023 		mutex_enter(&connfp->connf_lock);
1024 		for (tconnp = connfp->connf_head; tconnp != NULL;
1025 		    tconnp = tconnp->conn_next) {
1026 			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1027 			    connp->conn_rem, connp->conn_src,
1028 			    connp->conn_ports)) {
1029 
1030 				/* Already have a conn. bail out */
1031 				mutex_exit(&connfp->connf_lock);
1032 				return (EADDRINUSE);
1033 			}
1034 		}
1035 		if (connp->conn_fanout != NULL) {
1036 			/*
1037 			 * Probably a XTI/TLI application trying to do a
1038 			 * rebind. Let it happen.
1039 			 */
1040 			mutex_exit(&connfp->connf_lock);
1041 			IPCL_HASH_REMOVE(connp);
1042 			mutex_enter(&connfp->connf_lock);
1043 		}
1044 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1045 		mutex_exit(&connfp->connf_lock);
1046 		break;
1047 
1048 	case IPPROTO_SCTP:
1049 		/*
1050 		 * The raw socket may have already been bound, remove it
1051 		 * from the hash first.
1052 		 */
1053 		IPCL_HASH_REMOVE(connp);
1054 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1055 		ret = ipcl_sctp_hash_insert(connp, lport);
1056 		break;
1057 
1058 	case IPPROTO_UDP:
1059 	default:
1060 		up = (uint16_t *)&ports;
1061 		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1062 		if (protocol == IPPROTO_UDP) {
1063 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1064 		} else {
1065 			connfp = &ipcl_proto_fanout[protocol];
1066 		}
1067 
1068 		if (connp->conn_rem != INADDR_ANY) {
1069 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1070 		} else if (connp->conn_src != INADDR_ANY) {
1071 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1072 		} else {
1073 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1074 		}
1075 		break;
1076 	}
1077 
1078 	return (ret);
1079 }
1080 
1081 int
1082 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1083     const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1084 {
1085 	connf_t		*connfp;
1086 	uint16_t	*up;
1087 	conn_t		*tconnp;
1088 	in_port_t	lport;
1089 	int		ret = 0;
1090 
1091 	switch (protocol) {
1092 	case IPPROTO_TCP:
1093 		/* Just need to insert a conn struct */
1094 		if (!(connp->conn_flags & IPCL_EAGER)) {
1095 			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1096 		}
1097 		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6,
1098 		    connp->conn_ports)];
1099 		mutex_enter(&connfp->connf_lock);
1100 		for (tconnp = connfp->connf_head; tconnp != NULL;
1101 		    tconnp = tconnp->conn_next) {
1102 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1103 			    connp->conn_remv6, connp->conn_srcv6,
1104 			    connp->conn_ports) &&
1105 			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1106 			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
1107 				/* Already have a conn. bail out */
1108 				mutex_exit(&connfp->connf_lock);
1109 				return (EADDRINUSE);
1110 			}
1111 		}
1112 		if (connp->conn_fanout != NULL) {
1113 			/*
1114 			 * Probably a XTI/TLI application trying to do a
1115 			 * rebind. Let it happen.
1116 			 */
1117 			mutex_exit(&connfp->connf_lock);
1118 			IPCL_HASH_REMOVE(connp);
1119 			mutex_enter(&connfp->connf_lock);
1120 		}
1121 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1122 		mutex_exit(&connfp->connf_lock);
1123 		break;
1124 
1125 	case IPPROTO_SCTP:
1126 		IPCL_HASH_REMOVE(connp);
1127 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1128 		ret = ipcl_sctp_hash_insert(connp, lport);
1129 		break;
1130 
1131 	case IPPROTO_UDP:
1132 	default:
1133 		up = (uint16_t *)&ports;
1134 		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1135 		if (protocol == IPPROTO_UDP) {
1136 			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1137 		} else {
1138 			connfp = &ipcl_proto_fanout_v6[protocol];
1139 		}
1140 
1141 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1142 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1143 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1144 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1145 		} else {
1146 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1147 		}
1148 		break;
1149 	}
1150 
1151 	return (ret);
1152 }
1153 
1154 /*
1155  * v4 packet classifying function. looks up the fanout table to
1156  * find the conn, the packet belongs to. returns the conn with
1157  * the reference held, null otherwise.
1158  */
1159 conn_t *
1160 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1161 {
1162 	ipha_t	*ipha;
1163 	connf_t	*connfp, *bind_connfp;
1164 	uint16_t lport;
1165 	uint16_t fport;
1166 	uint32_t ports;
1167 	conn_t	*connp;
1168 	uint16_t  *up;
1169 
1170 	ipha = (ipha_t *)mp->b_rptr;
1171 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1172 
1173 	switch (protocol) {
1174 	case IPPROTO_TCP:
1175 		ports = *(uint32_t *)up;
1176 		connfp =
1177 		    &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)];
1178 		mutex_enter(&connfp->connf_lock);
1179 		for (connp = connfp->connf_head; connp != NULL;
1180 		    connp = connp->conn_next) {
1181 			if (IPCL_CONN_MATCH(connp, protocol,
1182 			    ipha->ipha_src, ipha->ipha_dst, ports))
1183 				break;
1184 		}
1185 
1186 		if (connp != NULL) {
1187 			CONN_INC_REF(connp);
1188 			mutex_exit(&connfp->connf_lock);
1189 			return (connp);
1190 		}
1191 
1192 		mutex_exit(&connfp->connf_lock);
1193 
1194 		lport = up[1];
1195 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1196 		mutex_enter(&bind_connfp->connf_lock);
1197 		for (connp = bind_connfp->connf_head; connp != NULL;
1198 		    connp = connp->conn_next) {
1199 			if (IPCL_BIND_MATCH(connp, protocol,
1200 			    ipha->ipha_dst, lport) &&
1201 			    connp->conn_zoneid == zoneid)
1202 				break;
1203 		}
1204 
1205 		if (connp != NULL) {
1206 			/* Have a listner at least */
1207 			CONN_INC_REF(connp);
1208 			mutex_exit(&bind_connfp->connf_lock);
1209 			return (connp);
1210 		}
1211 
1212 		mutex_exit(&bind_connfp->connf_lock);
1213 
1214 		IPCL_DEBUG_LVL(512,
1215 		    ("ipcl_classify: couldn't classify mp = %p\n",
1216 		    (void *)mp));
1217 		break;
1218 
1219 	case IPPROTO_UDP:
1220 		lport = up[1];
1221 		fport = up[0];
1222 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1223 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1224 		mutex_enter(&connfp->connf_lock);
1225 		for (connp = connfp->connf_head; connp != NULL;
1226 		    connp = connp->conn_next) {
1227 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1228 			    fport, ipha->ipha_src) &&
1229 			    connp->conn_zoneid == zoneid)
1230 				break;
1231 		}
1232 
1233 		if (connp != NULL) {
1234 			CONN_INC_REF(connp);
1235 			mutex_exit(&connfp->connf_lock);
1236 			return (connp);
1237 		}
1238 
1239 		/*
1240 		 * We shouldn't come here for multicast/broadcast packets
1241 		 */
1242 		mutex_exit(&connfp->connf_lock);
1243 		IPCL_DEBUG_LVL(512,
1244 		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1245 		    lport, fport));
1246 		break;
1247 	}
1248 
1249 	return (NULL);
1250 }
1251 
1252 conn_t *
1253 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1254 {
1255 	ip6_t		*ip6h;
1256 	connf_t		*connfp, *bind_connfp;
1257 	uint16_t	lport;
1258 	uint16_t	fport;
1259 	tcph_t		*tcph;
1260 	uint32_t	ports;
1261 	conn_t		*connp;
1262 	uint16_t	*up;
1263 
1264 
1265 	ip6h = (ip6_t *)mp->b_rptr;
1266 
1267 	switch (protocol) {
1268 	case IPPROTO_TCP:
1269 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1270 		up = (uint16_t *)tcph->th_lport;
1271 		ports = *(uint32_t *)up;
1272 
1273 		connfp =
1274 		    &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)];
1275 		mutex_enter(&connfp->connf_lock);
1276 		for (connp = connfp->connf_head; connp != NULL;
1277 		    connp = connp->conn_next) {
1278 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1279 			    ip6h->ip6_src, ip6h->ip6_dst, ports))
1280 				break;
1281 		}
1282 
1283 		if (connp != NULL) {
1284 			CONN_INC_REF(connp);
1285 			mutex_exit(&connfp->connf_lock);
1286 			return (connp);
1287 		}
1288 
1289 		mutex_exit(&connfp->connf_lock);
1290 
1291 		lport = up[1];
1292 		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1293 		mutex_enter(&bind_connfp->connf_lock);
1294 		for (connp = bind_connfp->connf_head; connp != NULL;
1295 		    connp = connp->conn_next) {
1296 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1297 			    ip6h->ip6_dst, lport) &&
1298 			    connp->conn_zoneid == zoneid)
1299 				break;
1300 		}
1301 
1302 		if (connp != NULL) {
1303 			/* Have a listner at least */
1304 			CONN_INC_REF(connp);
1305 			mutex_exit(&bind_connfp->connf_lock);
1306 			IPCL_DEBUG_LVL(512,
1307 			    ("ipcl_classify_v6: found listner "
1308 			    "connp = %p\n", (void *)connp));
1309 
1310 			return (connp);
1311 		}
1312 
1313 		mutex_exit(&bind_connfp->connf_lock);
1314 
1315 		IPCL_DEBUG_LVL(512,
1316 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1317 		    (void *)mp));
1318 		break;
1319 
1320 	case IPPROTO_UDP:
1321 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1322 		lport = up[1];
1323 		fport = up[0];
1324 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1325 		    fport));
1326 		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1327 		mutex_enter(&connfp->connf_lock);
1328 		for (connp = connfp->connf_head; connp != NULL;
1329 		    connp = connp->conn_next) {
1330 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1331 			    fport, ip6h->ip6_src) &&
1332 			    connp->conn_zoneid == zoneid)
1333 				break;
1334 		}
1335 
1336 		if (connp != NULL) {
1337 			CONN_INC_REF(connp);
1338 			mutex_exit(&connfp->connf_lock);
1339 			return (connp);
1340 		}
1341 
1342 		/*
1343 		 * We shouldn't come here for multicast/broadcast packets
1344 		 */
1345 		mutex_exit(&connfp->connf_lock);
1346 		IPCL_DEBUG_LVL(512,
1347 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1348 		    lport, fport));
1349 		break;
1350 	}
1351 
1352 
1353 	return (NULL);
1354 }
1355 
1356 /*
1357  * wrapper around ipcl_classify_(v4,v6) routines.
1358  */
1359 conn_t *
1360 ipcl_classify(mblk_t *mp, zoneid_t zoneid)
1361 {
1362 	uint16_t	hdr_len;
1363 	ipha_t		*ipha;
1364 	uint8_t		*nexthdrp;
1365 
1366 	if (MBLKL(mp) < sizeof (ipha_t))
1367 		return (NULL);
1368 
1369 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1370 	case IPV4_VERSION:
1371 		ipha = (ipha_t *)mp->b_rptr;
1372 		hdr_len = IPH_HDR_LENGTH(ipha);
1373 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1374 		    zoneid));
1375 	case IPV6_VERSION:
1376 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1377 		    &hdr_len, &nexthdrp))
1378 			return (NULL);
1379 
1380 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid));
1381 	}
1382 
1383 	return (NULL);
1384 }
1385 
1386 conn_t *
1387 ipcl_classify_raw(uint8_t protocol, zoneid_t zoneid, uint32_t ports,
1388     ipha_t *hdr)
1389 {
1390 	struct connf_s	*connfp;
1391 	conn_t		*connp;
1392 	in_port_t	lport;
1393 	int		af;
1394 
1395 	lport = ((uint16_t *)&ports)[1];
1396 	af = IPH_HDR_VERSION(hdr);
1397 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
1398 
1399 	mutex_enter(&connfp->connf_lock);
1400 	for (connp = connfp->connf_head; connp != NULL;
1401 	    connp = connp->conn_next) {
1402 		/* We don't allow v4 fallback for v6 raw socket. */
1403 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1404 		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1405 			continue;
1406 		}
1407 		if (connp->conn_fully_bound) {
1408 			if (af == IPV4_VERSION) {
1409 				if (IPCL_CONN_MATCH(connp, protocol,
1410 				    hdr->ipha_src, hdr->ipha_dst, ports)) {
1411 					break;
1412 				}
1413 			} else {
1414 				if (IPCL_CONN_MATCH_V6(connp, protocol,
1415 				    ((ip6_t *)hdr)->ip6_src,
1416 				    ((ip6_t *)hdr)->ip6_dst, ports)) {
1417 					break;
1418 				}
1419 			}
1420 		} else {
1421 			if (af == IPV4_VERSION) {
1422 				if (IPCL_BIND_MATCH(connp, protocol,
1423 				    hdr->ipha_dst, lport)) {
1424 					break;
1425 				}
1426 			} else {
1427 				if (IPCL_BIND_MATCH_V6(connp, protocol,
1428 				    ((ip6_t *)hdr)->ip6_dst, lport)) {
1429 					break;
1430 				}
1431 			}
1432 		}
1433 	}
1434 
1435 	if (connp != NULL)
1436 		goto found;
1437 	mutex_exit(&connfp->connf_lock);
1438 
1439 	/* Try to look for a wildcard match. */
1440 	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)];
1441 	mutex_enter(&connfp->connf_lock);
1442 	for (connp = connfp->connf_head; connp != NULL;
1443 	    connp = connp->conn_next) {
1444 		/* We don't allow v4 fallback for v6 raw socket. */
1445 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1446 		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1447 			continue;
1448 		}
1449 		if (af == IPV4_VERSION) {
1450 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
1451 				break;
1452 		} else {
1453 			if (IPCL_RAW_MATCH_V6(connp, protocol,
1454 			    ((ip6_t *)hdr)->ip6_dst)) {
1455 				break;
1456 			}
1457 		}
1458 	}
1459 
1460 	if (connp != NULL)
1461 		goto found;
1462 
1463 	mutex_exit(&connfp->connf_lock);
1464 	return (NULL);
1465 
1466 found:
1467 	ASSERT(connp != NULL);
1468 	CONN_INC_REF(connp);
1469 	mutex_exit(&connfp->connf_lock);
1470 	return (connp);
1471 }
1472 
1473 /* ARGSUSED */
1474 static int
1475 ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1476 {
1477 	itc_t	*itc = (itc_t *)buf;
1478 	conn_t 	*connp = &itc->itc_conn;
1479 	tcp_t	*tcp = &itc->itc_tcp;
1480 	bzero(itc, sizeof (itc_t));
1481 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1482 	connp->conn_tcp = tcp;
1483 	connp->conn_flags = IPCL_TCPCONN;
1484 	connp->conn_ulp = IPPROTO_TCP;
1485 	tcp->tcp_connp = connp;
1486 	return (0);
1487 }
1488 
1489 /* ARGSUSED */
1490 static void
1491 ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1492 {
1493 	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1494 }
1495 
1496 /*
1497  * All conns are inserted in a global multi-list for the benefit of
1498  * walkers. The walk is guaranteed to walk all open conns at the time
1499  * of the start of the walk exactly once. This property is needed to
1500  * achieve some cleanups during unplumb of interfaces. This is achieved
1501  * as follows.
1502  *
1503  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1504  * call the insert and delete functions below at creation and deletion
1505  * time respectively. The conn never moves or changes its position in this
1506  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1507  * won't increase due to walkers, once the conn deletion has started. Note
1508  * that we can't remove the conn from the global list and then wait for
1509  * the refcnt to drop to zero, since walkers would then see a truncated
1510  * list. CONN_INCIPIENT ensures that walkers don't start looking at
1511  * conns until ip_open is ready to make them globally visible.
1512  * The global round robin multi-list locks are held only to get the
1513  * next member/insertion/deletion and contention should be negligible
1514  * if the multi-list is much greater than the number of cpus.
1515  */
1516 void
1517 ipcl_globalhash_insert(conn_t *connp)
1518 {
1519 	int	index;
1520 
1521 	/*
1522 	 * No need for atomic here. Approximate even distribution
1523 	 * in the global lists is sufficient.
1524 	 */
1525 	conn_g_index++;
1526 	index = conn_g_index & (CONN_G_HASH_SIZE - 1);
1527 
1528 	connp->conn_g_prev = NULL;
1529 	/*
1530 	 * Mark as INCIPIENT, so that walkers will ignore this
1531 	 * for now, till ip_open is ready to make it visible globally.
1532 	 */
1533 	connp->conn_state_flags |= CONN_INCIPIENT;
1534 
1535 	/* Insert at the head of the list */
1536 	mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
1537 	connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
1538 	if (connp->conn_g_next != NULL)
1539 		connp->conn_g_next->conn_g_prev = connp;
1540 	ipcl_globalhash_fanout[index].connf_head = connp;
1541 
1542 	/* The fanout bucket this conn points to */
1543 	connp->conn_g_fanout = &ipcl_globalhash_fanout[index];
1544 
1545 	mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
1546 }
1547 
1548 void
1549 ipcl_globalhash_remove(conn_t *connp)
1550 {
1551 	/*
1552 	 * We were never inserted in the global multi list.
1553 	 * IPCL_NONE variety is never inserted in the global multilist
1554 	 * since it is presumed to not need any cleanup and is transient.
1555 	 */
1556 	if (connp->conn_g_fanout == NULL)
1557 		return;
1558 
1559 	mutex_enter(&connp->conn_g_fanout->connf_lock);
1560 	if (connp->conn_g_prev != NULL)
1561 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
1562 	else
1563 		connp->conn_g_fanout->connf_head = connp->conn_g_next;
1564 	if (connp->conn_g_next != NULL)
1565 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
1566 	mutex_exit(&connp->conn_g_fanout->connf_lock);
1567 
1568 	/* Better to stumble on a null pointer than to corrupt memory */
1569 	connp->conn_g_next = NULL;
1570 	connp->conn_g_prev = NULL;
1571 }
1572 
1573 /*
1574  * Walk the list of all conn_t's in the system, calling the function provided
1575  * with the specified argument for each.
1576  * Applies to both IPv4 and IPv6.
1577  *
1578  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
1579  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
1580  * unplumbed or removed. New conn_t's that are created while we are walking
1581  * may be missed by this walk, because they are not necessarily inserted
1582  * at the tail of the list. They are new conn_t's and thus don't have any
1583  * stale pointers. The CONN_CLOSING flag ensures that no new reference
1584  * is created to the struct that is going away.
1585  */
1586 void
1587 ipcl_walk(pfv_t func, void *arg)
1588 {
1589 	int	i;
1590 	conn_t	*connp;
1591 	conn_t	*prev_connp;
1592 
1593 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
1594 		mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1595 		prev_connp = NULL;
1596 		connp = ipcl_globalhash_fanout[i].connf_head;
1597 		while (connp != NULL) {
1598 			mutex_enter(&connp->conn_lock);
1599 			if (connp->conn_state_flags &
1600 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
1601 				mutex_exit(&connp->conn_lock);
1602 				connp = connp->conn_g_next;
1603 				continue;
1604 			}
1605 			CONN_INC_REF_LOCKED(connp);
1606 			mutex_exit(&connp->conn_lock);
1607 			mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1608 			(*func)(connp, arg);
1609 			if (prev_connp != NULL)
1610 				CONN_DEC_REF(prev_connp);
1611 			mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1612 			prev_connp = connp;
1613 			connp = connp->conn_g_next;
1614 		}
1615 		mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1616 		if (prev_connp != NULL)
1617 			CONN_DEC_REF(prev_connp);
1618 	}
1619 }
1620 
1621 /*
1622  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
1623  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1624  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1625  * (peer tcp in at least ESTABLISHED state).
1626  */
1627 conn_t *
1628 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
1629 {
1630 	uint32_t ports;
1631 	uint16_t *pports = (uint16_t *)&ports;
1632 	connf_t	*connfp;
1633 	conn_t	*tconnp;
1634 	boolean_t zone_chk;
1635 
1636 	/*
1637 	 * If either the source of destination address is loopback, then
1638 	 * both endpoints must be in the same Zone.  Otherwise, both of
1639 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1640 	 * state) and the endpoints may reside in different Zones.
1641 	 */
1642 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
1643 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
1644 
1645 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1646 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1647 
1648 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1649 
1650 	mutex_enter(&connfp->connf_lock);
1651 	for (tconnp = connfp->connf_head; tconnp != NULL;
1652 	    tconnp = tconnp->conn_next) {
1653 
1654 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1655 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1656 		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1657 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1658 
1659 			ASSERT(tconnp != connp);
1660 			CONN_INC_REF(tconnp);
1661 			mutex_exit(&connfp->connf_lock);
1662 			return (tconnp);
1663 		}
1664 	}
1665 	mutex_exit(&connfp->connf_lock);
1666 	return (NULL);
1667 }
1668 
1669 /*
1670  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
1671  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1672  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1673  * (peer tcp in at least ESTABLISHED state).
1674  */
1675 conn_t *
1676 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
1677 {
1678 	uint32_t ports;
1679 	uint16_t *pports = (uint16_t *)&ports;
1680 	connf_t	*connfp;
1681 	conn_t	*tconnp;
1682 	boolean_t zone_chk;
1683 
1684 	/*
1685 	 * If either the source of destination address is loopback, then
1686 	 * both endpoints must be in the same Zone.  Otherwise, both of
1687 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1688 	 * state) and the endpoints may reside in different Zones.  We
1689 	 * don't do Zone check for link local address(es) because the
1690 	 * current Zone implementation treats each link local address as
1691 	 * being unique per system node, i.e. they belong to global Zone.
1692 	 */
1693 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
1694 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
1695 
1696 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1697 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1698 
1699 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1700 
1701 	mutex_enter(&connfp->connf_lock);
1702 	for (tconnp = connfp->connf_head; tconnp != NULL;
1703 	    tconnp = tconnp->conn_next) {
1704 
1705 		/* We skip tcp_bound_if check here as this is loopback tcp */
1706 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1707 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1708 		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1709 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1710 
1711 			ASSERT(tconnp != connp);
1712 			CONN_INC_REF(tconnp);
1713 			mutex_exit(&connfp->connf_lock);
1714 			return (tconnp);
1715 		}
1716 	}
1717 	mutex_exit(&connfp->connf_lock);
1718 	return (NULL);
1719 }
1720 
1721 /*
1722  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1723  * Returns with conn reference held. Caller must call CONN_DEC_REF.
1724  * Only checks for connected entries i.e. no INADDR_ANY checks.
1725  */
1726 conn_t *
1727 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
1728 {
1729 	uint32_t ports;
1730 	uint16_t *pports;
1731 	connf_t	*connfp;
1732 	conn_t	*tconnp;
1733 
1734 	pports = (uint16_t *)&ports;
1735 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1736 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1737 
1738 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1739 
1740 	mutex_enter(&connfp->connf_lock);
1741 	for (tconnp = connfp->connf_head; tconnp != NULL;
1742 	    tconnp = tconnp->conn_next) {
1743 
1744 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1745 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1746 		    tconnp->conn_tcp->tcp_state >= min_state) {
1747 
1748 			CONN_INC_REF(tconnp);
1749 			mutex_exit(&connfp->connf_lock);
1750 			return (tconnp);
1751 		}
1752 	}
1753 	mutex_exit(&connfp->connf_lock);
1754 	return (NULL);
1755 }
1756 
1757 /*
1758  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1759  * Returns with conn reference held. Caller must call CONN_DEC_REF.
1760  * Only checks for connected entries i.e. no INADDR_ANY checks.
1761  * Match on ifindex in addition to addresses.
1762  */
1763 conn_t *
1764 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
1765     uint_t ifindex)
1766 {
1767 	tcp_t	*tcp;
1768 	uint32_t ports;
1769 	uint16_t *pports;
1770 	connf_t	*connfp;
1771 	conn_t	*tconnp;
1772 
1773 	pports = (uint16_t *)&ports;
1774 	pports[0] = tcpha->tha_fport;
1775 	pports[1] = tcpha->tha_lport;
1776 
1777 	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1778 
1779 	mutex_enter(&connfp->connf_lock);
1780 	for (tconnp = connfp->connf_head; tconnp != NULL;
1781 	    tconnp = tconnp->conn_next) {
1782 
1783 		tcp = tconnp->conn_tcp;
1784 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1785 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1786 		    tcp->tcp_state >= min_state &&
1787 		    (tcp->tcp_bound_if == 0 ||
1788 		    tcp->tcp_bound_if == ifindex)) {
1789 
1790 			CONN_INC_REF(tconnp);
1791 			mutex_exit(&connfp->connf_lock);
1792 			return (tconnp);
1793 		}
1794 	}
1795 	mutex_exit(&connfp->connf_lock);
1796 	return (NULL);
1797 }
1798 
1799 /*
1800  * To find a TCP listening connection matching the incoming segment.
1801  */
1802 conn_t *
1803 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
1804 {
1805 	connf_t		*bind_connfp;
1806 	conn_t		*connp;
1807 	tcp_t		*tcp;
1808 
1809 	/*
1810 	 * Avoid false matches for packets sent to an IP destination of
1811 	 * all zeros.
1812 	 */
1813 	if (laddr == 0)
1814 		return (NULL);
1815 
1816 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1817 	mutex_enter(&bind_connfp->connf_lock);
1818 	for (connp = bind_connfp->connf_head; connp != NULL;
1819 	    connp = connp->conn_next) {
1820 		tcp = connp->conn_tcp;
1821 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
1822 		    connp->conn_zoneid == zoneid &&
1823 		    (tcp->tcp_listener == NULL)) {
1824 			CONN_INC_REF(connp);
1825 			mutex_exit(&bind_connfp->connf_lock);
1826 			return (connp);
1827 		}
1828 	}
1829 	mutex_exit(&bind_connfp->connf_lock);
1830 	return (NULL);
1831 }
1832 
1833 
1834 conn_t *
1835 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
1836     zoneid_t zoneid)
1837 {
1838 	connf_t		*bind_connfp;
1839 	conn_t		*connp = NULL;
1840 	tcp_t		*tcp;
1841 
1842 	/*
1843 	 * Avoid false matches for packets sent to an IP destination of
1844 	 * all zeros.
1845 	 */
1846 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
1847 		return (NULL);
1848 
1849 
1850 	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1851 	mutex_enter(&bind_connfp->connf_lock);
1852 	for (connp = bind_connfp->connf_head; connp != NULL;
1853 	    connp = connp->conn_next) {
1854 		tcp = connp->conn_tcp;
1855 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
1856 		    connp->conn_zoneid == zoneid &&
1857 		    (tcp->tcp_bound_if == 0 ||
1858 		    tcp->tcp_bound_if == ifindex) &&
1859 		    tcp->tcp_listener == NULL) {
1860 			CONN_INC_REF(connp);
1861 			mutex_exit(&bind_connfp->connf_lock);
1862 			return (connp);
1863 		}
1864 	}
1865 	mutex_exit(&bind_connfp->connf_lock);
1866 	return (NULL);
1867 }
1868 
1869 /*
1870  * ipcl_get_next_conn
1871  *	get the next entry in the conn global list
1872  *	and put a reference on the next_conn.
1873  *	decrement the reference on the current conn.
1874  *
1875  * This is an iterator based walker function that also provides for
1876  * some selection by the caller. It walks through the conn_hash bucket
1877  * searching for the next valid connp in the list, and selects connections
1878  * that are neither closed nor condemned. It also REFHOLDS the conn
1879  * thus ensuring that the conn exists when the caller uses the conn.
1880  */
1881 conn_t *
1882 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
1883 {
1884 	conn_t	*next_connp;
1885 
1886 	if (connfp == NULL)
1887 		return (NULL);
1888 
1889 	mutex_enter(&connfp->connf_lock);
1890 
1891 	next_connp = (connp == NULL) ?
1892 	    connfp->connf_head : connp->conn_g_next;
1893 
1894 	while (next_connp != NULL) {
1895 		mutex_enter(&next_connp->conn_lock);
1896 		if (!(next_connp->conn_flags & conn_flags) ||
1897 		    (next_connp->conn_state_flags &
1898 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
1899 			/*
1900 			 * This conn has been condemned or
1901 			 * is closing, or the flags don't match
1902 			 */
1903 			mutex_exit(&next_connp->conn_lock);
1904 			next_connp = next_connp->conn_g_next;
1905 			continue;
1906 		}
1907 		CONN_INC_REF_LOCKED(next_connp);
1908 		mutex_exit(&next_connp->conn_lock);
1909 		break;
1910 	}
1911 
1912 	mutex_exit(&connfp->connf_lock);
1913 
1914 	if (connp != NULL)
1915 		CONN_DEC_REF(connp);
1916 
1917 	return (next_connp);
1918 }
1919 
1920 #ifdef CONN_DEBUG
1921 /*
1922  * Trace of the last NBUF refhold/refrele
1923  */
1924 int
1925 conn_trace_ref(conn_t *connp)
1926 {
1927 	int	last;
1928 	conn_trace_t	*ctb;
1929 
1930 	ASSERT(MUTEX_HELD(&connp->conn_lock));
1931 	last = connp->conn_trace_last;
1932 	last++;
1933 	if (last == CONN_TRACE_MAX)
1934 		last = 0;
1935 
1936 	ctb = &connp->conn_trace_buf[last];
1937 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1938 	connp->conn_trace_last = last;
1939 	return (1);
1940 }
1941 
1942 int
1943 conn_untrace_ref(conn_t *connp)
1944 {
1945 	int	last;
1946 	conn_trace_t	*ctb;
1947 
1948 	ASSERT(MUTEX_HELD(&connp->conn_lock));
1949 	last = connp->conn_trace_last;
1950 	last++;
1951 	if (last == CONN_TRACE_MAX)
1952 		last = 0;
1953 
1954 	ctb = &connp->conn_trace_buf[last];
1955 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1956 	connp->conn_trace_last = last;
1957 	return (1);
1958 }
1959 #endif
1960