xref: /illumos-gate/usr/src/uts/common/inet/ip/ipclassifier.c (revision 3fe455549728ac525df3be56130ad8e075d645d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24  * Copyright 2022 Joyent, Inc.
25  * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org>
26  */
27 
28 /*
29  * IP PACKET CLASSIFIER
30  *
31  * The IP packet classifier provides mapping between IP packets and persistent
32  * connection state for connection-oriented protocols. It also provides
33  * interface for managing connection states.
34  *
35  * The connection state is kept in conn_t data structure and contains, among
36  * other things:
37  *
38  *	o local/remote address and ports
39  *	o Transport protocol
40  *	o squeue for the connection (for TCP only)
41  *	o reference counter
42  *	o Connection state
43  *	o hash table linkage
44  *	o interface/ire information
45  *	o credentials
46  *	o ipsec policy
47  *	o send and receive functions.
48  *	o mutex lock.
49  *
50  * Connections use a reference counting scheme. They are freed when the
51  * reference counter drops to zero. A reference is incremented when connection
52  * is placed in a list or table, when incoming packet for the connection arrives
53  * and when connection is processed via squeue (squeue processing may be
54  * asynchronous and the reference protects the connection from being destroyed
55  * before its processing is finished).
56  *
57  * conn_recv is used to pass up packets to the ULP.
58  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
59  * a listener, and changes to tcp_input_listener as the listener has picked a
60  * good squeue. For other cases it is set to tcp_input_data.
61  *
62  * conn_recvicmp is used to pass up ICMP errors to the ULP.
63  *
64  * Classifier uses several hash tables:
65  *
66  *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
67  *	ipcl_bind_fanout:	contains all connections in BOUND state
68  *	ipcl_proto_fanout:	IPv4 protocol fanout
69  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
70  *	ipcl_udp_fanout:	contains all UDP connections
71  *	ipcl_iptun_fanout:	contains all IP tunnel connections
72  *	ipcl_globalhash_fanout:	contains all connections
73  *
74  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
75  * which need to view all existing connections.
76  *
77  * All tables are protected by per-bucket locks. When both per-bucket lock and
78  * connection lock need to be held, the per-bucket lock should be acquired
79  * first, followed by the connection lock.
80  *
81  * All functions doing search in one of these tables increment a reference
82  * counter on the connection found (if any). This reference should be dropped
83  * when the caller has finished processing the connection.
84  *
85  *
86  * INTERFACES:
87  * ===========
88  *
89  * Connection Lookup:
90  * ------------------
91  *
92  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
93  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
94  *
95  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
96  * it can't find any associated connection. If the connection is found, its
97  * reference counter is incremented.
98  *
99  *	mp:	mblock, containing packet header. The full header should fit
100  *		into a single mblock. It should also contain at least full IP
101  *		and TCP or UDP header.
102  *
103  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
104  *
105  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
106  *		 the packet.
107  *
108  *	ira->ira_zoneid: The zone in which the returned connection must be; the
109  *		zoneid corresponding to the ire_zoneid on the IRE located for
110  *		the packet's destination address.
111  *
112  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
113  *		IRAF_TX_SHARED_ADDR flags
114  *
115  *	For TCP connections, the lookup order is as follows:
116  *		5-tuple {src, dst, protocol, local port, remote port}
117  *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, local port, protocol} lookup in
 *			ipcl_bind_fanout table.
120  *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle the case where a packet belongs
 *	to multiple UDP clients; that case is handled in IP itself.
125  *
126  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
127  * determine which actual zone gets the segment.  This is used only in a
128  * labeled environment.  The matching rules are:
129  *
130  *	- If it's not a multilevel port, then the label on the packet selects
131  *	  the zone.  Unlabeled packets are delivered to the global zone.
132  *
133  *	- If it's a multilevel port, then only the zone registered to receive
134  *	  packets on that port matches.
135  *
136  * Also, in a labeled environment, packet labels need to be checked.  For fully
137  * bound TCP connections, we can assume that the packet label was checked
138  * during connection establishment, and doesn't need to be checked on each
139  * packet.  For others, though, we need to check for strict equality or, for
140  * multilevel ports, membership in the range or set.  This part currently does
141  * a tnrh lookup on each packet, but could be optimized to use cached results
142  * if that were necessary.  (SCTP doesn't come through here, but if it did,
143  * we would apply the same rules as TCP.)
144  *
145  * An implication of the above is that fully-bound TCP sockets must always use
146  * distinct 4-tuples; they can't be discriminated by label alone.
147  *
148  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
149  * as there's no connection set-up handshake and no shared state.
150  *
151  * Labels on looped-back packets within a single zone do not need to be
152  * checked, as all processes in the same zone have the same label.
153  *
154  * Finally, for unlabeled packets received by a labeled system, special rules
155  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
156  * socket in the zone whose label matches the default label of the sender, if
157  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
158  * receiver's label must dominate the sender's default label.
159  *
160  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
161  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
162  *					 ip_stack);
163  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
167  *
168  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
169  *					 zoneid, ip_stack);
170  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
171  *					 zoneid, ip_stack);
172  *
173  *	Lookup routine to find a listener with the tuple {lport, laddr,
174  *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
175  *	parameter interface index is also compared.
176  *
177  * void ipcl_walk(func, arg, ip_stack)
178  *
179  *	Apply 'func' to every connection available. The 'func' is called as
180  *	(*func)(connp, arg). The walk is non-atomic so connections may be
181  *	created and destroyed during the walk. The CONN_CONDEMNED and
182  *	CONN_INCIPIENT flags ensure that connections which are newly created
183  *	or being destroyed are not selected by the walker.
184  *
185  * Table Updates
186  * -------------
187  *
188  * int ipcl_conn_insert(connp);
189  * int ipcl_conn_insert_v4(connp);
190  * int ipcl_conn_insert_v6(connp);
191  *
192  *	Insert 'connp' in the ipcl_conn_fanout.
193  *	Arguments :
194  *		connp		conn_t to be inserted
195  *
196  *	Return value :
197  *		0		if connp was inserted
198  *		EADDRINUSE	if the connection with the same tuple
199  *				already exists.
200  *
201  * int ipcl_bind_insert(connp);
202  * int ipcl_bind_insert_v4(connp);
203  * int ipcl_bind_insert_v6(connp);
204  *
205  *	Insert 'connp' in ipcl_bind_fanout.
206  *	Arguments :
207  *		connp		conn_t to be inserted
208  *
209  *
210  * void ipcl_hash_remove(connp);
211  *
212  *	Removes the 'connp' from the connection fanout table.
213  *
214  * Connection Creation/Destruction
215  * -------------------------------
216  *
217  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
218  *
219  *	Creates a new conn based on the type flag, inserts it into
220  *	globalhash table.
221  *
222  *	type:	This flag determines the type of conn_t which needs to be
223  *		created i.e., which kmem_cache it comes from.
224  *		IPCL_TCPCONN	indicates a TCP connection
225  *		IPCL_SCTPCONN	indicates a SCTP connection
226  *		IPCL_UDPCONN	indicates a UDP conn_t.
227  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
228  *		IPCL_RTSCONN	indicates a RTS conn_t.
229  *		IPCL_IPCCONN	indicates all other connections.
230  *
231  * void ipcl_conn_destroy(connp)
232  *
233  *	Destroys the connection state, removes it from the global
234  *	connection hash table and frees its memory.
235  */
236 
237 #include <sys/types.h>
238 #include <sys/stream.h>
239 #include <sys/stropts.h>
240 #include <sys/sysmacros.h>
241 #include <sys/strsubr.h>
242 #include <sys/strsun.h>
243 #define	_SUN_TPI_VERSION 2
244 #include <sys/ddi.h>
245 #include <sys/cmn_err.h>
246 #include <sys/debug.h>
247 
248 #include <sys/systm.h>
249 #include <sys/param.h>
250 #include <sys/kmem.h>
251 #include <sys/isa_defs.h>
252 #include <inet/common.h>
253 #include <netinet/ip6.h>
254 #include <netinet/icmp6.h>
255 
256 #include <inet/ip.h>
257 #include <inet/ip_if.h>
258 #include <inet/ip_ire.h>
259 #include <inet/ip6.h>
260 #include <inet/ip_ndp.h>
261 #include <inet/ip_impl.h>
262 #include <inet/udp_impl.h>
263 #include <inet/sctp_ip.h>
264 #include <inet/sctp/sctp_impl.h>
265 #include <inet/rawip_impl.h>
266 #include <inet/rts_impl.h>
267 #include <inet/iptun/iptun_impl.h>
268 
269 #include <sys/cpuvar.h>
270 
271 #include <inet/ipclassifier.h>
272 #include <inet/tcp.h>
273 #include <inet/ipsec_impl.h>
274 
275 #include <sys/tsol/tnet.h>
276 #include <sys/sockio.h>
277 
/* Old value for compatibility.  Settable in /etc/system. */
uint_t tcp_conn_hash_size = 0;

/*
 * New value.  Zero means choose automatically.  Settable in /etc/system.
 *
 * When both hash-size tunables are zero, ipcl_init() derives the requested
 * connection fanout size from free memory divided by
 * ipcl_conn_hash_memfactor, limited to ipcl_conn_hash_maxsize.  In every
 * case the requested size is then mapped onto an entry of the P2Ps() prime
 * table.
 */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size; copied into each ip_stack_t by ipcl_init() */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;
299 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The initializer is indexed by N.  Entries 0-2 are 0 and the list ends
 * with a 0 sentinel; ipcl_init() starts its search at index 9 and treats a
 * 0 result as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
309 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * The filler member is CACHE_ALIGN(conn_s) bytes long, so the union is at
 * least that large.  ipcl_g_init() sizes the tcp/udp/rawip/rts cache
 * objects as sizeof (itc_t) plus the per-protocol structure.
 */
typedef union itc_s {
	conn_t	itc_conn;	/* the connection state itself */
	char	itcu_filler[CACHE_ALIGN(conn_s)];	/* size padding only */
} itc_t;
318 
/*
 * kmem caches that conn_t structures are allocated from, one per
 * connection type.  All but sctp_conn_cache are created in ipcl_g_init()
 * and destroyed in ipcl_g_destroy(); sctp_conn_cache is defined outside
 * this file.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk helpers, defined outside this file. */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/*
 * Constructor/destructor pairs handed to kmem_cache_create() for each of
 * the caches above.
 */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);
343 
344 /*
345  * Global (for all stack instances) init routine
346  */
347 void
348 ipcl_g_init(void)
349 {
350 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
351 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
352 	    ip_conn_constructor, ip_conn_destructor,
353 	    NULL, NULL, NULL, 0);
354 
355 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
356 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
357 	    tcp_conn_constructor, tcp_conn_destructor,
358 	    tcp_conn_reclaim, NULL, NULL, 0);
359 
360 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
361 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
362 	    udp_conn_constructor, udp_conn_destructor,
363 	    NULL, NULL, NULL, 0);
364 
365 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
366 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
367 	    rawip_conn_constructor, rawip_conn_destructor,
368 	    NULL, NULL, NULL, 0);
369 
370 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
371 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
372 	    rts_conn_constructor, rts_conn_destructor,
373 	    NULL, NULL, NULL, 0);
374 }
375 
376 /*
377  * ipclassifier intialization routine, sets up hash tables.
378  */
379 void
380 ipcl_init(ip_stack_t *ipst)
381 {
382 	int i;
383 	int sizes[] = P2Ps();
384 
385 	/*
386 	 * Calculate size of conn fanout table from /etc/system settings
387 	 */
388 	if (ipcl_conn_hash_size != 0) {
389 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
390 	} else if (tcp_conn_hash_size != 0) {
391 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
392 	} else {
393 		extern pgcnt_t freemem;
394 
395 		ipst->ips_ipcl_conn_fanout_size =
396 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
397 
398 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
399 			ipst->ips_ipcl_conn_fanout_size =
400 			    ipcl_conn_hash_maxsize;
401 		}
402 	}
403 
404 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
405 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
406 			break;
407 		}
408 	}
409 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
410 		/* Out of range, use the 2^16 value */
411 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
412 	}
413 
414 	/* Take values from /etc/system */
415 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
416 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
417 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
418 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
419 
420 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
421 
422 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
423 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
424 
425 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
426 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
427 		    MUTEX_DEFAULT, NULL);
428 	}
429 
430 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
431 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
432 
433 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
434 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
435 		    MUTEX_DEFAULT, NULL);
436 	}
437 
438 	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
439 	    sizeof (connf_t), KM_SLEEP);
440 	for (i = 0; i < IPPROTO_MAX; i++) {
441 		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
442 		    MUTEX_DEFAULT, NULL);
443 	}
444 
445 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
446 	    sizeof (connf_t), KM_SLEEP);
447 	for (i = 0; i < IPPROTO_MAX; i++) {
448 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
449 		    MUTEX_DEFAULT, NULL);
450 	}
451 
452 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
453 	mutex_init(&ipst->ips_rts_clients->connf_lock,
454 	    NULL, MUTEX_DEFAULT, NULL);
455 
456 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
457 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
458 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
459 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
460 		    MUTEX_DEFAULT, NULL);
461 	}
462 
463 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
464 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
465 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
466 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
467 		    MUTEX_DEFAULT, NULL);
468 	}
469 
470 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
471 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
472 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
473 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
474 		    MUTEX_DEFAULT, NULL);
475 	}
476 
477 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
478 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
479 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
480 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
481 		    NULL, MUTEX_DEFAULT, NULL);
482 	}
483 }
484 
485 void
486 ipcl_g_destroy(void)
487 {
488 	kmem_cache_destroy(ip_conn_cache);
489 	kmem_cache_destroy(tcp_conn_cache);
490 	kmem_cache_destroy(udp_conn_cache);
491 	kmem_cache_destroy(rawip_conn_cache);
492 	kmem_cache_destroy(rts_conn_cache);
493 }
494 
495 /*
496  * All user-level and kernel use of the stack must be gone
497  * by now.
498  */
499 void
500 ipcl_destroy(ip_stack_t *ipst)
501 {
502 	int i;
503 
504 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
505 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
506 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
507 	}
508 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
509 	    sizeof (connf_t));
510 	ipst->ips_ipcl_conn_fanout = NULL;
511 
512 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
513 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
514 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
515 	}
516 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
517 	    sizeof (connf_t));
518 	ipst->ips_ipcl_bind_fanout = NULL;
519 
520 	for (i = 0; i < IPPROTO_MAX; i++) {
521 		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
522 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
523 	}
524 	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
525 	    IPPROTO_MAX * sizeof (connf_t));
526 	ipst->ips_ipcl_proto_fanout_v4 = NULL;
527 
528 	for (i = 0; i < IPPROTO_MAX; i++) {
529 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
530 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
531 	}
532 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
533 	    IPPROTO_MAX * sizeof (connf_t));
534 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
535 
536 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
537 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
538 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
539 	}
540 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
541 	    sizeof (connf_t));
542 	ipst->ips_ipcl_udp_fanout = NULL;
543 
544 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
545 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
546 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
547 	}
548 	kmem_free(ipst->ips_ipcl_iptun_fanout,
549 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
550 	ipst->ips_ipcl_iptun_fanout = NULL;
551 
552 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
553 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
554 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
555 	}
556 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
557 	    sizeof (connf_t));
558 	ipst->ips_ipcl_raw_fanout = NULL;
559 
560 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
561 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
562 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
563 	}
564 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
565 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
566 	ipst->ips_ipcl_globalhash_fanout = NULL;
567 
568 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
569 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
570 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
571 	ipst->ips_rts_clients = NULL;
572 }
573 
574 /*
575  * conn creation routine. initialize the conn, sets the reference
576  * and inserts it in the global hash table.
577  */
578 conn_t *
579 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
580 {
581 	conn_t	*connp;
582 	struct kmem_cache *conn_cache;
583 
584 	switch (type) {
585 	case IPCL_SCTPCONN:
586 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
587 			return (NULL);
588 		sctp_conn_init(connp);
589 		netstack_hold(ns);
590 		connp->conn_netstack = ns;
591 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
592 		connp->conn_ixa->ixa_conn_id = (long)connp;
593 		ipcl_globalhash_insert(connp);
594 		return (connp);
595 
596 	case IPCL_TCPCONN:
597 		conn_cache = tcp_conn_cache;
598 		break;
599 
600 	case IPCL_UDPCONN:
601 		conn_cache = udp_conn_cache;
602 		break;
603 
604 	case IPCL_RAWIPCONN:
605 		conn_cache = rawip_conn_cache;
606 		break;
607 
608 	case IPCL_RTSCONN:
609 		conn_cache = rts_conn_cache;
610 		break;
611 
612 	case IPCL_IPCCONN:
613 		conn_cache = ip_conn_cache;
614 		break;
615 
616 	default:
617 		conn_cache = NULL;
618 		connp = NULL;
619 		ASSERT(0);
620 	}
621 
622 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
623 		return (NULL);
624 
625 	connp->conn_ref = 1;
626 	netstack_hold(ns);
627 	connp->conn_netstack = ns;
628 	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
629 	connp->conn_ixa->ixa_conn_id = (long)connp;
630 	ipcl_globalhash_insert(connp);
631 	return (connp);
632 }
633 
634 void
635 ipcl_conn_destroy(conn_t *connp)
636 {
637 	mblk_t	*mp;
638 	netstack_t	*ns = connp->conn_netstack;
639 
640 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
641 	ASSERT(connp->conn_ref == 0);
642 	ASSERT(connp->conn_ioctlref == 0);
643 
644 	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
645 
646 	if (connp->conn_cred != NULL) {
647 		crfree(connp->conn_cred);
648 		connp->conn_cred = NULL;
649 		/* ixa_cred done in ipcl_conn_cleanup below */
650 	}
651 
652 	if (connp->conn_ht_iphc != NULL) {
653 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
654 		connp->conn_ht_iphc = NULL;
655 		connp->conn_ht_iphc_allocated = 0;
656 		connp->conn_ht_iphc_len = 0;
657 		connp->conn_ht_ulp = NULL;
658 		connp->conn_ht_ulp_len = 0;
659 	}
660 	ip_pkt_free(&connp->conn_xmit_ipp);
661 
662 	ipcl_globalhash_remove(connp);
663 
664 	if (connp->conn_latch != NULL) {
665 		IPLATCH_REFRELE(connp->conn_latch);
666 		connp->conn_latch = NULL;
667 	}
668 	if (connp->conn_latch_in_policy != NULL) {
669 		IPPOL_REFRELE(connp->conn_latch_in_policy);
670 		connp->conn_latch_in_policy = NULL;
671 	}
672 	if (connp->conn_latch_in_action != NULL) {
673 		IPACT_REFRELE(connp->conn_latch_in_action);
674 		connp->conn_latch_in_action = NULL;
675 	}
676 	if (connp->conn_policy != NULL) {
677 		IPPH_REFRELE(connp->conn_policy, ns);
678 		connp->conn_policy = NULL;
679 	}
680 
681 	if (connp->conn_ipsec_opt_mp != NULL) {
682 		freemsg(connp->conn_ipsec_opt_mp);
683 		connp->conn_ipsec_opt_mp = NULL;
684 	}
685 
686 	if (connp->conn_flags & IPCL_TCPCONN) {
687 		tcp_t *tcp = connp->conn_tcp;
688 
689 		tcp_free(tcp);
690 		mp = tcp->tcp_timercache;
691 
692 		tcp->tcp_tcps = NULL;
693 
694 		/*
695 		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
696 		 * the mblk.
697 		 */
698 		if (tcp->tcp_rsrv_mp != NULL) {
699 			freeb(tcp->tcp_rsrv_mp);
700 			tcp->tcp_rsrv_mp = NULL;
701 			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
702 		}
703 
704 		ipcl_conn_cleanup(connp);
705 		connp->conn_flags = IPCL_TCPCONN;
706 		if (ns != NULL) {
707 			ASSERT(tcp->tcp_tcps == NULL);
708 			connp->conn_netstack = NULL;
709 			connp->conn_ixa->ixa_ipst = NULL;
710 			netstack_rele(ns);
711 		}
712 
713 		bzero(tcp, sizeof (tcp_t));
714 
715 		tcp->tcp_timercache = mp;
716 		tcp->tcp_connp = connp;
717 		kmem_cache_free(tcp_conn_cache, connp);
718 		return;
719 	}
720 
721 	if (connp->conn_flags & IPCL_SCTPCONN) {
722 		ASSERT(ns != NULL);
723 		sctp_free(connp);
724 		return;
725 	}
726 
727 	ipcl_conn_cleanup(connp);
728 	if (ns != NULL) {
729 		connp->conn_netstack = NULL;
730 		connp->conn_ixa->ixa_ipst = NULL;
731 		netstack_rele(ns);
732 	}
733 
734 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
735 	if (connp->conn_flags & IPCL_UDPCONN) {
736 		connp->conn_flags = IPCL_UDPCONN;
737 		kmem_cache_free(udp_conn_cache, connp);
738 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
739 		connp->conn_flags = IPCL_RAWIPCONN;
740 		connp->conn_proto = IPPROTO_ICMP;
741 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
742 		kmem_cache_free(rawip_conn_cache, connp);
743 	} else if (connp->conn_flags & IPCL_RTSCONN) {
744 		connp->conn_flags = IPCL_RTSCONN;
745 		kmem_cache_free(rts_conn_cache, connp);
746 	} else {
747 		connp->conn_flags = IPCL_IPCCONN;
748 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
749 		ASSERT(connp->conn_priv == NULL);
750 		kmem_cache_free(ip_conn_cache, connp);
751 	}
752 }
753 
754 /*
755  * Running in cluster mode - deregister listener information
756  */
757 static void
758 ipcl_conn_unlisten(conn_t *connp)
759 {
760 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
761 	ASSERT(connp->conn_lport != 0);
762 
763 	if (cl_inet_unlisten != NULL) {
764 		sa_family_t	addr_family;
765 		uint8_t		*laddrp;
766 
767 		if (connp->conn_ipversion == IPV6_VERSION) {
768 			addr_family = AF_INET6;
769 			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
770 		} else {
771 			addr_family = AF_INET;
772 			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
773 		}
774 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
775 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
776 	}
777 	connp->conn_flags &= ~IPCL_CL_LISTENER;
778 }
779 
/*
 * Unlink 'connp' from the fanout bucket recorded in conn_fanout, if any.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * The caller must not hold conn_lock (asserted).  When conn_fanout is NULL
 * the macro does nothing.  Otherwise, with the bucket lock held, it fixes
 * up the neighbours' links (or the bucket head), clears the conn's own
 * linkage, calls ipcl_conn_unlisten() if IPCL_CL_LISTENER is set, and
 * drops one reference with CONN_DEC_REF() before releasing the lock.
 *
 * The expansion is a bare brace block that declares a local named
 * 'connfp' and evaluates 'connp' many times, so pass a side-effect-free
 * expression.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
808 
809 void
810 ipcl_hash_remove(conn_t *connp)
811 {
812 	uint8_t		protocol = connp->conn_proto;
813 
814 	IPCL_HASH_REMOVE(connp);
815 	if (protocol == IPPROTO_RSVP)
816 		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
817 }
818 
819 /*
820  * The whole purpose of this function is allow removal of
821  * a conn_t from the connected hash for timewait reclaim.
822  * This is essentially a TW reclaim fastpath where timewait
823  * collector checks under fanout lock (so no one else can
824  * get access to the conn_t) that refcnt is 2 i.e. one for
825  * TCP and one for the classifier hash list. If ref count
826  * is indeed 2, we can just remove the conn under lock and
827  * avoid cleaning up the conn under squeue. This gives us
828  * improved performance.
829  */
830 void
831 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
832 {
833 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
834 	ASSERT(MUTEX_HELD(&connp->conn_lock));
835 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
836 
837 	if ((connp)->conn_next != NULL) {
838 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
839 	}
840 	if ((connp)->conn_prev != NULL) {
841 		(connp)->conn_prev->conn_next = (connp)->conn_next;
842 	} else {
843 		connfp->connf_head = (connp)->conn_next;
844 	}
845 	(connp)->conn_fanout = NULL;
846 	(connp)->conn_next = NULL;
847 	(connp)->conn_prev = NULL;
848 	(connp)->conn_flags |= IPCL_REMOVED;
849 	ASSERT((connp)->conn_ref == 2);
850 	(connp)->conn_ref--;
851 }
852 
/*
 * Insert 'connp' at the head of bucket 'connfp'.  The macro itself takes
 * no lock; its only visible user, IPCL_HASH_INSERT_CONNECTED(), holds
 * connf_lock around it.
 *
 * The conn must not currently be linked into any bucket (asserted).  Marks
 * the conn IPCL_CONNECTED, clears IPCL_REMOVED and takes a reference on
 * behalf of the hash list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
867 
/*
 * Move 'connp' into bucket 'connfp' as a connected entry: first remove it
 * from whatever bucket it is in now, then insert it at the head of
 * 'connfp' under that bucket's lock.  The two steps are not atomic with
 * respect to each other; the old bucket's lock is released before the new
 * one is taken.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
874 
/*
 * Move 'connp' into bucket 'connfp' as a bound entry.
 *
 * After removing the conn from its current bucket, the list is walked
 * under the bucket lock until the first entry whose conn_laddr_v6
 * satisfies _IPCL_V4_MATCH_ANY(); 'connp' is linked in immediately before
 * that entry, or at the tail when there is none.  Entries that do not
 * satisfy _IPCL_V4_MATCH_ANY() therefore stay ahead of the new conn.
 *
 * Marks the conn IPCL_BOUND, clears IPCL_REMOVED and takes a reference on
 * behalf of the hash list.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
901 
902 #define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
903 	conn_t **list, *prev, *next;					\
904 	boolean_t isv4mapped =						\
905 	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
906 	IPCL_HASH_REMOVE((connp));					\
907 	mutex_enter(&(connfp)->connf_lock);				\
908 	list = &(connfp)->connf_head;					\
909 	prev = NULL;							\
910 	while ((next = *list) != NULL) {				\
911 		if (isv4mapped &&					\
912 		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
913 		    connp->conn_zoneid == next->conn_zoneid) {		\
914 			(connp)->conn_next = next;			\
915 			if (prev != NULL)				\
916 				prev = next->conn_prev;			\
917 			next->conn_prev = (connp);			\
918 			break;						\
919 		}							\
920 		list = &next->conn_next;				\
921 		prev = next;						\
922 	}								\
923 	(connp)->conn_prev = prev;					\
924 	*list = (connp);						\
925 	(connp)->conn_fanout = (connfp);				\
926 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
927 	    IPCL_BOUND;							\
928 	CONN_INC_REF((connp));						\
929 	mutex_exit(&(connfp)->connf_lock);				\
930 }
931 
932 void
933 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
934 {
935 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
936 }
937 
938 /*
939  * Because the classifier is used to classify inbound packets, the destination
940  * address is meant to be our local tunnel address (tunnel source), and the
941  * source the remote tunnel address (tunnel destination).
942  *
943  * Note that conn_proto can't be used for fanout since the upper protocol
944  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
945  */
946 conn_t *
947 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
948 {
949 	connf_t	*connfp;
950 	conn_t	*connp;
951 
952 	/* first look for IPv4 tunnel links */
953 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
954 	mutex_enter(&connfp->connf_lock);
955 	for (connp = connfp->connf_head; connp != NULL;
956 	    connp = connp->conn_next) {
957 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
958 			break;
959 	}
960 	if (connp != NULL)
961 		goto done;
962 
963 	mutex_exit(&connfp->connf_lock);
964 
965 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
966 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
967 	    INADDR_ANY)];
968 	mutex_enter(&connfp->connf_lock);
969 	for (connp = connfp->connf_head; connp != NULL;
970 	    connp = connp->conn_next) {
971 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
972 			break;
973 	}
974 done:
975 	if (connp != NULL)
976 		CONN_INC_REF(connp);
977 	mutex_exit(&connfp->connf_lock);
978 	return (connp);
979 }
980 
981 conn_t *
982 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
983 {
984 	connf_t	*connfp;
985 	conn_t	*connp;
986 
987 	/* Look for an IPv6 tunnel link */
988 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
989 	mutex_enter(&connfp->connf_lock);
990 	for (connp = connfp->connf_head; connp != NULL;
991 	    connp = connp->conn_next) {
992 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
993 			CONN_INC_REF(connp);
994 			break;
995 		}
996 	}
997 	mutex_exit(&connfp->connf_lock);
998 	return (connp);
999 }
1000 
1001 /*
1002  * This function is used only for inserting SCTP raw socket now.
1003  * This may change later.
1004  *
1005  * Note that only one raw socket can be bound to a port.  The param
1006  * lport is in network byte order.
1007  */
1008 static int
1009 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1010 {
1011 	connf_t	*connfp;
1012 	conn_t	*oconnp;
1013 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1014 
1015 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1016 
1017 	/* Check for existing raw socket already bound to the port. */
1018 	mutex_enter(&connfp->connf_lock);
1019 	for (oconnp = connfp->connf_head; oconnp != NULL;
1020 	    oconnp = oconnp->conn_next) {
1021 		if (oconnp->conn_lport == lport &&
1022 		    oconnp->conn_zoneid == connp->conn_zoneid &&
1023 		    oconnp->conn_family == connp->conn_family &&
1024 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1025 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1026 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1027 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1028 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1029 		    &connp->conn_laddr_v6))) {
1030 			break;
1031 		}
1032 	}
1033 	mutex_exit(&connfp->connf_lock);
1034 	if (oconnp != NULL)
1035 		return (EADDRNOTAVAIL);
1036 
1037 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1038 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1039 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1040 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1041 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1042 		} else {
1043 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1044 		}
1045 	} else {
1046 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1047 	}
1048 	return (0);
1049 }
1050 
1051 static int
1052 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1053 {
1054 	connf_t	*connfp;
1055 	conn_t	*tconnp;
1056 	ipaddr_t laddr = connp->conn_laddr_v4;
1057 	ipaddr_t faddr = connp->conn_faddr_v4;
1058 
1059 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1060 	mutex_enter(&connfp->connf_lock);
1061 	for (tconnp = connfp->connf_head; tconnp != NULL;
1062 	    tconnp = tconnp->conn_next) {
1063 		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1064 			/* A tunnel is already bound to these addresses. */
1065 			mutex_exit(&connfp->connf_lock);
1066 			return (EADDRINUSE);
1067 		}
1068 	}
1069 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1070 	mutex_exit(&connfp->connf_lock);
1071 	return (0);
1072 }
1073 
1074 static int
1075 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1076 {
1077 	connf_t	*connfp;
1078 	conn_t	*tconnp;
1079 	in6_addr_t *laddr = &connp->conn_laddr_v6;
1080 	in6_addr_t *faddr = &connp->conn_faddr_v6;
1081 
1082 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1083 	mutex_enter(&connfp->connf_lock);
1084 	for (tconnp = connfp->connf_head; tconnp != NULL;
1085 	    tconnp = tconnp->conn_next) {
1086 		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1087 			/* A tunnel is already bound to these addresses. */
1088 			mutex_exit(&connfp->connf_lock);
1089 			return (EADDRINUSE);
1090 		}
1091 	}
1092 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1093 	mutex_exit(&connfp->connf_lock);
1094 	return (0);
1095 }
1096 
1097 /*
1098  * Check for a MAC exemption conflict on a labeled system.  Note that for
1099  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1100  * transport layer.  This check is for binding all other protocols.
1101  *
1102  * Returns true if there's a conflict.
1103  */
1104 static boolean_t
1105 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1106 {
1107 	connf_t	*connfp;
1108 	conn_t *tconn;
1109 
1110 	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1111 	mutex_enter(&connfp->connf_lock);
1112 	for (tconn = connfp->connf_head; tconn != NULL;
1113 	    tconn = tconn->conn_next) {
1114 		/* We don't allow v4 fallback for v6 raw socket */
1115 		if (connp->conn_family != tconn->conn_family)
1116 			continue;
1117 		/* If neither is exempt, then there's no conflict */
1118 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1119 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1120 			continue;
1121 		/* We are only concerned about sockets for a different zone */
1122 		if (connp->conn_zoneid == tconn->conn_zoneid)
1123 			continue;
1124 		/* If both are bound to different specific addrs, ok */
1125 		if (connp->conn_laddr_v4 != INADDR_ANY &&
1126 		    tconn->conn_laddr_v4 != INADDR_ANY &&
1127 		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1128 			continue;
1129 		/* These two conflict; fail */
1130 		break;
1131 	}
1132 	mutex_exit(&connfp->connf_lock);
1133 	return (tconn != NULL);
1134 }
1135 
1136 static boolean_t
1137 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1138 {
1139 	connf_t	*connfp;
1140 	conn_t *tconn;
1141 
1142 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1143 	mutex_enter(&connfp->connf_lock);
1144 	for (tconn = connfp->connf_head; tconn != NULL;
1145 	    tconn = tconn->conn_next) {
1146 		/* We don't allow v4 fallback for v6 raw socket */
1147 		if (connp->conn_family != tconn->conn_family)
1148 			continue;
1149 		/* If neither is exempt, then there's no conflict */
1150 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1151 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1152 			continue;
1153 		/* We are only concerned about sockets for a different zone */
1154 		if (connp->conn_zoneid == tconn->conn_zoneid)
1155 			continue;
1156 		/* If both are bound to different addrs, ok */
1157 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1158 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1159 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1160 		    &tconn->conn_laddr_v6))
1161 			continue;
1162 		/* These two conflict; fail */
1163 		break;
1164 	}
1165 	mutex_exit(&connfp->connf_lock);
1166 	return (tconn != NULL);
1167 }
1168 
1169 /*
1170  * (v4, v6) bind hash insertion routines
1171  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1172  */
1173 
1174 int
1175 ipcl_bind_insert(conn_t *connp)
1176 {
1177 	if (connp->conn_ipversion == IPV6_VERSION)
1178 		return (ipcl_bind_insert_v6(connp));
1179 	else
1180 		return (ipcl_bind_insert_v4(connp));
1181 }
1182 
1183 int
1184 ipcl_bind_insert_v4(conn_t *connp)
1185 {
1186 	connf_t	*connfp;
1187 	int	ret = 0;
1188 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1189 	uint16_t	lport = connp->conn_lport;
1190 	uint8_t		protocol = connp->conn_proto;
1191 
1192 	if (IPCL_IS_IPTUN(connp))
1193 		return (ipcl_iptun_hash_insert(connp, ipst));
1194 
1195 	switch (protocol) {
1196 	default:
1197 		if (is_system_labeled() &&
1198 		    check_exempt_conflict_v4(connp, ipst))
1199 			return (EADDRINUSE);
1200 		/* FALLTHROUGH */
1201 	case IPPROTO_UDP:
1202 		if (protocol == IPPROTO_UDP) {
1203 			connfp = &ipst->ips_ipcl_udp_fanout[
1204 			    IPCL_UDP_HASH(lport, ipst)];
1205 		} else {
1206 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1207 		}
1208 
1209 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1210 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1211 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1212 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1213 		} else {
1214 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1215 		}
1216 		if (protocol == IPPROTO_RSVP)
1217 			ill_set_inputfn_all(ipst);
1218 		break;
1219 
1220 	case IPPROTO_TCP:
1221 		/* Insert it in the Bind Hash */
1222 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1223 		connfp = &ipst->ips_ipcl_bind_fanout[
1224 		    IPCL_BIND_HASH(lport, ipst)];
1225 		if (connp->conn_laddr_v4 != INADDR_ANY) {
1226 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1227 		} else {
1228 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1229 		}
1230 		if (cl_inet_listen != NULL) {
1231 			ASSERT(connp->conn_ipversion == IPV4_VERSION);
1232 			connp->conn_flags |= IPCL_CL_LISTENER;
1233 			(*cl_inet_listen)(
1234 			    connp->conn_netstack->netstack_stackid,
1235 			    IPPROTO_TCP, AF_INET,
1236 			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1237 		}
1238 		break;
1239 
1240 	case IPPROTO_SCTP:
1241 		ret = ipcl_sctp_hash_insert(connp, lport);
1242 		break;
1243 	}
1244 
1245 	return (ret);
1246 }
1247 
1248 int
1249 ipcl_bind_insert_v6(conn_t *connp)
1250 {
1251 	connf_t		*connfp;
1252 	int		ret = 0;
1253 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1254 	uint16_t	lport = connp->conn_lport;
1255 	uint8_t		protocol = connp->conn_proto;
1256 
1257 	if (IPCL_IS_IPTUN(connp)) {
1258 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1259 	}
1260 
1261 	switch (protocol) {
1262 	default:
1263 		if (is_system_labeled() &&
1264 		    check_exempt_conflict_v6(connp, ipst))
1265 			return (EADDRINUSE);
1266 		/* FALLTHROUGH */
1267 	case IPPROTO_UDP:
1268 		if (protocol == IPPROTO_UDP) {
1269 			connfp = &ipst->ips_ipcl_udp_fanout[
1270 			    IPCL_UDP_HASH(lport, ipst)];
1271 		} else {
1272 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1273 		}
1274 
1275 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1276 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1277 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1278 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1279 		} else {
1280 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1281 		}
1282 		break;
1283 
1284 	case IPPROTO_TCP:
1285 		/* Insert it in the Bind Hash */
1286 		ASSERT(connp->conn_zoneid != ALL_ZONES);
1287 		connfp = &ipst->ips_ipcl_bind_fanout[
1288 		    IPCL_BIND_HASH(lport, ipst)];
1289 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1290 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1291 		} else {
1292 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1293 		}
1294 		if (cl_inet_listen != NULL) {
1295 			sa_family_t	addr_family;
1296 			uint8_t		*laddrp;
1297 
1298 			if (connp->conn_ipversion == IPV6_VERSION) {
1299 				addr_family = AF_INET6;
1300 				laddrp =
1301 				    (uint8_t *)&connp->conn_bound_addr_v6;
1302 			} else {
1303 				addr_family = AF_INET;
1304 				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1305 			}
1306 			connp->conn_flags |= IPCL_CL_LISTENER;
1307 			(*cl_inet_listen)(
1308 			    connp->conn_netstack->netstack_stackid,
1309 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1310 		}
1311 		break;
1312 
1313 	case IPPROTO_SCTP:
1314 		ret = ipcl_sctp_hash_insert(connp, lport);
1315 		break;
1316 	}
1317 
1318 	return (ret);
1319 }
1320 
1321 /*
1322  * ipcl_conn_hash insertion routines.
1323  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1324  */
1325 
1326 int
1327 ipcl_conn_insert(conn_t *connp)
1328 {
1329 	if (connp->conn_ipversion == IPV6_VERSION)
1330 		return (ipcl_conn_insert_v6(connp));
1331 	else
1332 		return (ipcl_conn_insert_v4(connp));
1333 }
1334 
1335 int
1336 ipcl_conn_insert_v4(conn_t *connp)
1337 {
1338 	connf_t		*connfp;
1339 	conn_t		*tconnp;
1340 	int		ret = 0;
1341 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1342 	uint16_t	lport = connp->conn_lport;
1343 	uint8_t		protocol = connp->conn_proto;
1344 
1345 	if (IPCL_IS_IPTUN(connp))
1346 		return (ipcl_iptun_hash_insert(connp, ipst));
1347 
1348 	switch (protocol) {
1349 	case IPPROTO_TCP:
1350 		/*
1351 		 * For TCP, we check whether the connection tuple already
1352 		 * exists before allowing the connection to proceed.  We
1353 		 * also allow indexing on the zoneid. This is to allow
1354 		 * multiple shared stack zones to have the same tcp
1355 		 * connection tuple. In practice this only happens for
1356 		 * INADDR_LOOPBACK as it's the only local address which
1357 		 * doesn't have to be unique.
1358 		 */
1359 		connfp = &ipst->ips_ipcl_conn_fanout[
1360 		    IPCL_CONN_HASH(connp->conn_faddr_v4,
1361 		    connp->conn_ports, ipst)];
1362 		mutex_enter(&connfp->connf_lock);
1363 		for (tconnp = connfp->connf_head; tconnp != NULL;
1364 		    tconnp = tconnp->conn_next) {
1365 			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1366 			    connp->conn_faddr_v4, connp->conn_laddr_v4,
1367 			    connp->conn_ports) &&
1368 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1369 				/* Already have a conn. bail out */
1370 				mutex_exit(&connfp->connf_lock);
1371 				return (EADDRINUSE);
1372 			}
1373 		}
1374 		if (connp->conn_fanout != NULL) {
1375 			/*
1376 			 * Probably a XTI/TLI application trying to do a
1377 			 * rebind. Let it happen.
1378 			 */
1379 			mutex_exit(&connfp->connf_lock);
1380 			IPCL_HASH_REMOVE(connp);
1381 			mutex_enter(&connfp->connf_lock);
1382 		}
1383 
1384 		ASSERT(connp->conn_recv != NULL);
1385 		ASSERT(connp->conn_recvicmp != NULL);
1386 
1387 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1388 		mutex_exit(&connfp->connf_lock);
1389 		break;
1390 
1391 	case IPPROTO_SCTP:
1392 		/*
1393 		 * The raw socket may have already been bound, remove it
1394 		 * from the hash first.
1395 		 */
1396 		IPCL_HASH_REMOVE(connp);
1397 		ret = ipcl_sctp_hash_insert(connp, lport);
1398 		break;
1399 
1400 	default:
1401 		/*
1402 		 * Check for conflicts among MAC exempt bindings.  For
1403 		 * transports with port numbers, this is done by the upper
1404 		 * level per-transport binding logic.  For all others, it's
1405 		 * done here.
1406 		 */
1407 		if (is_system_labeled() &&
1408 		    check_exempt_conflict_v4(connp, ipst))
1409 			return (EADDRINUSE);
1410 		/* FALLTHROUGH */
1411 
1412 	case IPPROTO_UDP:
1413 		if (protocol == IPPROTO_UDP) {
1414 			connfp = &ipst->ips_ipcl_udp_fanout[
1415 			    IPCL_UDP_HASH(lport, ipst)];
1416 		} else {
1417 			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1418 		}
1419 
1420 		if (connp->conn_faddr_v4 != INADDR_ANY) {
1421 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1422 		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1423 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1424 		} else {
1425 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1426 		}
1427 		break;
1428 	}
1429 
1430 	return (ret);
1431 }
1432 
1433 int
1434 ipcl_conn_insert_v6(conn_t *connp)
1435 {
1436 	connf_t		*connfp;
1437 	conn_t		*tconnp;
1438 	int		ret = 0;
1439 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1440 	uint16_t	lport = connp->conn_lport;
1441 	uint8_t		protocol = connp->conn_proto;
1442 	uint_t		ifindex = connp->conn_bound_if;
1443 
1444 	if (IPCL_IS_IPTUN(connp))
1445 		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1446 
1447 	switch (protocol) {
1448 	case IPPROTO_TCP:
1449 
1450 		/*
1451 		 * For tcp, we check whether the connection tuple already
1452 		 * exists before allowing the connection to proceed.  We
1453 		 * also allow indexing on the zoneid. This is to allow
1454 		 * multiple shared stack zones to have the same tcp
1455 		 * connection tuple. In practice this only happens for
1456 		 * ipv6_loopback as it's the only local address which
1457 		 * doesn't have to be unique.
1458 		 */
1459 		connfp = &ipst->ips_ipcl_conn_fanout[
1460 		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1461 		    ipst)];
1462 		mutex_enter(&connfp->connf_lock);
1463 		for (tconnp = connfp->connf_head; tconnp != NULL;
1464 		    tconnp = tconnp->conn_next) {
1465 			/* NOTE: need to match zoneid. Bug in onnv-gate */
1466 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1467 			    connp->conn_faddr_v6, connp->conn_laddr_v6,
1468 			    connp->conn_ports) &&
1469 			    (tconnp->conn_bound_if == 0 ||
1470 			    tconnp->conn_bound_if == ifindex) &&
1471 			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1472 				/* Already have a conn. bail out */
1473 				mutex_exit(&connfp->connf_lock);
1474 				return (EADDRINUSE);
1475 			}
1476 		}
1477 		if (connp->conn_fanout != NULL) {
1478 			/*
1479 			 * Probably a XTI/TLI application trying to do a
1480 			 * rebind. Let it happen.
1481 			 */
1482 			mutex_exit(&connfp->connf_lock);
1483 			IPCL_HASH_REMOVE(connp);
1484 			mutex_enter(&connfp->connf_lock);
1485 		}
1486 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1487 		mutex_exit(&connfp->connf_lock);
1488 		break;
1489 
1490 	case IPPROTO_SCTP:
1491 		IPCL_HASH_REMOVE(connp);
1492 		ret = ipcl_sctp_hash_insert(connp, lport);
1493 		break;
1494 
1495 	default:
1496 		if (is_system_labeled() &&
1497 		    check_exempt_conflict_v6(connp, ipst))
1498 			return (EADDRINUSE);
1499 		/* FALLTHROUGH */
1500 	case IPPROTO_UDP:
1501 		if (protocol == IPPROTO_UDP) {
1502 			connfp = &ipst->ips_ipcl_udp_fanout[
1503 			    IPCL_UDP_HASH(lport, ipst)];
1504 		} else {
1505 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1506 		}
1507 
1508 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1509 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1510 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1511 			IPCL_HASH_INSERT_BOUND(connfp, connp);
1512 		} else {
1513 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1514 		}
1515 		break;
1516 	}
1517 
1518 	return (ret);
1519 }
1520 
1521 /*
1522  * v4 packet classifying function. looks up the fanout table to
1523  * find the conn, the packet belongs to. returns the conn with
1524  * the reference held, null otherwise.
1525  *
1526  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1527  * Lookup" comment block are applied.  Labels are also checked as described
1528  * above.  If the packet is from the inside (looped back), and is from the same
1529  * zone, then label checks are omitted.
1530  */
1531 conn_t *
1532 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1533     ip_recv_attr_t *ira, ip_stack_t *ipst)
1534 {
1535 	ipha_t	*ipha;
1536 	connf_t	*connfp, *bind_connfp;
1537 	uint16_t lport;
1538 	uint16_t fport;
1539 	uint32_t ports;
1540 	conn_t	*connp;
1541 	uint16_t  *up;
1542 	zoneid_t	zoneid = ira->ira_zoneid;
1543 	int		ifindex = ira->ira_ruifindex;
1544 
1545 	ipha = (ipha_t *)mp->b_rptr;
1546 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1547 
1548 	switch (protocol) {
1549 	case IPPROTO_TCP:
1550 		ports = *(uint32_t *)up;
1551 		connfp =
1552 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1553 		    ports, ipst)];
1554 		mutex_enter(&connfp->connf_lock);
1555 		for (connp = connfp->connf_head; connp != NULL;
1556 		    connp = connp->conn_next) {
1557 			if (IPCL_CONN_MATCH(connp, protocol,
1558 			    ipha->ipha_src, ipha->ipha_dst, ports) &&
1559 			    (connp->conn_incoming_ifindex == 0 ||
1560 			    connp->conn_incoming_ifindex == ifindex) &&
1561 			    (connp->conn_zoneid == zoneid ||
1562 			    connp->conn_allzones ||
1563 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1564 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1565 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1566 				break;
1567 		}
1568 
1569 		if (connp != NULL) {
1570 			/*
1571 			 * We have a fully-bound TCP connection.
1572 			 *
1573 			 * For labeled systems, there's no need to check the
1574 			 * label here.  It's known to be good as we checked
1575 			 * before allowing the connection to become bound.
1576 			 */
1577 			CONN_INC_REF(connp);
1578 			mutex_exit(&connfp->connf_lock);
1579 			return (connp);
1580 		}
1581 
1582 		mutex_exit(&connfp->connf_lock);
1583 		lport = up[1];
1584 		bind_connfp =
1585 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1586 		mutex_enter(&bind_connfp->connf_lock);
1587 		for (connp = bind_connfp->connf_head; connp != NULL;
1588 		    connp = connp->conn_next) {
1589 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1590 			    lport) &&
1591 			    (connp->conn_incoming_ifindex == 0 ||
1592 			    connp->conn_incoming_ifindex == ifindex) &&
1593 			    (connp->conn_zoneid == zoneid ||
1594 			    connp->conn_allzones ||
1595 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1596 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1597 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1598 				break;
1599 		}
1600 
1601 		/*
1602 		 * If the matching connection is SLP on a private address, then
1603 		 * the label on the packet must match the local zone's label.
1604 		 * Otherwise, it must be in the label range defined by tnrh.
1605 		 * This is ensured by tsol_receive_local.
1606 		 *
1607 		 * Note that we don't check tsol_receive_local for
1608 		 * the connected case.
1609 		 */
1610 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1611 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1612 		    ira, connp)) {
1613 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1614 			    char *, "connp(1) could not receive mp(2)",
1615 			    conn_t *, connp, mblk_t *, mp);
1616 			connp = NULL;
1617 		}
1618 
1619 		if (connp != NULL) {
1620 			/* Have a listener at least */
1621 			CONN_INC_REF(connp);
1622 			mutex_exit(&bind_connfp->connf_lock);
1623 			return (connp);
1624 		}
1625 
1626 		mutex_exit(&bind_connfp->connf_lock);
1627 		break;
1628 
1629 	case IPPROTO_UDP:
1630 		lport = up[1];
1631 		fport = up[0];
1632 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1633 		mutex_enter(&connfp->connf_lock);
1634 		for (connp = connfp->connf_head; connp != NULL;
1635 		    connp = connp->conn_next) {
1636 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1637 			    fport, ipha->ipha_src) &&
1638 			    (connp->conn_incoming_ifindex == 0 ||
1639 			    connp->conn_incoming_ifindex == ifindex) &&
1640 			    (connp->conn_zoneid == zoneid ||
1641 			    connp->conn_allzones ||
1642 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1643 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1644 				break;
1645 		}
1646 
1647 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1648 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1649 		    ira, connp)) {
1650 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1651 			    char *, "connp(1) could not receive mp(2)",
1652 			    conn_t *, connp, mblk_t *, mp);
1653 			connp = NULL;
1654 		}
1655 
1656 		if (connp != NULL) {
1657 			CONN_INC_REF(connp);
1658 			mutex_exit(&connfp->connf_lock);
1659 			return (connp);
1660 		}
1661 
1662 		/*
1663 		 * We shouldn't come here for multicast/broadcast packets
1664 		 */
1665 		mutex_exit(&connfp->connf_lock);
1666 
1667 		break;
1668 
1669 	case IPPROTO_ENCAP:
1670 	case IPPROTO_IPV6:
1671 		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1672 		    &ipha->ipha_dst, ipst));
1673 	}
1674 
1675 	return (NULL);
1676 }
1677 
1678 conn_t *
1679 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1680     ip_recv_attr_t *ira, ip_stack_t *ipst)
1681 {
1682 	ip6_t		*ip6h;
1683 	connf_t		*connfp, *bind_connfp;
1684 	uint16_t	lport;
1685 	uint16_t	fport;
1686 	tcpha_t		*tcpha;
1687 	uint32_t	ports;
1688 	conn_t		*connp;
1689 	uint16_t	*up;
1690 	zoneid_t	zoneid = ira->ira_zoneid;
1691 	int		ifindex = ira->ira_ruifindex;
1692 
1693 	ip6h = (ip6_t *)mp->b_rptr;
1694 
1695 	switch (protocol) {
1696 	case IPPROTO_TCP:
1697 		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1698 		up = &tcpha->tha_lport;
1699 		ports = *(uint32_t *)up;
1700 
1701 		connfp =
1702 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1703 		    ports, ipst)];
1704 		mutex_enter(&connfp->connf_lock);
1705 		for (connp = connfp->connf_head; connp != NULL;
1706 		    connp = connp->conn_next) {
1707 			if (IPCL_CONN_MATCH_V6(connp, protocol,
1708 			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1709 			    (connp->conn_incoming_ifindex == 0 ||
1710 			    connp->conn_incoming_ifindex == ifindex) &&
1711 			    (connp->conn_zoneid == zoneid ||
1712 			    connp->conn_allzones ||
1713 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1714 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1715 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1716 				break;
1717 		}
1718 
1719 		if (connp != NULL) {
1720 			/*
1721 			 * We have a fully-bound TCP connection.
1722 			 *
1723 			 * For labeled systems, there's no need to check the
1724 			 * label here.  It's known to be good as we checked
1725 			 * before allowing the connection to become bound.
1726 			 */
1727 			CONN_INC_REF(connp);
1728 			mutex_exit(&connfp->connf_lock);
1729 			return (connp);
1730 		}
1731 
1732 		mutex_exit(&connfp->connf_lock);
1733 
1734 		lport = up[1];
1735 		bind_connfp =
1736 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1737 		mutex_enter(&bind_connfp->connf_lock);
1738 		for (connp = bind_connfp->connf_head; connp != NULL;
1739 		    connp = connp->conn_next) {
1740 			if (IPCL_BIND_MATCH_V6(connp, protocol,
1741 			    ip6h->ip6_dst, lport) &&
1742 			    (connp->conn_incoming_ifindex == 0 ||
1743 			    connp->conn_incoming_ifindex == ifindex) &&
1744 			    (connp->conn_zoneid == zoneid ||
1745 			    connp->conn_allzones ||
1746 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1747 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1748 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1749 				break;
1750 		}
1751 
1752 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1753 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1754 		    ira, connp)) {
1755 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1756 			    char *, "connp(1) could not receive mp(2)",
1757 			    conn_t *, connp, mblk_t *, mp);
1758 			connp = NULL;
1759 		}
1760 
1761 		if (connp != NULL) {
1762 			/* Have a listner at least */
1763 			CONN_INC_REF(connp);
1764 			mutex_exit(&bind_connfp->connf_lock);
1765 			return (connp);
1766 		}
1767 
1768 		mutex_exit(&bind_connfp->connf_lock);
1769 		break;
1770 
1771 	case IPPROTO_UDP:
1772 		up = (uint16_t *)&mp->b_rptr[hdr_len];
1773 		lport = up[1];
1774 		fport = up[0];
1775 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1776 		mutex_enter(&connfp->connf_lock);
1777 		for (connp = connfp->connf_head; connp != NULL;
1778 		    connp = connp->conn_next) {
1779 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1780 			    fport, ip6h->ip6_src) &&
1781 			    (connp->conn_incoming_ifindex == 0 ||
1782 			    connp->conn_incoming_ifindex == ifindex) &&
1783 			    (connp->conn_zoneid == zoneid ||
1784 			    connp->conn_allzones ||
1785 			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1786 			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1787 			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1788 				break;
1789 		}
1790 
1791 		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1792 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1793 		    ira, connp)) {
1794 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1795 			    char *, "connp(1) could not receive mp(2)",
1796 			    conn_t *, connp, mblk_t *, mp);
1797 			connp = NULL;
1798 		}
1799 
1800 		if (connp != NULL) {
1801 			CONN_INC_REF(connp);
1802 			mutex_exit(&connfp->connf_lock);
1803 			return (connp);
1804 		}
1805 
1806 		/*
1807 		 * We shouldn't come here for multicast/broadcast packets
1808 		 */
1809 		mutex_exit(&connfp->connf_lock);
1810 		break;
1811 	case IPPROTO_ENCAP:
1812 	case IPPROTO_IPV6:
1813 		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1814 		    &ip6h->ip6_dst, ipst));
1815 	}
1816 
1817 	return (NULL);
1818 }
1819 
1820 /*
1821  * wrapper around ipcl_classify_(v4,v6) routines.
1822  */
1823 conn_t *
1824 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1825 {
1826 	if (ira->ira_flags & IRAF_IS_IPV4) {
1827 		return (ipcl_classify_v4(mp, ira->ira_protocol,
1828 		    ira->ira_ip_hdr_length, ira, ipst));
1829 	} else {
1830 		return (ipcl_classify_v6(mp, ira->ira_protocol,
1831 		    ira->ira_ip_hdr_length, ira, ipst));
1832 	}
1833 }
1834 
1835 /*
1836  * Only used to classify SCTP RAW sockets
1837  */
1838 conn_t *
1839 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1840     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1841 {
1842 	connf_t		*connfp;
1843 	conn_t		*connp;
1844 	in_port_t	lport;
1845 	int		ipversion;
1846 	const void	*dst;
1847 	zoneid_t	zoneid = ira->ira_zoneid;
1848 
1849 	lport = ((uint16_t *)&ports)[1];
1850 	if (ira->ira_flags & IRAF_IS_IPV4) {
1851 		dst = (const void *)&ipha->ipha_dst;
1852 		ipversion = IPV4_VERSION;
1853 	} else {
1854 		dst = (const void *)&ip6h->ip6_dst;
1855 		ipversion = IPV6_VERSION;
1856 	}
1857 
1858 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1859 	mutex_enter(&connfp->connf_lock);
1860 	for (connp = connfp->connf_head; connp != NULL;
1861 	    connp = connp->conn_next) {
1862 		/* We don't allow v4 fallback for v6 raw socket. */
1863 		if (ipversion != connp->conn_ipversion)
1864 			continue;
1865 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1866 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1867 			if (ipversion == IPV4_VERSION) {
1868 				if (!IPCL_CONN_MATCH(connp, protocol,
1869 				    ipha->ipha_src, ipha->ipha_dst, ports))
1870 					continue;
1871 			} else {
1872 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1873 				    ip6h->ip6_src, ip6h->ip6_dst, ports))
1874 					continue;
1875 			}
1876 		} else {
1877 			if (ipversion == IPV4_VERSION) {
1878 				if (!IPCL_BIND_MATCH(connp, protocol,
1879 				    ipha->ipha_dst, lport))
1880 					continue;
1881 			} else {
1882 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1883 				    ip6h->ip6_dst, lport))
1884 					continue;
1885 			}
1886 		}
1887 
1888 		if (connp->conn_zoneid == zoneid ||
1889 		    connp->conn_allzones ||
1890 		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1891 		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1892 		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1893 			break;
1894 	}
1895 
1896 	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1897 	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1898 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1899 		    char *, "connp(1) could not receive mp(2)",
1900 		    conn_t *, connp, mblk_t *, mp);
1901 		connp = NULL;
1902 	}
1903 
1904 	if (connp != NULL)
1905 		goto found;
1906 	mutex_exit(&connfp->connf_lock);
1907 
1908 	/* Try to look for a wildcard SCTP RAW socket match. */
1909 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1910 	mutex_enter(&connfp->connf_lock);
1911 	for (connp = connfp->connf_head; connp != NULL;
1912 	    connp = connp->conn_next) {
1913 		/* We don't allow v4 fallback for v6 raw socket. */
1914 		if (ipversion != connp->conn_ipversion)
1915 			continue;
1916 		if (!IPCL_ZONE_MATCH(connp, zoneid))
1917 			continue;
1918 
1919 		if (ipversion == IPV4_VERSION) {
1920 			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1921 				break;
1922 		} else {
1923 			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1924 				break;
1925 			}
1926 		}
1927 	}
1928 
1929 	if (connp != NULL)
1930 		goto found;
1931 
1932 	mutex_exit(&connfp->connf_lock);
1933 	return (NULL);
1934 
1935 found:
1936 	ASSERT(connp != NULL);
1937 	CONN_INC_REF(connp);
1938 	mutex_exit(&connfp->connf_lock);
1939 	return (connp);
1940 }
1941 
1942 /* ARGSUSED */
1943 static int
1944 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1945 {
1946 	itc_t	*itc = (itc_t *)buf;
1947 	conn_t	*connp = &itc->itc_conn;
1948 	tcp_t	*tcp = (tcp_t *)&itc[1];
1949 
1950 	bzero(connp, sizeof (conn_t));
1951 	bzero(tcp, sizeof (tcp_t));
1952 
1953 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1954 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1955 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1956 	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1957 	if (tcp->tcp_timercache == NULL)
1958 		return (ENOMEM);
1959 	connp->conn_tcp = tcp;
1960 	connp->conn_flags = IPCL_TCPCONN;
1961 	connp->conn_proto = IPPROTO_TCP;
1962 	tcp->tcp_connp = connp;
1963 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1964 
1965 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1966 	if (connp->conn_ixa == NULL) {
1967 		tcp_timermp_free(tcp);
1968 		return (ENOMEM);
1969 	}
1970 	connp->conn_ixa->ixa_refcnt = 1;
1971 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
1972 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1973 	return (0);
1974 }
1975 
1976 /* ARGSUSED */
1977 static void
1978 tcp_conn_destructor(void *buf, void *cdrarg)
1979 {
1980 	itc_t	*itc = (itc_t *)buf;
1981 	conn_t	*connp = &itc->itc_conn;
1982 	tcp_t	*tcp = (tcp_t *)&itc[1];
1983 
1984 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
1985 	ASSERT(tcp->tcp_connp == connp);
1986 	ASSERT(connp->conn_tcp == tcp);
1987 	tcp_timermp_free(tcp);
1988 	mutex_destroy(&connp->conn_lock);
1989 	cv_destroy(&connp->conn_cv);
1990 	cv_destroy(&connp->conn_sq_cv);
1991 	rw_destroy(&connp->conn_ilg_lock);
1992 
1993 	/* Can be NULL if constructor failed */
1994 	if (connp->conn_ixa != NULL) {
1995 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1996 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
1997 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
1998 		ixa_refrele(connp->conn_ixa);
1999 	}
2000 }
2001 
2002 /* ARGSUSED */
2003 static int
2004 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2005 {
2006 	itc_t	*itc = (itc_t *)buf;
2007 	conn_t	*connp = &itc->itc_conn;
2008 
2009 	bzero(connp, sizeof (conn_t));
2010 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2011 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2012 	connp->conn_flags = IPCL_IPCCONN;
2013 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2014 
2015 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2016 	if (connp->conn_ixa == NULL)
2017 		return (ENOMEM);
2018 	connp->conn_ixa->ixa_refcnt = 1;
2019 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2020 	return (0);
2021 }
2022 
2023 /* ARGSUSED */
2024 static void
2025 ip_conn_destructor(void *buf, void *cdrarg)
2026 {
2027 	itc_t	*itc = (itc_t *)buf;
2028 	conn_t	*connp = &itc->itc_conn;
2029 
2030 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2031 	ASSERT(connp->conn_priv == NULL);
2032 	mutex_destroy(&connp->conn_lock);
2033 	cv_destroy(&connp->conn_cv);
2034 	rw_destroy(&connp->conn_ilg_lock);
2035 
2036 	/* Can be NULL if constructor failed */
2037 	if (connp->conn_ixa != NULL) {
2038 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2039 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2040 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2041 		ixa_refrele(connp->conn_ixa);
2042 	}
2043 }
2044 
2045 /* ARGSUSED */
2046 static int
2047 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2048 {
2049 	itc_t	*itc = (itc_t *)buf;
2050 	conn_t	*connp = &itc->itc_conn;
2051 	udp_t	*udp = (udp_t *)&itc[1];
2052 
2053 	bzero(connp, sizeof (conn_t));
2054 	bzero(udp, sizeof (udp_t));
2055 
2056 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2057 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2058 	connp->conn_udp = udp;
2059 	connp->conn_flags = IPCL_UDPCONN;
2060 	connp->conn_proto = IPPROTO_UDP;
2061 	udp->udp_connp = connp;
2062 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2063 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2064 	if (connp->conn_ixa == NULL)
2065 		return (ENOMEM);
2066 	connp->conn_ixa->ixa_refcnt = 1;
2067 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2068 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2069 	return (0);
2070 }
2071 
2072 /* ARGSUSED */
2073 static void
2074 udp_conn_destructor(void *buf, void *cdrarg)
2075 {
2076 	itc_t	*itc = (itc_t *)buf;
2077 	conn_t	*connp = &itc->itc_conn;
2078 	udp_t	*udp = (udp_t *)&itc[1];
2079 
2080 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2081 	ASSERT(udp->udp_connp == connp);
2082 	ASSERT(connp->conn_udp == udp);
2083 	mutex_destroy(&connp->conn_lock);
2084 	cv_destroy(&connp->conn_cv);
2085 	rw_destroy(&connp->conn_ilg_lock);
2086 
2087 	/* Can be NULL if constructor failed */
2088 	if (connp->conn_ixa != NULL) {
2089 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2090 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2091 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2092 		ixa_refrele(connp->conn_ixa);
2093 	}
2094 }
2095 
2096 /* ARGSUSED */
2097 static int
2098 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2099 {
2100 	itc_t	*itc = (itc_t *)buf;
2101 	conn_t	*connp = &itc->itc_conn;
2102 	icmp_t	*icmp = (icmp_t *)&itc[1];
2103 
2104 	bzero(connp, sizeof (conn_t));
2105 	bzero(icmp, sizeof (icmp_t));
2106 
2107 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2108 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2109 	connp->conn_icmp = icmp;
2110 	connp->conn_flags = IPCL_RAWIPCONN;
2111 	connp->conn_proto = IPPROTO_ICMP;
2112 	icmp->icmp_connp = connp;
2113 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2114 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2115 	if (connp->conn_ixa == NULL)
2116 		return (ENOMEM);
2117 	connp->conn_ixa->ixa_refcnt = 1;
2118 	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2119 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2120 	return (0);
2121 }
2122 
2123 /* ARGSUSED */
2124 static void
2125 rawip_conn_destructor(void *buf, void *cdrarg)
2126 {
2127 	itc_t	*itc = (itc_t *)buf;
2128 	conn_t	*connp = &itc->itc_conn;
2129 	icmp_t	*icmp = (icmp_t *)&itc[1];
2130 
2131 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2132 	ASSERT(icmp->icmp_connp == connp);
2133 	ASSERT(connp->conn_icmp == icmp);
2134 	mutex_destroy(&connp->conn_lock);
2135 	cv_destroy(&connp->conn_cv);
2136 	rw_destroy(&connp->conn_ilg_lock);
2137 
2138 	/* Can be NULL if constructor failed */
2139 	if (connp->conn_ixa != NULL) {
2140 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2141 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2142 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2143 		ixa_refrele(connp->conn_ixa);
2144 	}
2145 }
2146 
2147 /* ARGSUSED */
2148 static int
2149 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2150 {
2151 	itc_t	*itc = (itc_t *)buf;
2152 	conn_t	*connp = &itc->itc_conn;
2153 	rts_t	*rts = (rts_t *)&itc[1];
2154 
2155 	bzero(connp, sizeof (conn_t));
2156 	bzero(rts, sizeof (rts_t));
2157 
2158 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2159 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2160 	connp->conn_rts = rts;
2161 	connp->conn_flags = IPCL_RTSCONN;
2162 	rts->rts_connp = connp;
2163 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2164 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2165 	if (connp->conn_ixa == NULL)
2166 		return (ENOMEM);
2167 	connp->conn_ixa->ixa_refcnt = 1;
2168 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2169 	return (0);
2170 }
2171 
2172 /* ARGSUSED */
2173 static void
2174 rts_conn_destructor(void *buf, void *cdrarg)
2175 {
2176 	itc_t	*itc = (itc_t *)buf;
2177 	conn_t	*connp = &itc->itc_conn;
2178 	rts_t	*rts = (rts_t *)&itc[1];
2179 
2180 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2181 	ASSERT(rts->rts_connp == connp);
2182 	ASSERT(connp->conn_rts == rts);
2183 	mutex_destroy(&connp->conn_lock);
2184 	cv_destroy(&connp->conn_cv);
2185 	rw_destroy(&connp->conn_ilg_lock);
2186 
2187 	/* Can be NULL if constructor failed */
2188 	if (connp->conn_ixa != NULL) {
2189 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2190 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2191 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2192 		ixa_refrele(connp->conn_ixa);
2193 	}
2194 }
2195 
2196 /*
2197  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2198  * in the conn_t.
2199  *
2200  * Below we list all the pointers in the conn_t as a documentation aid.
2201  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2202  * If you add any pointers to the conn_t please add an ASSERT here
2203  * and #ifdef it out if it can't be actually asserted to be NULL.
2204  * In any case, we bzero most of the conn_t at the end of the function.
2205  */
2206 void
2207 ipcl_conn_cleanup(conn_t *connp)
2208 {
2209 	ip_xmit_attr_t	*ixa;
2210 
2211 	ASSERT(connp->conn_latch == NULL);
2212 	ASSERT(connp->conn_latch_in_policy == NULL);
2213 	ASSERT(connp->conn_latch_in_action == NULL);
2214 #ifdef notdef
2215 	ASSERT(connp->conn_rq == NULL);
2216 	ASSERT(connp->conn_wq == NULL);
2217 #endif
2218 	ASSERT(connp->conn_cred == NULL);
2219 	ASSERT(connp->conn_g_fanout == NULL);
2220 	ASSERT(connp->conn_g_next == NULL);
2221 	ASSERT(connp->conn_g_prev == NULL);
2222 	ASSERT(connp->conn_policy == NULL);
2223 	ASSERT(connp->conn_fanout == NULL);
2224 	ASSERT(connp->conn_next == NULL);
2225 	ASSERT(connp->conn_prev == NULL);
2226 	ASSERT(connp->conn_oper_pending_ill == NULL);
2227 	ASSERT(connp->conn_ilg == NULL);
2228 	ASSERT(connp->conn_drain_next == NULL);
2229 	ASSERT(connp->conn_drain_prev == NULL);
2230 #ifdef notdef
2231 	/* conn_idl is not cleared when removed from idl list */
2232 	ASSERT(connp->conn_idl == NULL);
2233 #endif
2234 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2235 #ifdef notdef
2236 	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2237 	ASSERT(connp->conn_netstack == NULL);
2238 #endif
2239 
2240 	ASSERT(connp->conn_helper_info == NULL);
2241 	ASSERT(connp->conn_ixa != NULL);
2242 	ixa = connp->conn_ixa;
2243 	ASSERT(ixa->ixa_refcnt == 1);
2244 	/* Need to preserve ixa_protocol */
2245 	ixa_cleanup(ixa);
2246 	ixa->ixa_flags = 0;
2247 
2248 	/* Clear out the conn_t fields that are not preserved */
2249 	bzero(&connp->conn_start_clr,
2250 	    sizeof (conn_t) -
2251 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2252 }
2253 
2254 /*
2255  * All conns are inserted in a global multi-list for the benefit of
2256  * walkers. The walk is guaranteed to walk all open conns at the time
2257  * of the start of the walk exactly once. This property is needed to
2258  * achieve some cleanups during unplumb of interfaces. This is achieved
2259  * as follows.
2260  *
2261  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2262  * call the insert and delete functions below at creation and deletion
2263  * time respectively. The conn never moves or changes its position in this
2264  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2265  * won't increase due to walkers, once the conn deletion has started. Note
2266  * that we can't remove the conn from the global list and then wait for
2267  * the refcnt to drop to zero, since walkers would then see a truncated
2268  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2269  * conns until ip_open is ready to make them globally visible.
2270  * The global round robin multi-list locks are held only to get the
2271  * next member/insertion/deletion and contention should be negligible
2272  * if the multi-list is much greater than the number of cpus.
2273  */
2274 void
2275 ipcl_globalhash_insert(conn_t *connp)
2276 {
2277 	int	index;
2278 	struct connf_s	*connfp;
2279 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2280 
2281 	/*
2282 	 * No need for atomic here. Approximate even distribution
2283 	 * in the global lists is sufficient.
2284 	 */
2285 	ipst->ips_conn_g_index++;
2286 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2287 
2288 	connp->conn_g_prev = NULL;
2289 	/*
2290 	 * Mark as INCIPIENT, so that walkers will ignore this
2291 	 * for now, till ip_open is ready to make it visible globally.
2292 	 */
2293 	connp->conn_state_flags |= CONN_INCIPIENT;
2294 
2295 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2296 	/* Insert at the head of the list */
2297 	mutex_enter(&connfp->connf_lock);
2298 	connp->conn_g_next = connfp->connf_head;
2299 	if (connp->conn_g_next != NULL)
2300 		connp->conn_g_next->conn_g_prev = connp;
2301 	connfp->connf_head = connp;
2302 
2303 	/* The fanout bucket this conn points to */
2304 	connp->conn_g_fanout = connfp;
2305 
2306 	mutex_exit(&connfp->connf_lock);
2307 }
2308 
2309 void
2310 ipcl_globalhash_remove(conn_t *connp)
2311 {
2312 	struct connf_s	*connfp;
2313 
2314 	/*
2315 	 * We were never inserted in the global multi list.
2316 	 * IPCL_NONE variety is never inserted in the global multilist
2317 	 * since it is presumed to not need any cleanup and is transient.
2318 	 */
2319 	if (connp->conn_g_fanout == NULL)
2320 		return;
2321 
2322 	connfp = connp->conn_g_fanout;
2323 	mutex_enter(&connfp->connf_lock);
2324 	if (connp->conn_g_prev != NULL)
2325 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2326 	else
2327 		connfp->connf_head = connp->conn_g_next;
2328 	if (connp->conn_g_next != NULL)
2329 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2330 	mutex_exit(&connfp->connf_lock);
2331 
2332 	/* Better to stumble on a null pointer than to corrupt memory */
2333 	connp->conn_g_next = NULL;
2334 	connp->conn_g_prev = NULL;
2335 	connp->conn_g_fanout = NULL;
2336 }
2337 
2338 /*
2339  * Walk the list of all conn_t's in the system, calling the function provided
2340  * With the specified argument for each.
2341  * Applies to both IPv4 and IPv6.
2342  *
2343  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2344  * conn_oper_pending_ill). To guard against stale pointers
2345  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2346  * unplumbed or removed. New conn_t's that are created while we are walking
2347  * may be missed by this walk, because they are not necessarily inserted
2348  * at the tail of the list. They are new conn_t's and thus don't have any
2349  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2350  * is created to the struct that is going away.
2351  */
2352 void
2353 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2354 {
2355 	int	i;
2356 	conn_t	*connp;
2357 	conn_t	*prev_connp;
2358 
2359 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2360 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2361 		prev_connp = NULL;
2362 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2363 		while (connp != NULL) {
2364 			mutex_enter(&connp->conn_lock);
2365 			if (connp->conn_state_flags &
2366 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2367 				mutex_exit(&connp->conn_lock);
2368 				connp = connp->conn_g_next;
2369 				continue;
2370 			}
2371 			CONN_INC_REF_LOCKED(connp);
2372 			mutex_exit(&connp->conn_lock);
2373 			mutex_exit(
2374 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2375 			(*func)(connp, arg);
2376 			if (prev_connp != NULL)
2377 				CONN_DEC_REF(prev_connp);
2378 			mutex_enter(
2379 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2380 			prev_connp = connp;
2381 			connp = connp->conn_g_next;
2382 		}
2383 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2384 		if (prev_connp != NULL)
2385 			CONN_DEC_REF(prev_connp);
2386 	}
2387 }
2388 
2389 /*
2390  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2391  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2392  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2393  * (peer tcp in ESTABLISHED state).
2394  */
2395 conn_t *
2396 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2397     ip_stack_t *ipst)
2398 {
2399 	uint32_t ports;
2400 	uint16_t *pports = (uint16_t *)&ports;
2401 	connf_t	*connfp;
2402 	conn_t	*tconnp;
2403 	boolean_t zone_chk;
2404 
2405 	/*
2406 	 * If either the source of destination address is loopback, then
2407 	 * both endpoints must be in the same Zone.  Otherwise, both of
2408 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2409 	 * state) and the endpoints may reside in different Zones.
2410 	 */
2411 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2412 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2413 
2414 	pports[0] = tcpha->tha_fport;
2415 	pports[1] = tcpha->tha_lport;
2416 
2417 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2418 	    ports, ipst)];
2419 
2420 	mutex_enter(&connfp->connf_lock);
2421 	for (tconnp = connfp->connf_head; tconnp != NULL;
2422 	    tconnp = tconnp->conn_next) {
2423 
2424 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2425 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2426 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2427 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2428 
2429 			ASSERT(tconnp != connp);
2430 			CONN_INC_REF(tconnp);
2431 			mutex_exit(&connfp->connf_lock);
2432 			return (tconnp);
2433 		}
2434 	}
2435 	mutex_exit(&connfp->connf_lock);
2436 	return (NULL);
2437 }
2438 
2439 /*
2440  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2441  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2442  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2443  * (peer tcp in ESTABLISHED state).
2444  */
2445 conn_t *
2446 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2447     ip_stack_t *ipst)
2448 {
2449 	uint32_t ports;
2450 	uint16_t *pports = (uint16_t *)&ports;
2451 	connf_t	*connfp;
2452 	conn_t	*tconnp;
2453 	boolean_t zone_chk;
2454 
2455 	/*
2456 	 * If either the source of destination address is loopback, then
2457 	 * both endpoints must be in the same Zone.  Otherwise, both of
2458 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2459 	 * state) and the endpoints may reside in different Zones.  We
2460 	 * don't do Zone check for link local address(es) because the
2461 	 * current Zone implementation treats each link local address as
2462 	 * being unique per system node, i.e. they belong to global Zone.
2463 	 */
2464 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2465 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2466 
2467 	pports[0] = tcpha->tha_fport;
2468 	pports[1] = tcpha->tha_lport;
2469 
2470 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2471 	    ports, ipst)];
2472 
2473 	mutex_enter(&connfp->connf_lock);
2474 	for (tconnp = connfp->connf_head; tconnp != NULL;
2475 	    tconnp = tconnp->conn_next) {
2476 
2477 		/* We skip conn_bound_if check here as this is loopback tcp */
2478 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2479 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2480 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2481 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2482 
2483 			ASSERT(tconnp != connp);
2484 			CONN_INC_REF(tconnp);
2485 			mutex_exit(&connfp->connf_lock);
2486 			return (tconnp);
2487 		}
2488 	}
2489 	mutex_exit(&connfp->connf_lock);
2490 	return (NULL);
2491 }
2492 
2493 /*
2494  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2495  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2496  * Only checks for connected entries i.e. no INADDR_ANY checks.
2497  */
2498 conn_t *
2499 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2500     ip_stack_t *ipst)
2501 {
2502 	uint32_t ports;
2503 	uint16_t *pports;
2504 	connf_t	*connfp;
2505 	conn_t	*tconnp;
2506 
2507 	pports = (uint16_t *)&ports;
2508 	pports[0] = tcpha->tha_fport;
2509 	pports[1] = tcpha->tha_lport;
2510 
2511 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2512 	    ports, ipst)];
2513 
2514 	mutex_enter(&connfp->connf_lock);
2515 	for (tconnp = connfp->connf_head; tconnp != NULL;
2516 	    tconnp = tconnp->conn_next) {
2517 
2518 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2519 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2520 		    tconnp->conn_tcp->tcp_state >= min_state) {
2521 
2522 			CONN_INC_REF(tconnp);
2523 			mutex_exit(&connfp->connf_lock);
2524 			return (tconnp);
2525 		}
2526 	}
2527 	mutex_exit(&connfp->connf_lock);
2528 	return (NULL);
2529 }
2530 
2531 /*
2532  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2533  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2534  * Only checks for connected entries i.e. no INADDR_ANY checks.
2535  * Match on ifindex in addition to addresses.
2536  */
2537 conn_t *
2538 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2539     uint_t ifindex, ip_stack_t *ipst)
2540 {
2541 	tcp_t	*tcp;
2542 	uint32_t ports;
2543 	uint16_t *pports;
2544 	connf_t	*connfp;
2545 	conn_t	*tconnp;
2546 
2547 	pports = (uint16_t *)&ports;
2548 	pports[0] = tcpha->tha_fport;
2549 	pports[1] = tcpha->tha_lport;
2550 
2551 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2552 	    ports, ipst)];
2553 
2554 	mutex_enter(&connfp->connf_lock);
2555 	for (tconnp = connfp->connf_head; tconnp != NULL;
2556 	    tconnp = tconnp->conn_next) {
2557 
2558 		tcp = tconnp->conn_tcp;
2559 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2560 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2561 		    tcp->tcp_state >= min_state &&
2562 		    (tconnp->conn_bound_if == 0 ||
2563 		    tconnp->conn_bound_if == ifindex)) {
2564 
2565 			CONN_INC_REF(tconnp);
2566 			mutex_exit(&connfp->connf_lock);
2567 			return (tconnp);
2568 		}
2569 	}
2570 	mutex_exit(&connfp->connf_lock);
2571 	return (NULL);
2572 }
2573 
2574 /*
2575  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2576  * a listener when changing state.
2577  */
2578 conn_t *
2579 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2580     ip_stack_t *ipst)
2581 {
2582 	connf_t		*bind_connfp;
2583 	conn_t		*connp;
2584 	tcp_t		*tcp;
2585 
2586 	/*
2587 	 * Avoid false matches for packets sent to an IP destination of
2588 	 * all zeros.
2589 	 */
2590 	if (laddr == 0)
2591 		return (NULL);
2592 
2593 	ASSERT(zoneid != ALL_ZONES);
2594 
2595 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2596 	mutex_enter(&bind_connfp->connf_lock);
2597 	for (connp = bind_connfp->connf_head; connp != NULL;
2598 	    connp = connp->conn_next) {
2599 		tcp = connp->conn_tcp;
2600 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2601 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2602 		    (tcp->tcp_listener == NULL)) {
2603 			CONN_INC_REF(connp);
2604 			mutex_exit(&bind_connfp->connf_lock);
2605 			return (connp);
2606 		}
2607 	}
2608 	mutex_exit(&bind_connfp->connf_lock);
2609 	return (NULL);
2610 }
2611 
2612 /*
2613  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2614  * a listener when changing state.
2615  */
2616 conn_t *
2617 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2618     zoneid_t zoneid, ip_stack_t *ipst)
2619 {
2620 	connf_t		*bind_connfp;
2621 	conn_t		*connp = NULL;
2622 	tcp_t		*tcp;
2623 
2624 	/*
2625 	 * Avoid false matches for packets sent to an IP destination of
2626 	 * all zeros.
2627 	 */
2628 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2629 		return (NULL);
2630 
2631 	ASSERT(zoneid != ALL_ZONES);
2632 
2633 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2634 	mutex_enter(&bind_connfp->connf_lock);
2635 	for (connp = bind_connfp->connf_head; connp != NULL;
2636 	    connp = connp->conn_next) {
2637 		tcp = connp->conn_tcp;
2638 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2639 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2640 		    (connp->conn_bound_if == 0 ||
2641 		    connp->conn_bound_if == ifindex) &&
2642 		    tcp->tcp_listener == NULL) {
2643 			CONN_INC_REF(connp);
2644 			mutex_exit(&bind_connfp->connf_lock);
2645 			return (connp);
2646 		}
2647 	}
2648 	mutex_exit(&bind_connfp->connf_lock);
2649 	return (NULL);
2650 }
2651 
2652 /*
2653  * ipcl_get_next_conn
2654  *	get the next entry in the conn global list
2655  *	and put a reference on the next_conn.
2656  *	decrement the reference on the current conn.
2657  *
2658  * This is an iterator based walker function that also provides for
2659  * some selection by the caller. It walks through the conn_hash bucket
2660  * searching for the next valid connp in the list, and selects connections
2661  * that are neither closed nor condemned. It also REFHOLDS the conn
2662  * thus ensuring that the conn exists when the caller uses the conn.
2663  */
2664 conn_t *
2665 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2666 {
2667 	conn_t	*next_connp;
2668 
2669 	if (connfp == NULL)
2670 		return (NULL);
2671 
2672 	mutex_enter(&connfp->connf_lock);
2673 
2674 	next_connp = (connp == NULL) ?
2675 	    connfp->connf_head : connp->conn_g_next;
2676 
2677 	while (next_connp != NULL) {
2678 		mutex_enter(&next_connp->conn_lock);
2679 		if (!(next_connp->conn_flags & conn_flags) ||
2680 		    (next_connp->conn_state_flags &
2681 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2682 			/*
2683 			 * This conn has been condemned or
2684 			 * is closing, or the flags don't match
2685 			 */
2686 			mutex_exit(&next_connp->conn_lock);
2687 			next_connp = next_connp->conn_g_next;
2688 			continue;
2689 		}
2690 		CONN_INC_REF_LOCKED(next_connp);
2691 		mutex_exit(&next_connp->conn_lock);
2692 		break;
2693 	}
2694 
2695 	mutex_exit(&connfp->connf_lock);
2696 
2697 	if (connp != NULL)
2698 		CONN_DEC_REF(connp);
2699 
2700 	return (next_connp);
2701 }
2702 
2703 #ifdef CONN_DEBUG
2704 /*
2705  * Trace of the last NBUF refhold/refrele
2706  */
2707 int
2708 conn_trace_ref(conn_t *connp)
2709 {
2710 	int	last;
2711 	conn_trace_t	*ctb;
2712 
2713 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 	last = connp->conn_trace_last;
2715 	last++;
2716 	if (last == CONN_TRACE_MAX)
2717 		last = 0;
2718 
2719 	ctb = &connp->conn_trace_buf[last];
2720 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 	connp->conn_trace_last = last;
2722 	return (1);
2723 }
2724 
2725 int
2726 conn_untrace_ref(conn_t *connp)
2727 {
2728 	int	last;
2729 	conn_trace_t	*ctb;
2730 
2731 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2732 	last = connp->conn_trace_last;
2733 	last++;
2734 	if (last == CONN_TRACE_MAX)
2735 		last = 0;
2736 
2737 	ctb = &connp->conn_trace_buf[last];
2738 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2739 	connp->conn_trace_last = last;
2740 	return (1);
2741 }
2742 #endif
2743 
2744 mib2_socketInfoEntry_t *
2745 conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
2746 {
2747 	vnode_t *vn = NULL;
2748 	vattr_t attr;
2749 	uint64_t flags = 0;
2750 	sock_upcalls_t *upcalls;
2751 	sock_upper_handle_t upper_handle;
2752 
2753 	/*
2754 	 * If the connection is closing, it is not safe to make an upcall or
2755 	 * access the stream associated with the connection.
2756 	 * The callers of this function have a reference on connp itself
2757 	 * so, as long as it is not closing, it's safe to continue.
2758 	 */
2759 	mutex_enter(&connp->conn_lock);
2760 
2761 	if ((connp->conn_state_flags & CONN_CLOSING)) {
2762 		mutex_exit(&connp->conn_lock);
2763 		return (NULL);
2764 	}
2765 
2766 	/*
2767 	 * Continue to hold conn_lock because we don't want to race with an
2768 	 * in-progress close, which will have set-to-NULL (and destroyed
2769 	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
2770 	 *
2771 	 * There is still a race with an in-progress OPEN, however, where
2772 	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
2773 	 * codepaths) WITHOUT conn_lock being held.  We address that race
2774 	 * HERE, however, given that both are going from NULL to non-NULL,
2775 	 * if we lose the race, we don't get any data for the in-progress-OPEN
2776 	 * socket.
2777 	 */
2778 
2779 	upcalls = connp->conn_upcalls;
2780 	upper_handle = connp->conn_upper_handle;
2781 	/* Check BOTH for non-NULL before attempting an upcall. */
2782 	if (upper_handle != NULL && upcalls != NULL) {
2783 		/* su_get_vnode() returns one with VN_HOLD() already done. */
2784 		vn = upcalls->su_get_vnode(upper_handle);
2785 	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
2786 		vn = STREAM(connp->conn_rq)->sd_pvnode;
2787 		if (vn != NULL)
2788 			VN_HOLD(vn);
2789 		flags |= MIB2_SOCKINFO_STREAM;
2790 	}
2791 
2792 	mutex_exit(&connp->conn_lock);
2793 
2794 	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
2795 		if (vn != NULL)
2796 			VN_RELE(vn);
2797 		return (NULL);
2798 	}
2799 
2800 	VN_RELE(vn);
2801 
2802 	bzero(sie, sizeof (*sie));
2803 
2804 	sie->sie_flags = flags;
2805 	sie->sie_inode = attr.va_nodeid;
2806 	sie->sie_dev = attr.va_rdev;
2807 
2808 	return (sie);
2809 }
2810