xref: /titanic_41/usr/src/uts/common/inet/ip/ipclassifier.c (revision a5628610b3cb18335f49944f353e3be7b9e669f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * IP PACKET CLASSIFIER
27  *
28  * The IP packet classifier provides mapping between IP packets and persistent
29  * connection state for connection-oriented protocols. It also provides
30  * interface for managing connection states.
31  *
32  * The connection state is kept in conn_t data structure and contains, among
33  * other things:
34  *
35  *	o local/remote address and ports
36  *	o Transport protocol
37  *	o squeue for the connection (for TCP only)
38  *	o reference counter
39  *	o Connection state
40  *	o hash table linkage
41  *	o interface/ire information
42  *	o credentials
43  *	o ipsec policy
44  *	o send and receive functions.
45  *	o mutex lock.
46  *
47  * Connections use a reference counting scheme. They are freed when the
48  * reference counter drops to zero. A reference is incremented when connection
49  * is placed in a list or table, when incoming packet for the connection arrives
50  * and when connection is processed via squeue (squeue processing may be
51  * asynchronous and the reference protects the connection from being destroyed
52  * before its processing is finished).
53  *
54  * conn_recv is used to pass up packets to the ULP.
55  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56  * a listener, and changes to tcp_input_listener as the listener has picked a
57  * good squeue. For other cases it is set to tcp_input_data.
58  *
59  * conn_recvicmp is used to pass up ICMP errors to the ULP.
60  *
61  * Classifier uses several hash tables:
62  *
63  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
64  *	ipcl_bind_fanout:	contains all connections in BOUND state
65  *	ipcl_proto_fanout:	IPv4 protocol fanout
66  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
67  *	ipcl_udp_fanout:	contains all UDP connections
68  *	ipcl_iptun_fanout:	contains all IP tunnel connections
69  *	ipcl_globalhash_fanout:	contains all connections
70  *
71  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72  * which need to view all existing connections.
73  *
74  * All tables are protected by per-bucket locks. When both per-bucket lock and
75  * connection lock need to be held, the per-bucket lock should be acquired
76  * first, followed by the connection lock.
77  *
78  * All functions doing search in one of these tables increment a reference
79  * counter on the connection found (if any). This reference should be dropped
80  * when the caller has finished processing the connection.
81  *
82  *
83  * INTERFACES:
84  * ===========
85  *
86  * Connection Lookup:
87  * ------------------
88  *
89  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
91  *
92  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93  * it can't find any associated connection. If the connection is found, its
94  * reference counter is incremented.
95  *
96  *	mp:	mblock, containing packet header. The full header should fit
97  *		into a single mblock. It should also contain at least full IP
98  *		and TCP or UDP header.
99  *
100  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
101  *
102  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
103  *		 the packet.
104  *
105  * 	ira->ira_zoneid: The zone in which the returned connection must be; the
106  *		zoneid corresponding to the ire_zoneid on the IRE located for
107  *		the packet's destination address.
108  *
109  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110  *		IRAF_TX_SHARED_ADDR flags
111  *
112  *	For TCP connections, the lookup order is as follows:
113  *		5-tuple {src, dst, protocol, local port, remote port}
114  *			lookup in ipcl_conn_fanout table.
115  *		3-tuple {dst, remote port, protocol} lookup in
116  *			ipcl_bind_fanout table.
117  *
118  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
119  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
120  *	these interfaces do not handle cases where a packet belongs
121  *	to multiple UDP clients, which is handled in IP itself.
122  *
123  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
124  * determine which actual zone gets the segment.  This is used only in a
125  * labeled environment.  The matching rules are:
126  *
127  *	- If it's not a multilevel port, then the label on the packet selects
128  *	  the zone.  Unlabeled packets are delivered to the global zone.
129  *
130  *	- If it's a multilevel port, then only the zone registered to receive
131  *	  packets on that port matches.
132  *
133  * Also, in a labeled environment, packet labels need to be checked.  For fully
134  * bound TCP connections, we can assume that the packet label was checked
135  * during connection establishment, and doesn't need to be checked on each
136  * packet.  For others, though, we need to check for strict equality or, for
137  * multilevel ports, membership in the range or set.  This part currently does
138  * a tnrh lookup on each packet, but could be optimized to use cached results
139  * if that were necessary.  (SCTP doesn't come through here, but if it did,
140  * we would apply the same rules as TCP.)
141  *
142  * An implication of the above is that fully-bound TCP sockets must always use
143  * distinct 4-tuples; they can't be discriminated by label alone.
144  *
145  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
146  * as there's no connection set-up handshake and no shared state.
147  *
148  * Labels on looped-back packets within a single zone do not need to be
149  * checked, as all processes in the same zone have the same label.
150  *
151  * Finally, for unlabeled packets received by a labeled system, special rules
152  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
153  * socket in the zone whose label matches the default label of the sender, if
154  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
155  * receiver's label must dominate the sender's default label.
156  *
157  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
158  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
159  *					 ip_stack);
160  *
161  *	Lookup routine to find an exact match for {src, dst, local port,
162  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
163  *	ports are read from the IP and TCP header respectively.
164  *
165  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
166  *					 zoneid, ip_stack);
167  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
168  *					 zoneid, ip_stack);
169  *
170  * 	Lookup routine to find a listener with the tuple {lport, laddr,
171  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
172  * 	parameter interface index is also compared.
173  *
174  * void ipcl_walk(func, arg, ip_stack)
175  *
176  * 	Apply 'func' to every connection available. The 'func' is called as
177  *	(*func)(connp, arg). The walk is non-atomic so connections may be
178  *	created and destroyed during the walk. The CONN_CONDEMNED and
179  *	CONN_INCIPIENT flags ensure that connections which are newly created
180  *	or being destroyed are not selected by the walker.
181  *
182  * Table Updates
183  * -------------
184  *
185  * int ipcl_conn_insert(connp);
186  * int ipcl_conn_insert_v4(connp);
187  * int ipcl_conn_insert_v6(connp);
188  *
189  *	Insert 'connp' in the ipcl_conn_fanout.
190  *	Arguments :
191  *		connp		conn_t to be inserted
192  *
193  *	Return value :
194  *		0		if connp was inserted
195  *		EADDRINUSE	if the connection with the same tuple
196  *				already exists.
197  *
198  * int ipcl_bind_insert(connp);
199  * int ipcl_bind_insert_v4(connp);
200  * int ipcl_bind_insert_v6(connp);
201  *
202  * 	Insert 'connp' in ipcl_bind_fanout.
203  * 	Arguments :
204  * 		connp		conn_t to be inserted
205  *
206  *
207  * void ipcl_hash_remove(connp);
208  *
209  * 	Removes the 'connp' from the connection fanout table.
210  *
211  * Connection Creation/Destruction
212  * -------------------------------
213  *
214  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
215  *
216  * 	Creates a new conn based on the type flag, inserts it into
217  * 	globalhash table.
218  *
219  *	type:	This flag determines the type of conn_t which needs to be
220  *		created i.e., which kmem_cache it comes from.
221  *		IPCL_TCPCONN	indicates a TCP connection
222  *		IPCL_SCTPCONN	indicates a SCTP connection
223  *		IPCL_UDPCONN	indicates a UDP conn_t.
224  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
225  *		IPCL_RTSCONN	indicates a RTS conn_t.
226  *		IPCL_IPCCONN	indicates all other connections.
227  *
228  * void ipcl_conn_destroy(connp)
229  *
230  * 	Destroys the connection state, removes it from the global
231  * 	connection hash table and frees its memory.
232  */
233 
234 #include <sys/types.h>
235 #include <sys/stream.h>
236 #include <sys/stropts.h>
237 #include <sys/sysmacros.h>
238 #include <sys/strsubr.h>
239 #include <sys/strsun.h>
240 #define	_SUN_TPI_VERSION 2
241 #include <sys/ddi.h>
242 #include <sys/cmn_err.h>
243 #include <sys/debug.h>
244 
245 #include <sys/systm.h>
246 #include <sys/param.h>
247 #include <sys/kmem.h>
248 #include <sys/isa_defs.h>
249 #include <inet/common.h>
250 #include <netinet/ip6.h>
251 #include <netinet/icmp6.h>
252 
253 #include <inet/ip.h>
254 #include <inet/ip_if.h>
255 #include <inet/ip_ire.h>
256 #include <inet/ip6.h>
257 #include <inet/ip_ndp.h>
258 #include <inet/ip_impl.h>
259 #include <inet/udp_impl.h>
260 #include <inet/sctp_ip.h>
261 #include <inet/sctp/sctp_impl.h>
262 #include <inet/rawip_impl.h>
263 #include <inet/rts_impl.h>
264 #include <inet/iptun/iptun_impl.h>
265 
266 #include <sys/cpuvar.h>
267 
268 #include <inet/ipclassifier.h>
269 #include <inet/tcp.h>
270 #include <inet/ipsec_impl.h>
271 
272 #include <sys/tsol/tnet.h>
273 #include <sys/sockio.h>
274 
275 /* Old value for compatibility. Settable in /etc/system */
276 uint_t tcp_conn_hash_size = 0;
277 
278 /* New value. Zero means choose automatically.  Settable in /etc/system */
279 uint_t ipcl_conn_hash_size = 0;
280 uint_t ipcl_conn_hash_memfactor = 8192;
281 uint_t ipcl_conn_hash_maxsize = 82500;
282 
283 /* bind/udp fanout table size */
284 uint_t ipcl_bind_fanout_size = 512;
285 uint_t ipcl_udp_fanout_size = 16384;
286 
287 /* Raw socket fanout size.  Must be a power of 2. */
288 uint_t ipcl_raw_fanout_size = 256;
289 
290 /*
291  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
292  * expect that most large deployments would have hundreds of tunnels, and
293  * thousands in the extreme case.
294  */
295 uint_t ipcl_iptun_fanout_size = 6143;
296 
297 /*
298  * Power of 2^N Primes useful for hashing for N of 0-28,
299  * these primes are the nearest prime <= 2^N - 2^(N-2).
300  */
301 
302 #define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
303 		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
304 		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
305 		50331599, 100663291, 201326557, 0}
306 
307 /*
308  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
309  * are aligned on cache lines.
310  */
311 typedef union itc_s {
312 	conn_t	itc_conn;
313 	char	itcu_filler[CACHE_ALIGN(conn_s)];
314 } itc_t;
315 
316 struct kmem_cache  *tcp_conn_cache;
317 struct kmem_cache  *ip_conn_cache;
318 extern struct kmem_cache  *sctp_conn_cache;
319 struct kmem_cache  *udp_conn_cache;
320 struct kmem_cache  *rawip_conn_cache;
321 struct kmem_cache  *rts_conn_cache;
322 
323 extern void	tcp_timermp_free(tcp_t *);
324 extern mblk_t	*tcp_timermp_alloc(int);
325 
326 static int	ip_conn_constructor(void *, void *, int);
327 static void	ip_conn_destructor(void *, void *);
328 
329 static int	tcp_conn_constructor(void *, void *, int);
330 static void	tcp_conn_destructor(void *, void *);
331 
332 static int	udp_conn_constructor(void *, void *, int);
333 static void	udp_conn_destructor(void *, void *);
334 
335 static int	rawip_conn_constructor(void *, void *, int);
336 static void	rawip_conn_destructor(void *, void *);
337 
338 static int	rts_conn_constructor(void *, void *, int);
339 static void	rts_conn_destructor(void *, void *);
340 
341 /*
342  * Global (for all stack instances) init routine
343  */
344 void
ipcl_g_init(void)345 ipcl_g_init(void)
346 {
347 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
348 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
349 	    ip_conn_constructor, ip_conn_destructor,
350 	    NULL, NULL, NULL, 0);
351 
352 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
353 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
354 	    tcp_conn_constructor, tcp_conn_destructor,
355 	    tcp_conn_reclaim, NULL, NULL, 0);
356 
357 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
358 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
359 	    udp_conn_constructor, udp_conn_destructor,
360 	    NULL, NULL, NULL, 0);
361 
362 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
363 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
364 	    rawip_conn_constructor, rawip_conn_destructor,
365 	    NULL, NULL, NULL, 0);
366 
367 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
368 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
369 	    rts_conn_constructor, rts_conn_destructor,
370 	    NULL, NULL, NULL, 0);
371 }
372 
373 /*
374  * ipclassifier intialization routine, sets up hash tables.
375  */
376 void
ipcl_init(ip_stack_t * ipst)377 ipcl_init(ip_stack_t *ipst)
378 {
379 	int i;
380 	int sizes[] = P2Ps();
381 
382 	/*
383 	 * Calculate size of conn fanout table from /etc/system settings
384 	 */
385 	if (ipcl_conn_hash_size != 0) {
386 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
387 	} else if (tcp_conn_hash_size != 0) {
388 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
389 	} else {
390 		extern pgcnt_t freemem;
391 
392 		ipst->ips_ipcl_conn_fanout_size =
393 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
394 
395 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
396 			ipst->ips_ipcl_conn_fanout_size =
397 			    ipcl_conn_hash_maxsize;
398 		}
399 	}
400 
401 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
402 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
403 			break;
404 		}
405 	}
406 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
407 		/* Out of range, use the 2^16 value */
408 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
409 	}
410 
411 	/* Take values from /etc/system */
412 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
413 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
414 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
415 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
416 
417 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
418 
419 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
420 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
421 
422 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
423 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
424 		    MUTEX_DEFAULT, NULL);
425 	}
426 
427 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
428 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
429 
430 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
431 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
432 		    MUTEX_DEFAULT, NULL);
433 	}
434 
435 	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
436 	    sizeof (connf_t), KM_SLEEP);
437 	for (i = 0; i < IPPROTO_MAX; i++) {
438 		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
439 		    MUTEX_DEFAULT, NULL);
440 	}
441 
442 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
443 	    sizeof (connf_t), KM_SLEEP);
444 	for (i = 0; i < IPPROTO_MAX; i++) {
445 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
446 		    MUTEX_DEFAULT, NULL);
447 	}
448 
449 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
450 	mutex_init(&ipst->ips_rts_clients->connf_lock,
451 	    NULL, MUTEX_DEFAULT, NULL);
452 
453 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
454 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
455 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
456 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
457 		    MUTEX_DEFAULT, NULL);
458 	}
459 
460 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
461 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
462 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
463 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
464 		    MUTEX_DEFAULT, NULL);
465 	}
466 
467 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
468 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
469 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
470 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
471 		    MUTEX_DEFAULT, NULL);
472 	}
473 
474 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
475 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
476 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
477 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
478 		    NULL, MUTEX_DEFAULT, NULL);
479 	}
480 }
481 
482 void
ipcl_g_destroy(void)483 ipcl_g_destroy(void)
484 {
485 	kmem_cache_destroy(ip_conn_cache);
486 	kmem_cache_destroy(tcp_conn_cache);
487 	kmem_cache_destroy(udp_conn_cache);
488 	kmem_cache_destroy(rawip_conn_cache);
489 	kmem_cache_destroy(rts_conn_cache);
490 }
491 
492 /*
493  * All user-level and kernel use of the stack must be gone
494  * by now.
495  */
496 void
ipcl_destroy(ip_stack_t * ipst)497 ipcl_destroy(ip_stack_t *ipst)
498 {
499 	int i;
500 
501 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
502 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
503 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
504 	}
505 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
506 	    sizeof (connf_t));
507 	ipst->ips_ipcl_conn_fanout = NULL;
508 
509 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
510 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
511 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
512 	}
513 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
514 	    sizeof (connf_t));
515 	ipst->ips_ipcl_bind_fanout = NULL;
516 
517 	for (i = 0; i < IPPROTO_MAX; i++) {
518 		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
519 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
520 	}
521 	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
522 	    IPPROTO_MAX * sizeof (connf_t));
523 	ipst->ips_ipcl_proto_fanout_v4 = NULL;
524 
525 	for (i = 0; i < IPPROTO_MAX; i++) {
526 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
527 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
528 	}
529 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
530 	    IPPROTO_MAX * sizeof (connf_t));
531 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
532 
533 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
534 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
535 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
536 	}
537 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
538 	    sizeof (connf_t));
539 	ipst->ips_ipcl_udp_fanout = NULL;
540 
541 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
542 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
543 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
544 	}
545 	kmem_free(ipst->ips_ipcl_iptun_fanout,
546 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
547 	ipst->ips_ipcl_iptun_fanout = NULL;
548 
549 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
550 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
551 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
552 	}
553 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
554 	    sizeof (connf_t));
555 	ipst->ips_ipcl_raw_fanout = NULL;
556 
557 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
558 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
559 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
560 	}
561 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
562 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
563 	ipst->ips_ipcl_globalhash_fanout = NULL;
564 
565 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
566 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
567 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
568 	ipst->ips_rts_clients = NULL;
569 }
570 
571 /*
572  * conn creation routine. initialize the conn, sets the reference
573  * and inserts it in the global hash table.
574  */
575 conn_t *
ipcl_conn_create(uint32_t type,int sleep,netstack_t * ns)576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
577 {
578 	conn_t	*connp;
579 	struct kmem_cache *conn_cache;
580 
581 	switch (type) {
582 	case IPCL_SCTPCONN:
583 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
584 			return (NULL);
585 		sctp_conn_init(connp);
586 		netstack_hold(ns);
587 		connp->conn_netstack = ns;
588 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
589 		connp->conn_ixa->ixa_conn_id = (long)connp;
590 		ipcl_globalhash_insert(connp);
591 		return (connp);
592 
593 	case IPCL_TCPCONN:
594 		conn_cache = tcp_conn_cache;
595 		break;
596 
597 	case IPCL_UDPCONN:
598 		conn_cache = udp_conn_cache;
599 		break;
600 
601 	case IPCL_RAWIPCONN:
602 		conn_cache = rawip_conn_cache;
603 		break;
604 
605 	case IPCL_RTSCONN:
606 		conn_cache = rts_conn_cache;
607 		break;
608 
609 	case IPCL_IPCCONN:
610 		conn_cache = ip_conn_cache;
611 		break;
612 
613 	default:
614 		connp = NULL;
615 		ASSERT(0);
616 	}
617 
618 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
619 		return (NULL);
620 
621 	connp->conn_ref = 1;
622 	netstack_hold(ns);
623 	connp->conn_netstack = ns;
624 	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
625 	connp->conn_ixa->ixa_conn_id = (long)connp;
626 	ipcl_globalhash_insert(connp);
627 	return (connp);
628 }
629 
630 void
ipcl_conn_destroy(conn_t * connp)631 ipcl_conn_destroy(conn_t *connp)
632 {
633 	mblk_t	*mp;
634 	netstack_t	*ns = connp->conn_netstack;
635 
636 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
637 	ASSERT(connp->conn_ref == 0);
638 	ASSERT(connp->conn_ioctlref == 0);
639 
640 	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
641 
642 	if (connp->conn_cred != NULL) {
643 		crfree(connp->conn_cred);
644 		connp->conn_cred = NULL;
645 		/* ixa_cred done in ipcl_conn_cleanup below */
646 	}
647 
648 	if (connp->conn_ht_iphc != NULL) {
649 		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
650 		connp->conn_ht_iphc = NULL;
651 		connp->conn_ht_iphc_allocated = 0;
652 		connp->conn_ht_iphc_len = 0;
653 		connp->conn_ht_ulp = NULL;
654 		connp->conn_ht_ulp_len = 0;
655 	}
656 	ip_pkt_free(&connp->conn_xmit_ipp);
657 
658 	ipcl_globalhash_remove(connp);
659 
660 	if (connp->conn_latch != NULL) {
661 		IPLATCH_REFRELE(connp->conn_latch);
662 		connp->conn_latch = NULL;
663 	}
664 	if (connp->conn_latch_in_policy != NULL) {
665 		IPPOL_REFRELE(connp->conn_latch_in_policy);
666 		connp->conn_latch_in_policy = NULL;
667 	}
668 	if (connp->conn_latch_in_action != NULL) {
669 		IPACT_REFRELE(connp->conn_latch_in_action);
670 		connp->conn_latch_in_action = NULL;
671 	}
672 	if (connp->conn_policy != NULL) {
673 		IPPH_REFRELE(connp->conn_policy, ns);
674 		connp->conn_policy = NULL;
675 	}
676 
677 	if (connp->conn_ipsec_opt_mp != NULL) {
678 		freemsg(connp->conn_ipsec_opt_mp);
679 		connp->conn_ipsec_opt_mp = NULL;
680 	}
681 
682 	if (connp->conn_flags & IPCL_TCPCONN) {
683 		tcp_t *tcp = connp->conn_tcp;
684 
685 		tcp_free(tcp);
686 		mp = tcp->tcp_timercache;
687 
688 		tcp->tcp_tcps = NULL;
689 
690 		/*
691 		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
692 		 * the mblk.
693 		 */
694 		if (tcp->tcp_rsrv_mp != NULL) {
695 			freeb(tcp->tcp_rsrv_mp);
696 			tcp->tcp_rsrv_mp = NULL;
697 			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
698 		}
699 
700 		ipcl_conn_cleanup(connp);
701 		connp->conn_flags = IPCL_TCPCONN;
702 		if (ns != NULL) {
703 			ASSERT(tcp->tcp_tcps == NULL);
704 			connp->conn_netstack = NULL;
705 			connp->conn_ixa->ixa_ipst = NULL;
706 			netstack_rele(ns);
707 		}
708 
709 		bzero(tcp, sizeof (tcp_t));
710 
711 		tcp->tcp_timercache = mp;
712 		tcp->tcp_connp = connp;
713 		kmem_cache_free(tcp_conn_cache, connp);
714 		return;
715 	}
716 
717 	if (connp->conn_flags & IPCL_SCTPCONN) {
718 		ASSERT(ns != NULL);
719 		sctp_free(connp);
720 		return;
721 	}
722 
723 	ipcl_conn_cleanup(connp);
724 	if (ns != NULL) {
725 		connp->conn_netstack = NULL;
726 		connp->conn_ixa->ixa_ipst = NULL;
727 		netstack_rele(ns);
728 	}
729 
730 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
731 	if (connp->conn_flags & IPCL_UDPCONN) {
732 		connp->conn_flags = IPCL_UDPCONN;
733 		kmem_cache_free(udp_conn_cache, connp);
734 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
735 		connp->conn_flags = IPCL_RAWIPCONN;
736 		connp->conn_proto = IPPROTO_ICMP;
737 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
738 		kmem_cache_free(rawip_conn_cache, connp);
739 	} else if (connp->conn_flags & IPCL_RTSCONN) {
740 		connp->conn_flags = IPCL_RTSCONN;
741 		kmem_cache_free(rts_conn_cache, connp);
742 	} else {
743 		connp->conn_flags = IPCL_IPCCONN;
744 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
745 		ASSERT(connp->conn_priv == NULL);
746 		kmem_cache_free(ip_conn_cache, connp);
747 	}
748 }
749 
750 /*
751  * Running in cluster mode - deregister listener information
752  */
753 static void
ipcl_conn_unlisten(conn_t * connp)754 ipcl_conn_unlisten(conn_t *connp)
755 {
756 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
757 	ASSERT(connp->conn_lport != 0);
758 
759 	if (cl_inet_unlisten != NULL) {
760 		sa_family_t	addr_family;
761 		uint8_t		*laddrp;
762 
763 		if (connp->conn_ipversion == IPV6_VERSION) {
764 			addr_family = AF_INET6;
765 			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
766 		} else {
767 			addr_family = AF_INET;
768 			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
769 		}
770 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
771 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
772 	}
773 	connp->conn_flags &= ~IPCL_CL_LISTENER;
774 }
775 
776 /*
777  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
778  * which table the conn belonged to). So for debugging we can see which hash
779  * table this connection was in.
780  */
781 #define	IPCL_HASH_REMOVE(connp)	{					\
782 	connf_t	*connfp = (connp)->conn_fanout;				\
783 	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
784 	if (connfp != NULL) {						\
785 		mutex_enter(&connfp->connf_lock);			\
786 		if ((connp)->conn_next != NULL)				\
787 			(connp)->conn_next->conn_prev =			\
788 			    (connp)->conn_prev;				\
789 		if ((connp)->conn_prev != NULL)				\
790 			(connp)->conn_prev->conn_next =			\
791 			    (connp)->conn_next;				\
792 		else							\
793 			connfp->connf_head = (connp)->conn_next;	\
794 		(connp)->conn_fanout = NULL;				\
795 		(connp)->conn_next = NULL;				\
796 		(connp)->conn_prev = NULL;				\
797 		(connp)->conn_flags |= IPCL_REMOVED;			\
798 		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
799 			ipcl_conn_unlisten((connp));			\
800 		CONN_DEC_REF((connp));					\
801 		mutex_exit(&connfp->connf_lock);			\
802 	}								\
803 }
804 
805 void
ipcl_hash_remove(conn_t * connp)806 ipcl_hash_remove(conn_t *connp)
807 {
808 	uint8_t		protocol = connp->conn_proto;
809 
810 	IPCL_HASH_REMOVE(connp);
811 	if (protocol == IPPROTO_RSVP)
812 		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
813 }
814 
815 /*
816  * The whole purpose of this function is allow removal of
817  * a conn_t from the connected hash for timewait reclaim.
818  * This is essentially a TW reclaim fastpath where timewait
819  * collector checks under fanout lock (so no one else can
820  * get access to the conn_t) that refcnt is 2 i.e. one for
821  * TCP and one for the classifier hash list. If ref count
822  * is indeed 2, we can just remove the conn under lock and
823  * avoid cleaning up the conn under squeue. This gives us
824  * improved performance.
825  */
826 void
ipcl_hash_remove_locked(conn_t * connp,connf_t * connfp)827 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
828 {
829 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
830 	ASSERT(MUTEX_HELD(&connp->conn_lock));
831 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
832 
833 	if ((connp)->conn_next != NULL) {
834 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
835 	}
836 	if ((connp)->conn_prev != NULL) {
837 		(connp)->conn_prev->conn_next = (connp)->conn_next;
838 	} else {
839 		connfp->connf_head = (connp)->conn_next;
840 	}
841 	(connp)->conn_fanout = NULL;
842 	(connp)->conn_next = NULL;
843 	(connp)->conn_prev = NULL;
844 	(connp)->conn_flags |= IPCL_REMOVED;
845 	ASSERT((connp)->conn_ref == 2);
846 	(connp)->conn_ref--;
847 }
848 
849 #define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
850 	ASSERT((connp)->conn_fanout == NULL);				\
851 	ASSERT((connp)->conn_next == NULL);				\
852 	ASSERT((connp)->conn_prev == NULL);				\
853 	if ((connfp)->connf_head != NULL) {				\
854 		(connfp)->connf_head->conn_prev = (connp);		\
855 		(connp)->conn_next = (connfp)->connf_head;		\
856 	}								\
857 	(connp)->conn_fanout = (connfp);				\
858 	(connfp)->connf_head = (connp);					\
859 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
860 	    IPCL_CONNECTED;						\
861 	CONN_INC_REF(connp);						\
862 }
863 
864 #define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
865 	IPCL_HASH_REMOVE((connp));					\
866 	mutex_enter(&(connfp)->connf_lock);				\
867 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
868 	mutex_exit(&(connfp)->connf_lock);				\
869 }
870 
/*
 * Insert connp (bound to a specific local address) into bucket connfp.
 * The walk skips past all entries whose local address is specific and
 * inserts connp just before the first "any"-bound entry, so that
 * more-specific bindings are encountered first when the bucket is
 * scanned.  Marks the conn IPCL_BOUND and takes a linkage reference.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
897 
/*
 * Insert a wildcard-bound connp into bucket connfp.  A v4-mapped wildcard
 * is inserted just before the first unspecified (v6 "any") wildcard of
 * the same zone, so v4-mapped wildcards precede v6-unspecified ones
 * within a zone; any other conn is appended at the tail.  Marks the conn
 * IPCL_BOUND and takes a linkage reference.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
927 
/*
 * Function wrapper around the IPCL_HASH_INSERT_WILDCARD macro so the
 * wildcard insertion can be invoked as an ordinary function call.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
933 
/*
 * Because the classifier is used to classify inbound packets, the destination
 * address is meant to be our local tunnel address (tunnel source), and the
 * source the remote tunnel address (tunnel destination).
 *
 * Note that conn_proto can't be used for fanout since the upper protocol
 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 *
 * Returns the matching conn with a reference held, or NULL.
 */
conn_t *
ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*connp;

	/* first look for IPv4 tunnel links */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
			break;
	}
	/* Note: on this path we reach "done" with connf_lock still held. */
	if (connp != NULL)
		goto done;

	mutex_exit(&connfp->connf_lock);

	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
	    INADDR_ANY)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
			break;
	}
done:
	/* Take the caller's reference before dropping the bucket lock. */
	if (connp != NULL)
		CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
976 
977 conn_t *
ipcl_iptun_classify_v6(in6_addr_t * src,in6_addr_t * dst,ip_stack_t * ipst)978 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
979 {
980 	connf_t	*connfp;
981 	conn_t	*connp;
982 
983 	/* Look for an IPv6 tunnel link */
984 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
985 	mutex_enter(&connfp->connf_lock);
986 	for (connp = connfp->connf_head; connp != NULL;
987 	    connp = connp->conn_next) {
988 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
989 			CONN_INC_REF(connp);
990 			break;
991 		}
992 	}
993 	mutex_exit(&connfp->connf_lock);
994 	return (connp);
995 }
996 
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 *
 * Returns 0 on success, or EADDRNOTAVAIL if another raw socket in the
 * same zone and address family already holds a conflicting bind on lport.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict when port, zone and family all match and either
		 * side is bound to a wildcard (unspecified or v4-mapped
		 * any) address, or both are bound to the same local
		 * address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Pick the insertion flavor from how specific the binding is:
	 * connected (remote address set), bound (local address only), or
	 * wildcard (neither).
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1046 
1047 static int
ipcl_iptun_hash_insert(conn_t * connp,ip_stack_t * ipst)1048 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 {
1050 	connf_t	*connfp;
1051 	conn_t	*tconnp;
1052 	ipaddr_t laddr = connp->conn_laddr_v4;
1053 	ipaddr_t faddr = connp->conn_faddr_v4;
1054 
1055 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 	mutex_enter(&connfp->connf_lock);
1057 	for (tconnp = connfp->connf_head; tconnp != NULL;
1058 	    tconnp = tconnp->conn_next) {
1059 		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 			/* A tunnel is already bound to these addresses. */
1061 			mutex_exit(&connfp->connf_lock);
1062 			return (EADDRINUSE);
1063 		}
1064 	}
1065 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 	mutex_exit(&connfp->connf_lock);
1067 	return (0);
1068 }
1069 
1070 static int
ipcl_iptun_hash_insert_v6(conn_t * connp,ip_stack_t * ipst)1071 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 {
1073 	connf_t	*connfp;
1074 	conn_t	*tconnp;
1075 	in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 	in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 
1078 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 	mutex_enter(&connfp->connf_lock);
1080 	for (tconnp = connfp->connf_head; tconnp != NULL;
1081 	    tconnp = tconnp->conn_next) {
1082 		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 			/* A tunnel is already bound to these addresses. */
1084 			mutex_exit(&connfp->connf_lock);
1085 			return (EADDRINUSE);
1086 		}
1087 	}
1088 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 	mutex_exit(&connfp->connf_lock);
1090 	return (0);
1091 }
1092 
1093 /*
1094  * Check for a MAC exemption conflict on a labeled system.  Note that for
1095  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096  * transport layer.  This check is for binding all other protocols.
1097  *
1098  * Returns true if there's a conflict.
1099  */
1100 static boolean_t
check_exempt_conflict_v4(conn_t * connp,ip_stack_t * ipst)1101 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 {
1103 	connf_t	*connfp;
1104 	conn_t *tconn;
1105 
1106 	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 	mutex_enter(&connfp->connf_lock);
1108 	for (tconn = connfp->connf_head; tconn != NULL;
1109 	    tconn = tconn->conn_next) {
1110 		/* We don't allow v4 fallback for v6 raw socket */
1111 		if (connp->conn_family != tconn->conn_family)
1112 			continue;
1113 		/* If neither is exempt, then there's no conflict */
1114 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 			continue;
1117 		/* We are only concerned about sockets for a different zone */
1118 		if (connp->conn_zoneid == tconn->conn_zoneid)
1119 			continue;
1120 		/* If both are bound to different specific addrs, ok */
1121 		if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 		    tconn->conn_laddr_v4 != INADDR_ANY &&
1123 		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 			continue;
1125 		/* These two conflict; fail */
1126 		break;
1127 	}
1128 	mutex_exit(&connfp->connf_lock);
1129 	return (tconn != NULL);
1130 }
1131 
1132 static boolean_t
check_exempt_conflict_v6(conn_t * connp,ip_stack_t * ipst)1133 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 {
1135 	connf_t	*connfp;
1136 	conn_t *tconn;
1137 
1138 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139 	mutex_enter(&connfp->connf_lock);
1140 	for (tconn = connfp->connf_head; tconn != NULL;
1141 	    tconn = tconn->conn_next) {
1142 		/* We don't allow v4 fallback for v6 raw socket */
1143 		if (connp->conn_family != tconn->conn_family)
1144 			continue;
1145 		/* If neither is exempt, then there's no conflict */
1146 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148 			continue;
1149 		/* We are only concerned about sockets for a different zone */
1150 		if (connp->conn_zoneid == tconn->conn_zoneid)
1151 			continue;
1152 		/* If both are bound to different addrs, ok */
1153 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156 		    &tconn->conn_laddr_v6))
1157 			continue;
1158 		/* These two conflict; fail */
1159 		break;
1160 	}
1161 	mutex_exit(&connfp->connf_lock);
1162 	return (tconn != NULL);
1163 }
1164 
1165 /*
1166  * (v4, v6) bind hash insertion routines
1167  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168  */
1169 
1170 int
ipcl_bind_insert(conn_t * connp)1171 ipcl_bind_insert(conn_t *connp)
1172 {
1173 	if (connp->conn_ipversion == IPV6_VERSION)
1174 		return (ipcl_bind_insert_v6(connp));
1175 	else
1176 		return (ipcl_bind_insert_v4(connp));
1177 }
1178 
/*
 * Insert an IPv4 conn into the appropriate bind-time fanout for its
 * protocol.  Returns 0 on success, EADDRINUSE on a labeled-system MAC
 * exemption conflict, or the result of the SCTP insertion.
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	/* IP tunnels are hashed by their addresses, not port/protocol. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Port-less protocols check MAC exemption conflicts here;
		 * transports with ports do it at the transport layer.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* UDP fans out on the local port, others on the protocol. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insertion flavor tracks how specific the binding is. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* RSVP binds require the ill input functions to be reset. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* If a cl_inet_listen hook is installed, mark and call it. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1243 
/*
 * Insert an IPv6 conn into the appropriate bind-time fanout for its
 * protocol.  Returns 0 on success, EADDRINUSE on a labeled-system MAC
 * exemption conflict, or the result of the SCTP insertion.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	/* IP tunnels are hashed by their addresses, not port/protocol. */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Port-less protocols check MAC exemption conflicts here;
		 * transports with ports do it at the transport layer.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* UDP fans out on the local port, others on the protocol. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion flavor tracks how specific the binding is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* If a cl_inet_listen hook is installed, mark and call it. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			/*
			 * This routine serves conns whose conn_ipversion may
			 * still be IPv4; pass the hook the matching family
			 * and bound address.
			 */
			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1316 
1317 /*
1318  * ipcl_conn_hash insertion routines.
1319  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320  */
1321 
1322 int
ipcl_conn_insert(conn_t * connp)1323 ipcl_conn_insert(conn_t *connp)
1324 {
1325 	if (connp->conn_ipversion == IPV6_VERSION)
1326 		return (ipcl_conn_insert_v6(connp));
1327 	else
1328 		return (ipcl_conn_insert_v4(connp));
1329 }
1330 
/*
 * Insert a fully-specified IPv4 conn into the connection fanout (TCP),
 * the SCTP raw fanout, or the UDP/protocol fanout.  Returns 0 on success
 * or EADDRINUSE if the TCP four-tuple (within the zone) already exists,
 * or on a labeled-system MAC exemption conflict.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	/* IP tunnels are hashed by their addresses, not port/protocol. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * NOTE(review): connf_lock is dropped across the
			 * remove and the duplicate scan above is not
			 * revalidated after re-entry — confirm this is
			 * acceptable for the rebind case.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* UDP fans out on the local port, others on the protocol. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insertion flavor tracks how specific the binding is. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1428 
/*
 * Insert a fully-specified IPv6 conn into the connection fanout (TCP),
 * the SCTP raw fanout, or the UDP/protocol fanout.  Returns 0 on success
 * or EADDRINUSE if the TCP four-tuple (within zone / bound interface)
 * already exists, or on a labeled-system MAC exemption conflict.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;
	uint_t		ifindex = connp->conn_bound_if;

	/* IP tunnels are hashed by their addresses, not port/protocol. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * NOTE(review): connf_lock is dropped across the
			 * remove and the duplicate scan above is not
			 * revalidated after re-entry — confirm this is
			 * acceptable for the rebind case.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Port-less protocols check MAC exemption conflicts here;
		 * transports with ports do it at the transport layer.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* UDP fans out on the local port, others on the protocol. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion flavor tracks how specific the binding is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1516 
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied.  Labels are also checked as described
 * above.  If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * NOTE(review): up is computed with TCP_PORTS_OFFSET but is also
	 * used in the UDP case below; this relies on both transports
	 * carrying the port pair at the same header offset — confirm.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Read the (remote, local) port pair as one 32-bit word. */
		ports = *(uint32_t *)up;
		/* First try an exact match in the connected-conn fanout. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);
		/* No fully-bound conn; fall back to the bind (listener) hash. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote (source) port, up[1] the local port. */
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* Labeled systems must also pass the tnrh label check. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnel payloads go through the iptun classifier. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1666 
/*
 * v6 counterpart of ipcl_classify_v4: look up the conn an inbound IPv6
 * packet belongs to.  Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t		*tcpha;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Read the (remote, local) port pair as one 32-bit word. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		/* First try an exact match in the connected-conn fanout. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No fully-bound conn; fall back to the bind (listener) hash. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Labeled systems must also pass the tnrh label check. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote (source) port, up[1] the local port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Labeled systems must also pass the tnrh label check. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnel payloads go through the iptun classifier. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
1801 
1802 /*
1803  * wrapper around ipcl_classify_(v4,v6) routines.
1804  */
1805 conn_t *
ipcl_classify(mblk_t * mp,ip_recv_attr_t * ira,ip_stack_t * ipst)1806 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 {
1808 	if (ira->ira_flags & IRAF_IS_IPV4) {
1809 		return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 		    ira->ira_ip_hdr_length, ira, ipst));
1811 	} else {
1812 		return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 		    ira->ira_ip_hdr_length, ira, ipst));
1814 	}
1815 }
1816 
/*
 * Only used to classify SCTP RAW sockets.
 *
 * First scan the fanout bucket for the packet's local port, matching
 * connected sockets on the full tuple and unconnected ones on local
 * address/port; if that yields nothing, fall back to the wildcard
 * (port 0) bucket.  On success the conn is returned with a reference
 * held (caller must CONN_DEC_REF); returns NULL if no conn matches or
 * the Trusted Extensions check (tsol_receive_local) rejects the match.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* ports packs two 16-bit port values; [1] is the local port */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Connected socket: require a full-tuple match */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Unconnected: match on local address and port only */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * Accept if the zone matches, the conn is bound to all
		 * zones, or a MAC-exempt conn receives on a shared address.
		 */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems the match must also pass the label check */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Return a referenced conn; caller does CONN_DEC_REF */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1923 
/*
 * kmem cache constructor for IPCL_TCPCONN conns.  The tcp_t is laid out
 * immediately after the conn_t in the itc_t buffer; the two structures
 * are zeroed, cross-linked and given their embedded locks plus a
 * preallocated timer mblk and transmit attribute structure.
 * Returns 0 on success or ENOMEM on allocation failure.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	/* Preallocate the TCP timer mblk; may fail under this kmflags */
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Undo the timer mblk allocation made above */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1957 
/*
 * kmem cache destructor for IPCL_TCPCONN conns: releases the timer mblk,
 * locks and transmit attributes set up by tcp_conn_constructor.
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	/* Verify the conn_t/tcp_t cross links established at construction */
	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
1983 
1984 /* ARGSUSED */
1985 static int
ip_conn_constructor(void * buf,void * cdrarg,int kmflags)1986 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 {
1988 	itc_t	*itc = (itc_t *)buf;
1989 	conn_t 	*connp = &itc->itc_conn;
1990 
1991 	bzero(connp, sizeof (conn_t));
1992 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 	connp->conn_flags = IPCL_IPCCONN;
1995 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 
1997 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 	if (connp->conn_ixa == NULL)
1999 		return (ENOMEM);
2000 	connp->conn_ixa->ixa_refcnt = 1;
2001 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 	return (0);
2003 }
2004 
/*
 * kmem cache destructor for IPCL_IPCCONN conns: tears down the locks and
 * transmit attributes set up by ip_conn_constructor.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	/* No private per-protocol state may remain attached */
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2026 
/*
 * kmem cache constructor for IPCL_UDPCONN conns.  The udp_t is laid out
 * immediately after the conn_t in the itc_t buffer; both are zeroed,
 * cross-linked and given locks plus transmit attributes.
 * Returns 0 on success or ENOMEM on allocation failure.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2053 
2054 /* ARGSUSED */
2055 static void
udp_conn_destructor(void * buf,void * cdrarg)2056 udp_conn_destructor(void *buf, void *cdrarg)
2057 {
2058 	itc_t	*itc = (itc_t *)buf;
2059 	conn_t 	*connp = &itc->itc_conn;
2060 	udp_t	*udp = (udp_t *)&itc[1];
2061 
2062 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 	ASSERT(udp->udp_connp == connp);
2064 	ASSERT(connp->conn_udp == udp);
2065 	mutex_destroy(&connp->conn_lock);
2066 	cv_destroy(&connp->conn_cv);
2067 	rw_destroy(&connp->conn_ilg_lock);
2068 
2069 	/* Can be NULL if constructor failed */
2070 	if (connp->conn_ixa != NULL) {
2071 		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 		ixa_refrele(connp->conn_ixa);
2075 	}
2076 }
2077 
/*
 * kmem cache constructor for IPCL_RAWIPCONN conns.  The icmp_t is laid
 * out immediately after the conn_t in the itc_t buffer.  conn_proto
 * starts out as IPPROTO_ICMP.  Returns 0 on success or ENOMEM on
 * allocation failure.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2104 
/*
 * kmem cache destructor for IPCL_RAWIPCONN conns: tears down the locks
 * and transmit attributes set up by rawip_conn_constructor.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	/* Verify the conn_t/icmp_t cross links established at construction */
	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2128 
2129 /* ARGSUSED */
2130 static int
rts_conn_constructor(void * buf,void * cdrarg,int kmflags)2131 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 {
2133 	itc_t	*itc = (itc_t *)buf;
2134 	conn_t 	*connp = &itc->itc_conn;
2135 	rts_t	*rts = (rts_t *)&itc[1];
2136 
2137 	bzero(connp, sizeof (conn_t));
2138 	bzero(rts, sizeof (rts_t));
2139 
2140 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 	connp->conn_rts = rts;
2143 	connp->conn_flags = IPCL_RTSCONN;
2144 	rts->rts_connp = connp;
2145 	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 	if (connp->conn_ixa == NULL)
2148 		return (ENOMEM);
2149 	connp->conn_ixa->ixa_refcnt = 1;
2150 	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 	return (0);
2152 }
2153 
/*
 * kmem cache destructor for IPCL_RTSCONN conns: tears down the locks and
 * transmit attributes set up by rts_conn_constructor.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	/* Verify the conn_t/rts_t cross links established at construction */
	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2177 
/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	/* The transmit attributes survive the conn's reuse; just reset them */
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved: everything
	 * from conn_start_clr to the end of the structure.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2235 
/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round robin multi-list locks are held only to get the
 * next member/insertion/deletion and contention should be negligible
 * if the multi-list is much greater than the number of cpus.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	/* CONN_G_HASH_SIZE is a power of two, so mask selects a bucket */
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2290 
2291 void
ipcl_globalhash_remove(conn_t * connp)2292 ipcl_globalhash_remove(conn_t *connp)
2293 {
2294 	struct connf_s	*connfp;
2295 
2296 	/*
2297 	 * We were never inserted in the global multi list.
2298 	 * IPCL_NONE variety is never inserted in the global multilist
2299 	 * since it is presumed to not need any cleanup and is transient.
2300 	 */
2301 	if (connp->conn_g_fanout == NULL)
2302 		return;
2303 
2304 	connfp = connp->conn_g_fanout;
2305 	mutex_enter(&connfp->connf_lock);
2306 	if (connp->conn_g_prev != NULL)
2307 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 	else
2309 		connfp->connf_head = connp->conn_g_next;
2310 	if (connp->conn_g_next != NULL)
2311 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 	mutex_exit(&connfp->connf_lock);
2313 
2314 	/* Better to stumble on a null pointer than to corrupt memory */
2315 	connp->conn_g_next = NULL;
2316 	connp->conn_g_prev = NULL;
2317 	connp->conn_g_fanout = NULL;
2318 }
2319 
/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * With the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being destroyed or not yet visible */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp stays valid (and its
			 * conn_g_next usable as our cursor) while the bucket
			 * lock is dropped around the callback.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/* Now safe to drop the ref taken one iteration ago */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the last conn processed in this bucket */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2370 
/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reversed: the packet's foreign port becomes the peer's local */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Addresses are swapped relative to the incoming packet */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2420 
/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.  We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed: the packet's foreign port becomes the peer's local */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2474 
2475 /*
2476  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478  * Only checks for connected entries i.e. no INADDR_ANY checks.
2479  */
2480 conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t * ipha,tcpha_t * tcpha,int min_state,ip_stack_t * ipst)2481 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482     ip_stack_t *ipst)
2483 {
2484 	uint32_t ports;
2485 	uint16_t *pports;
2486 	connf_t	*connfp;
2487 	conn_t	*tconnp;
2488 
2489 	pports = (uint16_t *)&ports;
2490 	pports[0] = tcpha->tha_fport;
2491 	pports[1] = tcpha->tha_lport;
2492 
2493 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 	    ports, ipst)];
2495 
2496 	mutex_enter(&connfp->connf_lock);
2497 	for (tconnp = connfp->connf_head; tconnp != NULL;
2498 	    tconnp = tconnp->conn_next) {
2499 
2500 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 		    tconnp->conn_tcp->tcp_state >= min_state) {
2503 
2504 			CONN_INC_REF(tconnp);
2505 			mutex_exit(&connfp->connf_lock);
2506 			return (tconnp);
2507 		}
2508 	}
2509 	mutex_exit(&connfp->connf_lock);
2510 	return (NULL);
2511 }
2512 
/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Reversed lookup: the packet's foreign port is the conn's local */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* conn_bound_if == 0 means not bound to any interface */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2555 
2556 /*
2557  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558  * a listener when changing state.
2559  */
2560 conn_t *
ipcl_lookup_listener_v4(uint16_t lport,ipaddr_t laddr,zoneid_t zoneid,ip_stack_t * ipst)2561 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562     ip_stack_t *ipst)
2563 {
2564 	connf_t		*bind_connfp;
2565 	conn_t		*connp;
2566 	tcp_t		*tcp;
2567 
2568 	/*
2569 	 * Avoid false matches for packets sent to an IP destination of
2570 	 * all zeros.
2571 	 */
2572 	if (laddr == 0)
2573 		return (NULL);
2574 
2575 	ASSERT(zoneid != ALL_ZONES);
2576 
2577 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 	mutex_enter(&bind_connfp->connf_lock);
2579 	for (connp = bind_connfp->connf_head; connp != NULL;
2580 	    connp = connp->conn_next) {
2581 		tcp = connp->conn_tcp;
2582 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2584 		    (tcp->tcp_listener == NULL)) {
2585 			CONN_INC_REF(connp);
2586 			mutex_exit(&bind_connfp->connf_lock);
2587 			return (connp);
2588 		}
2589 	}
2590 	mutex_exit(&bind_connfp->connf_lock);
2591 	return (NULL);
2592 }
2593 
/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t		*bind_connfp;
	conn_t		*connp = NULL;
	tcp_t		*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * Must match address/port/zone, honor any interface binding
		 * (conn_bound_if == 0 means unbound), and have no
		 * tcp_listener set.  Returns with a reference held.
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2633 
/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 *
 * Pass connp == NULL to start at the head of the bucket; each returned
 * conn carries a reference which is released on the next call (or must
 * be released by the caller when done).  conn_flags selects which conn
 * varieties are eligible.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL cursor means start from the bucket head */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Release the reference taken when connp was handed out earlier */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2684 
2685 #ifdef CONN_DEBUG
2686 /*
2687  * Trace of the last NBUF refhold/refrele
2688  */
2689 int
conn_trace_ref(conn_t * connp)2690 conn_trace_ref(conn_t *connp)
2691 {
2692 	int	last;
2693 	conn_trace_t	*ctb;
2694 
2695 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2696 	last = connp->conn_trace_last;
2697 	last++;
2698 	if (last == CONN_TRACE_MAX)
2699 		last = 0;
2700 
2701 	ctb = &connp->conn_trace_buf[last];
2702 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703 	connp->conn_trace_last = last;
2704 	return (1);
2705 }
2706 
2707 int
conn_untrace_ref(conn_t * connp)2708 conn_untrace_ref(conn_t *connp)
2709 {
2710 	int	last;
2711 	conn_trace_t	*ctb;
2712 
2713 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 	last = connp->conn_trace_last;
2715 	last++;
2716 	if (last == CONN_TRACE_MAX)
2717 		last = 0;
2718 
2719 	ctb = &connp->conn_trace_buf[last];
2720 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 	connp->conn_trace_last = last;
2722 	return (1);
2723 }
2724 #endif
2725