1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * IP PACKET CLASSIFIER
27 *
28 * The IP packet classifier provides mapping between IP packets and persistent
29 * connection state for connection-oriented protocols. It also provides
30 * interface for managing connection states.
31 *
32 * The connection state is kept in conn_t data structure and contains, among
33 * other things:
34 *
35 * o local/remote address and ports
36 * o Transport protocol
37 * o squeue for the connection (for TCP only)
38 * o reference counter
39 * o Connection state
40 * o hash table linkage
41 * o interface/ire information
42 * o credentials
43 * o ipsec policy
44 * o send and receive functions.
45 * o mutex lock.
46 *
47 * Connections use a reference counting scheme. They are freed when the
48 * reference counter drops to zero. A reference is incremented when connection
49 * is placed in a list or table, when incoming packet for the connection arrives
50 * and when connection is processed via squeue (squeue processing may be
51 * asynchronous and the reference protects the connection from being destroyed
52 * before its processing is finished).
53 *
54 * conn_recv is used to pass up packets to the ULP.
55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56 * a listener, and changes to tcp_input_listener as the listener has picked a
57 * good squeue. For other cases it is set to tcp_input_data.
58 *
59 * conn_recvicmp is used to pass up ICMP errors to the ULP.
60 *
61 * Classifier uses several hash tables:
62 *
63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
64 * ipcl_bind_fanout: contains all connections in BOUND state
65 * ipcl_proto_fanout: IPv4 protocol fanout
66 * ipcl_proto_fanout_v6: IPv6 protocol fanout
67 * ipcl_udp_fanout: contains all UDP connections
68 * ipcl_iptun_fanout: contains all IP tunnel connections
69 * ipcl_globalhash_fanout: contains all connections
70 *
71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72 * which need to view all existing connections.
73 *
74 * All tables are protected by per-bucket locks. When both per-bucket lock and
75 * connection lock need to be held, the per-bucket lock should be acquired
76 * first, followed by the connection lock.
77 *
78 * All functions doing search in one of these tables increment a reference
79 * counter on the connection found (if any). This reference should be dropped
80 * when the caller has finished processing the connection.
81 *
82 *
83 * INTERFACES:
84 * ===========
85 *
86 * Connection Lookup:
87 * ------------------
88 *
89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
91 *
92 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93 * it can't find any associated connection. If the connection is found, its
94 * reference counter is incremented.
95 *
96 * mp: mblock, containing packet header. The full header should fit
97 * into a single mblock. It should also contain at least full IP
98 * and TCP or UDP header.
99 *
100 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
101 *
102 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
103 * the packet.
104 *
105 * ira->ira_zoneid: The zone in which the returned connection must be; the
106 * zoneid corresponding to the ire_zoneid on the IRE located for
107 * the packet's destination address.
108 *
109 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110 * IRAF_TX_SHARED_ADDR flags
111 *
112 * For TCP connections, the lookup order is as follows:
113 * 5-tuple {src, dst, protocol, local port, remote port}
114 * lookup in ipcl_conn_fanout table.
115 * 3-tuple {dst, remote port, protocol} lookup in
116 * ipcl_bind_fanout table.
117 *
118 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
119 * remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
121 * to multiple UDP clients, which is handled in IP itself.
122 *
123 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
124 * determine which actual zone gets the segment. This is used only in a
125 * labeled environment. The matching rules are:
126 *
127 * - If it's not a multilevel port, then the label on the packet selects
128 * the zone. Unlabeled packets are delivered to the global zone.
129 *
130 * - If it's a multilevel port, then only the zone registered to receive
131 * packets on that port matches.
132 *
133 * Also, in a labeled environment, packet labels need to be checked. For fully
134 * bound TCP connections, we can assume that the packet label was checked
135 * during connection establishment, and doesn't need to be checked on each
136 * packet. For others, though, we need to check for strict equality or, for
137 * multilevel ports, membership in the range or set. This part currently does
138 * a tnrh lookup on each packet, but could be optimized to use cached results
139 * if that were necessary. (SCTP doesn't come through here, but if it did,
140 * we would apply the same rules as TCP.)
141 *
142 * An implication of the above is that fully-bound TCP sockets must always use
143 * distinct 4-tuples; they can't be discriminated by label alone.
144 *
145 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
146 * as there's no connection set-up handshake and no shared state.
147 *
148 * Labels on looped-back packets within a single zone do not need to be
149 * checked, as all processes in the same zone have the same label.
150 *
151 * Finally, for unlabeled packets received by a labeled system, special rules
152 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
153 * socket in the zone whose label matches the default label of the sender, if
154 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
155 * receiver's label must dominate the sender's default label.
156 *
157 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
158 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
159 * ip_stack);
160 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
164 *
165 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
166 * zoneid, ip_stack);
167 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
168 * zoneid, ip_stack);
169 *
170 * Lookup routine to find a listener with the tuple {lport, laddr,
171 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
172 * parameter interface index is also compared.
173 *
174 * void ipcl_walk(func, arg, ip_stack)
175 *
176 * Apply 'func' to every connection available. The 'func' is called as
177 * (*func)(connp, arg). The walk is non-atomic so connections may be
178 * created and destroyed during the walk. The CONN_CONDEMNED and
179 * CONN_INCIPIENT flags ensure that connections which are newly created
180 * or being destroyed are not selected by the walker.
181 *
182 * Table Updates
183 * -------------
184 *
185 * int ipcl_conn_insert(connp);
186 * int ipcl_conn_insert_v4(connp);
187 * int ipcl_conn_insert_v6(connp);
188 *
189 * Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments:
191 * connp conn_t to be inserted
192 *
193 * Return value :
194 * 0 if connp was inserted
195 * EADDRINUSE if the connection with the same tuple
196 * already exists.
197 *
198 * int ipcl_bind_insert(connp);
199 * int ipcl_bind_insert_v4(connp);
200 * int ipcl_bind_insert_v6(connp);
201 *
202 * Insert 'connp' in ipcl_bind_fanout.
 *	Arguments:
204 * connp conn_t to be inserted
205 *
206 *
207 * void ipcl_hash_remove(connp);
208 *
209 * Removes the 'connp' from the connection fanout table.
210 *
211 * Connection Creation/Destruction
212 * -------------------------------
213 *
214 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
215 *
216 * Creates a new conn based on the type flag, inserts it into
217 * globalhash table.
218 *
219 * type: This flag determines the type of conn_t which needs to be
220 * created i.e., which kmem_cache it comes from.
221 * IPCL_TCPCONN indicates a TCP connection
222 * IPCL_SCTPCONN indicates a SCTP connection
223 * IPCL_UDPCONN indicates a UDP conn_t.
224 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
225 * IPCL_RTSCONN indicates a RTS conn_t.
226 * IPCL_IPCCONN indicates all other connections.
227 *
228 * void ipcl_conn_destroy(connp)
229 *
230 * Destroys the connection state, removes it from the global
231 * connection hash table and frees its memory.
232 */
233
234 #include <sys/types.h>
235 #include <sys/stream.h>
236 #include <sys/stropts.h>
237 #include <sys/sysmacros.h>
238 #include <sys/strsubr.h>
239 #include <sys/strsun.h>
240 #define _SUN_TPI_VERSION 2
241 #include <sys/ddi.h>
242 #include <sys/cmn_err.h>
243 #include <sys/debug.h>
244
245 #include <sys/systm.h>
246 #include <sys/param.h>
247 #include <sys/kmem.h>
248 #include <sys/isa_defs.h>
249 #include <inet/common.h>
250 #include <netinet/ip6.h>
251 #include <netinet/icmp6.h>
252
253 #include <inet/ip.h>
254 #include <inet/ip_if.h>
255 #include <inet/ip_ire.h>
256 #include <inet/ip6.h>
257 #include <inet/ip_ndp.h>
258 #include <inet/ip_impl.h>
259 #include <inet/udp_impl.h>
260 #include <inet/sctp_ip.h>
261 #include <inet/sctp/sctp_impl.h>
262 #include <inet/rawip_impl.h>
263 #include <inet/rts_impl.h>
264 #include <inet/iptun/iptun_impl.h>
265
266 #include <sys/cpuvar.h>
267
268 #include <inet/ipclassifier.h>
269 #include <inet/tcp.h>
270 #include <inet/ipsec_impl.h>
271
272 #include <sys/tsol/tnet.h>
273 #include <sys/sockio.h>
274
/* Old value for compatibility. Settable in /etc/system */
276 uint_t tcp_conn_hash_size = 0;
277
/* New value. Zero means choose automatically. Settable in /etc/system */
279 uint_t ipcl_conn_hash_size = 0;
280 uint_t ipcl_conn_hash_memfactor = 8192;
281 uint_t ipcl_conn_hash_maxsize = 82500;
282
283 /* bind/udp fanout table size */
284 uint_t ipcl_bind_fanout_size = 512;
285 uint_t ipcl_udp_fanout_size = 16384;
286
287 /* Raw socket fanout size. Must be a power of 2. */
288 uint_t ipcl_raw_fanout_size = 256;
289
290 /*
291 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
292 * expect that most large deployments would have hundreds of tunnels, and
293 * thousands in the extreme case.
294 */
295 uint_t ipcl_iptun_fanout_size = 6143;
296
297 /*
298 * Power of 2^N Primes useful for hashing for N of 0-28,
299 * these primes are the nearest prime <= 2^N - 2^(N-2).
300 */
301
302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
303 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
304 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
305 50331599, 100663291, 201326557, 0}
306
/*
 * Wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.  The filler pads the union out to the
 * cache-aligned size of conn_s; protocol-specific state (tcp_t, udp_t,
 * ...) is allocated immediately after the itc_t in the same cache object.
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
315
316 struct kmem_cache *tcp_conn_cache;
317 struct kmem_cache *ip_conn_cache;
318 extern struct kmem_cache *sctp_conn_cache;
319 struct kmem_cache *udp_conn_cache;
320 struct kmem_cache *rawip_conn_cache;
321 struct kmem_cache *rts_conn_cache;
322
323 extern void tcp_timermp_free(tcp_t *);
324 extern mblk_t *tcp_timermp_alloc(int);
325
326 static int ip_conn_constructor(void *, void *, int);
327 static void ip_conn_destructor(void *, void *);
328
329 static int tcp_conn_constructor(void *, void *, int);
330 static void tcp_conn_destructor(void *, void *);
331
332 static int udp_conn_constructor(void *, void *, int);
333 static void udp_conn_destructor(void *, void *);
334
335 static int rawip_conn_constructor(void *, void *, int);
336 static void rawip_conn_destructor(void *, void *);
337
338 static int rts_conn_constructor(void *, void *, int);
339 static void rts_conn_destructor(void *, void *);
340
/*
 * Global (for all stack instances) init routine: registers one kmem cache
 * per conn_t flavor.  Except for ip_conn_cache (a bare conn_t), each cache
 * object is an itc_t (cache-aligned conn_t) followed by the protocol's
 * private state (tcp_t, udp_t, icmp_t, rts_t).
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	/* Only the TCP cache registers a reclaim callback. */
	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}
372
373 /*
 * ipclassifier initialization routine, sets up hash tables.
375 */
376 void
ipcl_init(ip_stack_t * ipst)377 ipcl_init(ip_stack_t *ipst)
378 {
379 int i;
380 int sizes[] = P2Ps();
381
382 /*
383 * Calculate size of conn fanout table from /etc/system settings
384 */
385 if (ipcl_conn_hash_size != 0) {
386 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
387 } else if (tcp_conn_hash_size != 0) {
388 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
389 } else {
390 extern pgcnt_t freemem;
391
392 ipst->ips_ipcl_conn_fanout_size =
393 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
394
395 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
396 ipst->ips_ipcl_conn_fanout_size =
397 ipcl_conn_hash_maxsize;
398 }
399 }
400
401 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
402 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
403 break;
404 }
405 }
406 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
407 /* Out of range, use the 2^16 value */
408 ipst->ips_ipcl_conn_fanout_size = sizes[16];
409 }
410
411 /* Take values from /etc/system */
412 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
413 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
414 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
415 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
416
417 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
418
419 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
420 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
421
422 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
423 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
424 MUTEX_DEFAULT, NULL);
425 }
426
427 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
428 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
429
430 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
431 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
432 MUTEX_DEFAULT, NULL);
433 }
434
435 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
436 sizeof (connf_t), KM_SLEEP);
437 for (i = 0; i < IPPROTO_MAX; i++) {
438 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
439 MUTEX_DEFAULT, NULL);
440 }
441
442 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
443 sizeof (connf_t), KM_SLEEP);
444 for (i = 0; i < IPPROTO_MAX; i++) {
445 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
446 MUTEX_DEFAULT, NULL);
447 }
448
449 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
450 mutex_init(&ipst->ips_rts_clients->connf_lock,
451 NULL, MUTEX_DEFAULT, NULL);
452
453 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
454 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
455 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
456 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
457 MUTEX_DEFAULT, NULL);
458 }
459
460 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
461 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
462 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
463 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
464 MUTEX_DEFAULT, NULL);
465 }
466
467 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
468 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
469 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
470 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
471 MUTEX_DEFAULT, NULL);
472 }
473
474 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
475 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
476 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
477 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
478 NULL, MUTEX_DEFAULT, NULL);
479 }
480 }
481
482 void
ipcl_g_destroy(void)483 ipcl_g_destroy(void)
484 {
485 kmem_cache_destroy(ip_conn_cache);
486 kmem_cache_destroy(tcp_conn_cache);
487 kmem_cache_destroy(udp_conn_cache);
488 kmem_cache_destroy(rawip_conn_cache);
489 kmem_cache_destroy(rts_conn_cache);
490 }
491
492 /*
493 * All user-level and kernel use of the stack must be gone
494 * by now.
495 */
496 void
ipcl_destroy(ip_stack_t * ipst)497 ipcl_destroy(ip_stack_t *ipst)
498 {
499 int i;
500
501 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
502 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
503 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
504 }
505 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
506 sizeof (connf_t));
507 ipst->ips_ipcl_conn_fanout = NULL;
508
509 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
510 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
511 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
512 }
513 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
514 sizeof (connf_t));
515 ipst->ips_ipcl_bind_fanout = NULL;
516
517 for (i = 0; i < IPPROTO_MAX; i++) {
518 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
519 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
520 }
521 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
522 IPPROTO_MAX * sizeof (connf_t));
523 ipst->ips_ipcl_proto_fanout_v4 = NULL;
524
525 for (i = 0; i < IPPROTO_MAX; i++) {
526 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
527 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
528 }
529 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
530 IPPROTO_MAX * sizeof (connf_t));
531 ipst->ips_ipcl_proto_fanout_v6 = NULL;
532
533 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
534 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
535 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
536 }
537 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
538 sizeof (connf_t));
539 ipst->ips_ipcl_udp_fanout = NULL;
540
541 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
542 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
543 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
544 }
545 kmem_free(ipst->ips_ipcl_iptun_fanout,
546 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
547 ipst->ips_ipcl_iptun_fanout = NULL;
548
549 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
550 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
551 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
552 }
553 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
554 sizeof (connf_t));
555 ipst->ips_ipcl_raw_fanout = NULL;
556
557 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
558 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
559 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
560 }
561 kmem_free(ipst->ips_ipcl_globalhash_fanout,
562 sizeof (connf_t) * CONN_G_HASH_SIZE);
563 ipst->ips_ipcl_globalhash_fanout = NULL;
564
565 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
566 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
567 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
568 ipst->ips_rts_clients = NULL;
569 }
570
571 /*
572 * conn creation routine. initialize the conn, sets the reference
573 * and inserts it in the global hash table.
574 */
575 conn_t *
ipcl_conn_create(uint32_t type,int sleep,netstack_t * ns)576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
577 {
578 conn_t *connp;
579 struct kmem_cache *conn_cache;
580
581 switch (type) {
582 case IPCL_SCTPCONN:
583 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
584 return (NULL);
585 sctp_conn_init(connp);
586 netstack_hold(ns);
587 connp->conn_netstack = ns;
588 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
589 connp->conn_ixa->ixa_conn_id = (long)connp;
590 ipcl_globalhash_insert(connp);
591 return (connp);
592
593 case IPCL_TCPCONN:
594 conn_cache = tcp_conn_cache;
595 break;
596
597 case IPCL_UDPCONN:
598 conn_cache = udp_conn_cache;
599 break;
600
601 case IPCL_RAWIPCONN:
602 conn_cache = rawip_conn_cache;
603 break;
604
605 case IPCL_RTSCONN:
606 conn_cache = rts_conn_cache;
607 break;
608
609 case IPCL_IPCCONN:
610 conn_cache = ip_conn_cache;
611 break;
612
613 default:
614 connp = NULL;
615 ASSERT(0);
616 }
617
618 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
619 return (NULL);
620
621 connp->conn_ref = 1;
622 netstack_hold(ns);
623 connp->conn_netstack = ns;
624 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
625 connp->conn_ixa->ixa_conn_id = (long)connp;
626 ipcl_globalhash_insert(connp);
627 return (connp);
628 }
629
/*
 * Final destruction of a conn_t: called when the last reference has been
 * dropped.  Releases credentials, cached headers and IPsec state, removes
 * the conn from the global hash, drops the netstack hold and returns the
 * conn to its kmem cache.  The teardown order below is significant.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached transmit header template, if any. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		/* conn_ht_ulp points into conn_ht_iphc; just clear it. */
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release latched and attached IPsec policy state. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t *tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Save the timer mblk; it survives the bzero of the tcp_t. */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Reset flags so the cached conn is reusable as a TCP conn. */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/* Wipe the tcp_t, then restore the fields that persist. */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP frees (or caches) the conn itself. */
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* Raw conns default back to ICMP for the next user. */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
749
750 /*
751 * Running in cluster mode - deregister listener information
752 */
753 static void
ipcl_conn_unlisten(conn_t * connp)754 ipcl_conn_unlisten(conn_t *connp)
755 {
756 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
757 ASSERT(connp->conn_lport != 0);
758
759 if (cl_inet_unlisten != NULL) {
760 sa_family_t addr_family;
761 uint8_t *laddrp;
762
763 if (connp->conn_ipversion == IPV6_VERSION) {
764 addr_family = AF_INET6;
765 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
766 } else {
767 addr_family = AF_INET;
768 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
769 }
770 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
771 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
772 }
773 connp->conn_flags &= ~IPCL_CL_LISTENER;
774 }
775
/*
 * Unlink 'connp' from whichever fanout bucket it currently occupies and
 * drop the hash table's reference; a no-op when conn_fanout is NULL.
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to).  So for debugging we can see which
 * hash table this connection was in.  Cluster listeners are deregistered
 * here as well.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
804
805 void
ipcl_hash_remove(conn_t * connp)806 ipcl_hash_remove(conn_t *connp)
807 {
808 uint8_t protocol = connp->conn_proto;
809
810 IPCL_HASH_REMOVE(connp);
811 if (protocol == IPPROTO_RSVP)
812 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
813 }
814
815 /*
816 * The whole purpose of this function is allow removal of
817 * a conn_t from the connected hash for timewait reclaim.
818 * This is essentially a TW reclaim fastpath where timewait
819 * collector checks under fanout lock (so no one else can
820 * get access to the conn_t) that refcnt is 2 i.e. one for
821 * TCP and one for the classifier hash list. If ref count
822 * is indeed 2, we can just remove the conn under lock and
823 * avoid cleaning up the conn under squeue. This gives us
824 * improved performance.
825 */
826 void
ipcl_hash_remove_locked(conn_t * connp,connf_t * connfp)827 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
828 {
829 ASSERT(MUTEX_HELD(&connfp->connf_lock));
830 ASSERT(MUTEX_HELD(&connp->conn_lock));
831 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
832
833 if ((connp)->conn_next != NULL) {
834 (connp)->conn_next->conn_prev = (connp)->conn_prev;
835 }
836 if ((connp)->conn_prev != NULL) {
837 (connp)->conn_prev->conn_next = (connp)->conn_next;
838 } else {
839 connfp->connf_head = (connp)->conn_next;
840 }
841 (connp)->conn_fanout = NULL;
842 (connp)->conn_next = NULL;
843 (connp)->conn_prev = NULL;
844 (connp)->conn_flags |= IPCL_REMOVED;
845 ASSERT((connp)->conn_ref == 2);
846 (connp)->conn_ref--;
847 }
848
/*
 * Insert 'connp' at the head of the already-locked bucket 'connfp',
 * marking it IPCL_CONNECTED and taking a reference on behalf of the
 * hash table.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

/*
 * Unlocked variant: first unhash 'connp' from any table it is in, then
 * insert it into 'connfp' under the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a bound conn into 'connfp'.  Entries with a specific local
 * address are kept ahead of IPv4 wildcard (_IPCL_V4_MATCH_ANY) entries,
 * so 'connp' is placed just before the first wildcard entry (or at the
 * tail if there is none), and marked IPCL_BOUND.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard-bound conn into 'connfp'.  A v4-mapped wildcard bind
 * is placed before the first unspecified-address (in6addr_any) entry of
 * the same zone; everything else is appended at the tail.  Marks the
 * conn IPCL_BOUND and takes a hash-table reference.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
927
/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD for callers outside
 * this file: unhashes 'connp' (if currently hashed) and reinserts it into
 * 'connfp' as a wildcard bind.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
933
934 /*
935 * Because the classifier is used to classify inbound packets, the destination
936 * address is meant to be our local tunnel address (tunnel source), and the
937 * source the remote tunnel address (tunnel destination).
938 *
939 * Note that conn_proto can't be used for fanout since the upper protocol
940 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
941 */
942 conn_t *
ipcl_iptun_classify_v4(ipaddr_t * src,ipaddr_t * dst,ip_stack_t * ipst)943 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
944 {
945 connf_t *connfp;
946 conn_t *connp;
947
948 /* first look for IPv4 tunnel links */
949 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
950 mutex_enter(&connfp->connf_lock);
951 for (connp = connfp->connf_head; connp != NULL;
952 connp = connp->conn_next) {
953 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
954 break;
955 }
956 if (connp != NULL)
957 goto done;
958
959 mutex_exit(&connfp->connf_lock);
960
961 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
962 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
963 INADDR_ANY)];
964 mutex_enter(&connfp->connf_lock);
965 for (connp = connfp->connf_head; connp != NULL;
966 connp = connp->conn_next) {
967 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
968 break;
969 }
970 done:
971 if (connp != NULL)
972 CONN_INC_REF(connp);
973 mutex_exit(&connfp->connf_lock);
974 return (connp);
975 }
976
977 conn_t *
ipcl_iptun_classify_v6(in6_addr_t * src,in6_addr_t * dst,ip_stack_t * ipst)978 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
979 {
980 connf_t *connfp;
981 conn_t *connp;
982
983 /* Look for an IPv6 tunnel link */
984 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
985 mutex_enter(&connfp->connf_lock);
986 for (connp = connfp->connf_head; connp != NULL;
987 connp = connp->conn_next) {
988 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
989 CONN_INC_REF(connp);
990 break;
991 }
992 }
993 mutex_exit(&connfp->connf_lock);
994 return (connp);
995 }
996
997 /*
998 * This function is used only for inserting SCTP raw socket now.
999 * This may change later.
1000 *
1001 * Note that only one raw socket can be bound to a port. The param
1002 * lport is in network byte order.
1003 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Two raw sockets conflict when port, zone and address
		 * family all match, and either side has a wildcard local
		 * address (unspecified or v4-mapped any) or both are bound
		 * to the same specific local address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Pick the insert flavor from how much of the address tuple is
	 * specified: connected (remote set), bound (local only), or
	 * wildcard (neither).
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1046
1047 static int
ipcl_iptun_hash_insert(conn_t * connp,ip_stack_t * ipst)1048 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 {
1050 connf_t *connfp;
1051 conn_t *tconnp;
1052 ipaddr_t laddr = connp->conn_laddr_v4;
1053 ipaddr_t faddr = connp->conn_faddr_v4;
1054
1055 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 mutex_enter(&connfp->connf_lock);
1057 for (tconnp = connfp->connf_head; tconnp != NULL;
1058 tconnp = tconnp->conn_next) {
1059 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 /* A tunnel is already bound to these addresses. */
1061 mutex_exit(&connfp->connf_lock);
1062 return (EADDRINUSE);
1063 }
1064 }
1065 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 mutex_exit(&connfp->connf_lock);
1067 return (0);
1068 }
1069
1070 static int
ipcl_iptun_hash_insert_v6(conn_t * connp,ip_stack_t * ipst)1071 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 {
1073 connf_t *connfp;
1074 conn_t *tconnp;
1075 in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 in6_addr_t *faddr = &connp->conn_faddr_v6;
1077
1078 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 mutex_enter(&connfp->connf_lock);
1080 for (tconnp = connfp->connf_head; tconnp != NULL;
1081 tconnp = tconnp->conn_next) {
1082 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 /* A tunnel is already bound to these addresses. */
1084 mutex_exit(&connfp->connf_lock);
1085 return (EADDRINUSE);
1086 }
1087 }
1088 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 mutex_exit(&connfp->connf_lock);
1090 return (0);
1091 }
1092
1093 /*
1094 * Check for a MAC exemption conflict on a labeled system. Note that for
1095 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096 * transport layer. This check is for binding all other protocols.
1097 *
1098 * Returns true if there's a conflict.
1099 */
1100 static boolean_t
check_exempt_conflict_v4(conn_t * connp,ip_stack_t * ipst)1101 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 {
1103 connf_t *connfp;
1104 conn_t *tconn;
1105
1106 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 mutex_enter(&connfp->connf_lock);
1108 for (tconn = connfp->connf_head; tconn != NULL;
1109 tconn = tconn->conn_next) {
1110 /* We don't allow v4 fallback for v6 raw socket */
1111 if (connp->conn_family != tconn->conn_family)
1112 continue;
1113 /* If neither is exempt, then there's no conflict */
1114 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 continue;
1117 /* We are only concerned about sockets for a different zone */
1118 if (connp->conn_zoneid == tconn->conn_zoneid)
1119 continue;
1120 /* If both are bound to different specific addrs, ok */
1121 if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 tconn->conn_laddr_v4 != INADDR_ANY &&
1123 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 continue;
1125 /* These two conflict; fail */
1126 break;
1127 }
1128 mutex_exit(&connfp->connf_lock);
1129 return (tconn != NULL);
1130 }
1131
/*
 * IPv6 counterpart of check_exempt_conflict_v4(): scan the per-protocol
 * v6 fanout for a MAC-exemption conflict with connp.  Returns B_TRUE if
 * a conflicting conn exists.
 */
static boolean_t
check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconn;

	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
	mutex_enter(&connfp->connf_lock);
	for (tconn = connfp->connf_head; tconn != NULL;
	    tconn = tconn->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket */
		if (connp->conn_family != tconn->conn_family)
			continue;
		/* If neither is exempt, then there's no conflict */
		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
			continue;
		/* We are only concerned about sockets for a different zone */
		if (connp->conn_zoneid == tconn->conn_zoneid)
			continue;
		/* If both are bound to different addrs, ok */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
		    &tconn->conn_laddr_v6))
			continue;
		/* These two conflict; fail */
		break;
	}
	mutex_exit(&connfp->connf_lock);
	return (tconn != NULL);
}
1164
1165 /*
1166 * (v4, v6) bind hash insertion routines
1167 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168 */
1169
1170 int
ipcl_bind_insert(conn_t * connp)1171 ipcl_bind_insert(conn_t *connp)
1172 {
1173 if (connp->conn_ipversion == IPV6_VERSION)
1174 return (ipcl_bind_insert_v6(connp));
1175 else
1176 return (ipcl_bind_insert_v4(connp));
1177 }
1178
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels use their own dedicated fanout. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Non-UDP/TCP/SCTP protocols: check MAC-exemption
		 * conflicts on labeled systems, then fall through to the
		 * UDP case; the protocol test there routes them to the
		 * per-protocol fanout instead of the UDP fanout.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/*
		 * Insert flavor depends on how much of the address tuple
		 * is specified.
		 */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listen hook, if one is registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1243
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels use their own dedicated fanout. */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Non-UDP/TCP/SCTP protocols: check MAC-exemption
		 * conflicts on labeled systems, then fall through to the
		 * UDP case; the protocol test there routes them to the
		 * per-protocol fanout instead of the UDP fanout.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Insert flavor depends on how much of the address tuple
		 * is specified.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listen hook, if one is registered. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			/*
			 * A v6 socket may still be bound over IPv4; pass
			 * the hook the address family actually in use.
			 */
			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1316
1317 /*
1318 * ipcl_conn_hash insertion routines.
1319 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320 */
1321
1322 int
ipcl_conn_insert(conn_t * connp)1323 ipcl_conn_insert(conn_t *connp)
1324 {
1325 if (connp->conn_ipversion == IPV6_VERSION)
1326 return (ipcl_conn_insert_v6(connp));
1327 else
1328 return (ipcl_conn_insert_v4(connp));
1329 }
1330
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	/* IP tunnels use their own dedicated fanout. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * Drop the bucket lock for the removal
			 * (IPCL_HASH_REMOVE takes the old bucket's lock),
			 * then re-take it for the insert below.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/*
		 * Insert flavor depends on how much of the address tuple
		 * is specified.
		 */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1428
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;
	uint_t		ifindex = connp->conn_bound_if;

	/* IP tunnels use their own dedicated fanout. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * Drop the bucket lock for the removal
			 * (IPCL_HASH_REMOVE takes the old bucket's lock),
			 * then re-take it for the insert below.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; remove it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings; for
		 * port-based transports this is done at the transport
		 * layer instead.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Insert flavor depends on how much of the address tuple
		 * is specified.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1516
1517 /*
1518 * v4 packet classifying function. looks up the fanout table to
1519 * find the conn, the packet belongs to. returns the conn with
1520 * the reference held, null otherwise.
1521 *
1522 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523 * Lookup" comment block are applied. Labels are also checked as described
1524 * above. If the packet is from the inside (looped back), and is from the same
1525 * zone, then label checks are omitted.
1526 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * up points at the transport port pair; up[0] is the packet's
	 * source (our foreign) port, up[1] its destination (our local)
	 * port, both in network byte order.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports combined as the 32-bit conn-hash key. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Match on the 5-tuple, plus zone: the conn must
			 * be in the packet's zone, bound in all zones, or
			 * MAC-exempt on a shared address.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No established connection; try the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* Labeled systems: verify the packet may be delivered. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP: classify against the tunnel fanout. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1666
/*
 * v6 packet classifying function; IPv6 counterpart of ipcl_classify_v4().
 * Returns the matching conn with a reference held, NULL otherwise.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	tcpha_t	*tcpha;
	uint32_t ports;
	conn_t	*connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both TCP ports combined as the 32-bit conn-hash key. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Match on the 5-tuple, plus zone: the conn must
			 * be in the packet's zone, bound in all zones, or
			 * MAC-exempt on a shared address.
			 */
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No established connection; try the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Labeled systems: verify the packet may be delivered. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the source (foreign) port, up[1] the local. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Labeled systems: verify the packet may be delivered. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP: classify against the tunnel fanout. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
1801
1802 /*
1803 * wrapper around ipcl_classify_(v4,v6) routines.
1804 */
1805 conn_t *
ipcl_classify(mblk_t * mp,ip_recv_attr_t * ira,ip_stack_t * ipst)1806 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 {
1808 if (ira->ira_flags & IRAF_IS_IPV4) {
1809 return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 ira->ira_ip_hdr_length, ira, ipst));
1811 } else {
1812 return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 ira->ira_ip_hdr_length, ira, ipst));
1814 }
1815 }
1816
1817 /*
1818 * Only used to classify SCTP RAW sockets
1819 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* Second half of the combined ports word is the local port. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/* Pass 1: sockets hashed on the packet's local port. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		/*
		 * Connected sockets (specific foreign address) must match
		 * the full tuple; others only need a bind match.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * The conn must be in the packet's zone, bound in all
		 * zones, or MAC-exempt on a shared address.
		 */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* Labeled systems: verify the packet may be delivered. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Take a reference while still holding the bucket lock. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1923
/*
 * kmem cache constructor for TCP conns.  The cache buffer holds an itc_t
 * followed immediately by the tcp_t; both are zeroed and cross-linked
 * here.  Returns ENOMEM if the timer mblk or transmit attributes cannot
 * be allocated.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	/* Preallocate the mblk used by TCP's timer machinery. */
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/*
		 * NOTE(review): the mutex/cvs/rwlock initialized above are
		 * not destroyed on either failure path — presumably
		 * acceptable for a failed kmem constructor; confirm.
		 */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1957
1958 /* ARGSUSED */
1959 static void
tcp_conn_destructor(void * buf,void * cdrarg)1960 tcp_conn_destructor(void *buf, void *cdrarg)
1961 {
1962 itc_t *itc = (itc_t *)buf;
1963 conn_t *connp = &itc->itc_conn;
1964 tcp_t *tcp = (tcp_t *)&itc[1];
1965
1966 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967 ASSERT(tcp->tcp_connp == connp);
1968 ASSERT(connp->conn_tcp == tcp);
1969 tcp_timermp_free(tcp);
1970 mutex_destroy(&connp->conn_lock);
1971 cv_destroy(&connp->conn_cv);
1972 cv_destroy(&connp->conn_sq_cv);
1973 rw_destroy(&connp->conn_ilg_lock);
1974
1975 /* Can be NULL if constructor failed */
1976 if (connp->conn_ixa != NULL) {
1977 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980 ixa_refrele(connp->conn_ixa);
1981 }
1982 }
1983
1984 /* ARGSUSED */
1985 static int
ip_conn_constructor(void * buf,void * cdrarg,int kmflags)1986 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 {
1988 itc_t *itc = (itc_t *)buf;
1989 conn_t *connp = &itc->itc_conn;
1990
1991 bzero(connp, sizeof (conn_t));
1992 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 connp->conn_flags = IPCL_IPCCONN;
1995 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996
1997 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 if (connp->conn_ixa == NULL)
1999 return (ENOMEM);
2000 connp->conn_ixa->ixa_refcnt = 1;
2001 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 return (0);
2003 }
2004
2005 /* ARGSUSED */
2006 static void
ip_conn_destructor(void * buf,void * cdrarg)2007 ip_conn_destructor(void *buf, void *cdrarg)
2008 {
2009 itc_t *itc = (itc_t *)buf;
2010 conn_t *connp = &itc->itc_conn;
2011
2012 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013 ASSERT(connp->conn_priv == NULL);
2014 mutex_destroy(&connp->conn_lock);
2015 cv_destroy(&connp->conn_cv);
2016 rw_destroy(&connp->conn_ilg_lock);
2017
2018 /* Can be NULL if constructor failed */
2019 if (connp->conn_ixa != NULL) {
2020 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023 ixa_refrele(connp->conn_ixa);
2024 }
2025 }
2026
2027 /* ARGSUSED */
2028 static int
udp_conn_constructor(void * buf,void * cdrarg,int kmflags)2029 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 {
2031 itc_t *itc = (itc_t *)buf;
2032 conn_t *connp = &itc->itc_conn;
2033 udp_t *udp = (udp_t *)&itc[1];
2034
2035 bzero(connp, sizeof (conn_t));
2036 bzero(udp, sizeof (udp_t));
2037
2038 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040 connp->conn_udp = udp;
2041 connp->conn_flags = IPCL_UDPCONN;
2042 connp->conn_proto = IPPROTO_UDP;
2043 udp->udp_connp = connp;
2044 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046 if (connp->conn_ixa == NULL)
2047 return (ENOMEM);
2048 connp->conn_ixa->ixa_refcnt = 1;
2049 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051 return (0);
2052 }
2053
2054 /* ARGSUSED */
2055 static void
udp_conn_destructor(void * buf,void * cdrarg)2056 udp_conn_destructor(void *buf, void *cdrarg)
2057 {
2058 itc_t *itc = (itc_t *)buf;
2059 conn_t *connp = &itc->itc_conn;
2060 udp_t *udp = (udp_t *)&itc[1];
2061
2062 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 ASSERT(udp->udp_connp == connp);
2064 ASSERT(connp->conn_udp == udp);
2065 mutex_destroy(&connp->conn_lock);
2066 cv_destroy(&connp->conn_cv);
2067 rw_destroy(&connp->conn_ilg_lock);
2068
2069 /* Can be NULL if constructor failed */
2070 if (connp->conn_ixa != NULL) {
2071 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 ixa_refrele(connp->conn_ixa);
2075 }
2076 }
2077
2078 /* ARGSUSED */
2079 static int
rawip_conn_constructor(void * buf,void * cdrarg,int kmflags)2080 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 {
2082 itc_t *itc = (itc_t *)buf;
2083 conn_t *connp = &itc->itc_conn;
2084 icmp_t *icmp = (icmp_t *)&itc[1];
2085
2086 bzero(connp, sizeof (conn_t));
2087 bzero(icmp, sizeof (icmp_t));
2088
2089 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091 connp->conn_icmp = icmp;
2092 connp->conn_flags = IPCL_RAWIPCONN;
2093 connp->conn_proto = IPPROTO_ICMP;
2094 icmp->icmp_connp = connp;
2095 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097 if (connp->conn_ixa == NULL)
2098 return (ENOMEM);
2099 connp->conn_ixa->ixa_refcnt = 1;
2100 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102 return (0);
2103 }
2104
2105 /* ARGSUSED */
2106 static void
rawip_conn_destructor(void * buf,void * cdrarg)2107 rawip_conn_destructor(void *buf, void *cdrarg)
2108 {
2109 itc_t *itc = (itc_t *)buf;
2110 conn_t *connp = &itc->itc_conn;
2111 icmp_t *icmp = (icmp_t *)&itc[1];
2112
2113 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114 ASSERT(icmp->icmp_connp == connp);
2115 ASSERT(connp->conn_icmp == icmp);
2116 mutex_destroy(&connp->conn_lock);
2117 cv_destroy(&connp->conn_cv);
2118 rw_destroy(&connp->conn_ilg_lock);
2119
2120 /* Can be NULL if constructor failed */
2121 if (connp->conn_ixa != NULL) {
2122 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125 ixa_refrele(connp->conn_ixa);
2126 }
2127 }
2128
2129 /* ARGSUSED */
2130 static int
rts_conn_constructor(void * buf,void * cdrarg,int kmflags)2131 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 {
2133 itc_t *itc = (itc_t *)buf;
2134 conn_t *connp = &itc->itc_conn;
2135 rts_t *rts = (rts_t *)&itc[1];
2136
2137 bzero(connp, sizeof (conn_t));
2138 bzero(rts, sizeof (rts_t));
2139
2140 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 connp->conn_rts = rts;
2143 connp->conn_flags = IPCL_RTSCONN;
2144 rts->rts_connp = connp;
2145 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 if (connp->conn_ixa == NULL)
2148 return (ENOMEM);
2149 connp->conn_ixa->ixa_refcnt = 1;
2150 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 return (0);
2152 }
2153
2154 /* ARGSUSED */
2155 static void
rts_conn_destructor(void * buf,void * cdrarg)2156 rts_conn_destructor(void *buf, void *cdrarg)
2157 {
2158 itc_t *itc = (itc_t *)buf;
2159 conn_t *connp = &itc->itc_conn;
2160 rts_t *rts = (rts_t *)&itc[1];
2161
2162 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163 ASSERT(rts->rts_connp == connp);
2164 ASSERT(connp->conn_rts == rts);
2165 mutex_destroy(&connp->conn_lock);
2166 cv_destroy(&connp->conn_cv);
2167 rw_destroy(&connp->conn_ilg_lock);
2168
2169 /* Can be NULL if constructor failed */
2170 if (connp->conn_ixa != NULL) {
2171 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174 ixa_refrele(connp->conn_ixa);
2175 }
2176 }
2177
2178 /*
2179 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2180 * in the conn_t.
2181 *
2182 * Below we list all the pointers in the conn_t as a documentation aid.
2183 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2184 * If you add any pointers to the conn_t please add an ASSERT here
2185 * and #ifdef it out if it can't be actually asserted to be NULL.
2186 * In any case, we bzero most of the conn_t at the end of the function.
2187 */
2188 void
ipcl_conn_cleanup(conn_t * connp)2189 ipcl_conn_cleanup(conn_t *connp)
2190 {
2191 ip_xmit_attr_t *ixa;
2192
2193 ASSERT(connp->conn_latch == NULL);
2194 ASSERT(connp->conn_latch_in_policy == NULL);
2195 ASSERT(connp->conn_latch_in_action == NULL);
2196 #ifdef notdef
2197 ASSERT(connp->conn_rq == NULL);
2198 ASSERT(connp->conn_wq == NULL);
2199 #endif
2200 ASSERT(connp->conn_cred == NULL);
2201 ASSERT(connp->conn_g_fanout == NULL);
2202 ASSERT(connp->conn_g_next == NULL);
2203 ASSERT(connp->conn_g_prev == NULL);
2204 ASSERT(connp->conn_policy == NULL);
2205 ASSERT(connp->conn_fanout == NULL);
2206 ASSERT(connp->conn_next == NULL);
2207 ASSERT(connp->conn_prev == NULL);
2208 ASSERT(connp->conn_oper_pending_ill == NULL);
2209 ASSERT(connp->conn_ilg == NULL);
2210 ASSERT(connp->conn_drain_next == NULL);
2211 ASSERT(connp->conn_drain_prev == NULL);
2212 #ifdef notdef
2213 /* conn_idl is not cleared when removed from idl list */
2214 ASSERT(connp->conn_idl == NULL);
2215 #endif
2216 ASSERT(connp->conn_ipsec_opt_mp == NULL);
2217 #ifdef notdef
2218 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2219 ASSERT(connp->conn_netstack == NULL);
2220 #endif
2221
2222 ASSERT(connp->conn_helper_info == NULL);
2223 ASSERT(connp->conn_ixa != NULL);
2224 ixa = connp->conn_ixa;
2225 ASSERT(ixa->ixa_refcnt == 1);
2226 /* Need to preserve ixa_protocol */
2227 ixa_cleanup(ixa);
2228 ixa->ixa_flags = 0;
2229
2230 /* Clear out the conn_t fields that are not preserved */
2231 bzero(&connp->conn_start_clr,
2232 sizeof (conn_t) -
2233 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2234 }
2235
2236 /*
2237 * All conns are inserted in a global multi-list for the benefit of
2238 * walkers. The walk is guaranteed to walk all open conns at the time
2239 * of the start of the walk exactly once. This property is needed to
2240 * achieve some cleanups during unplumb of interfaces. This is achieved
2241 * as follows.
2242 *
2243 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244 * call the insert and delete functions below at creation and deletion
2245 * time respectively. The conn never moves or changes its position in this
2246 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247 * won't increase due to walkers, once the conn deletion has started. Note
2248 * that we can't remove the conn from the global list and then wait for
2249 * the refcnt to drop to zero, since walkers would then see a truncated
2250 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251 * conns until ip_open is ready to make them globally visible.
2252 * The global round robin multi-list locks are held only to get the
2253 * next member/insertion/deletion and contention should be negligible
2254 * if the multi-list is much greater than the number of cpus.
2255 */
2256 void
ipcl_globalhash_insert(conn_t * connp)2257 ipcl_globalhash_insert(conn_t *connp)
2258 {
2259 int index;
2260 struct connf_s *connfp;
2261 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
2262
2263 /*
2264 * No need for atomic here. Approximate even distribution
2265 * in the global lists is sufficient.
2266 */
2267 ipst->ips_conn_g_index++;
2268 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269
2270 connp->conn_g_prev = NULL;
2271 /*
2272 * Mark as INCIPIENT, so that walkers will ignore this
2273 * for now, till ip_open is ready to make it visible globally.
2274 */
2275 connp->conn_state_flags |= CONN_INCIPIENT;
2276
2277 connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278 /* Insert at the head of the list */
2279 mutex_enter(&connfp->connf_lock);
2280 connp->conn_g_next = connfp->connf_head;
2281 if (connp->conn_g_next != NULL)
2282 connp->conn_g_next->conn_g_prev = connp;
2283 connfp->connf_head = connp;
2284
2285 /* The fanout bucket this conn points to */
2286 connp->conn_g_fanout = connfp;
2287
2288 mutex_exit(&connfp->connf_lock);
2289 }
2290
2291 void
ipcl_globalhash_remove(conn_t * connp)2292 ipcl_globalhash_remove(conn_t *connp)
2293 {
2294 struct connf_s *connfp;
2295
2296 /*
2297 * We were never inserted in the global multi list.
2298 * IPCL_NONE variety is never inserted in the global multilist
2299 * since it is presumed to not need any cleanup and is transient.
2300 */
2301 if (connp->conn_g_fanout == NULL)
2302 return;
2303
2304 connfp = connp->conn_g_fanout;
2305 mutex_enter(&connfp->connf_lock);
2306 if (connp->conn_g_prev != NULL)
2307 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 else
2309 connfp->connf_head = connp->conn_g_next;
2310 if (connp->conn_g_next != NULL)
2311 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 mutex_exit(&connfp->connf_lock);
2313
2314 /* Better to stumble on a null pointer than to corrupt memory */
2315 connp->conn_g_next = NULL;
2316 connp->conn_g_prev = NULL;
2317 connp->conn_g_fanout = NULL;
2318 }
2319
2320 /*
2321 * Walk the list of all conn_t's in the system, calling the function provided
2322 * With the specified argument for each.
2323 * Applies to both IPv4 and IPv6.
2324 *
2325 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2326 * conn_oper_pending_ill). To guard against stale pointers
2327 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2328 * unplumbed or removed. New conn_t's that are created while we are walking
2329 * may be missed by this walk, because they are not necessarily inserted
2330 * at the tail of the list. They are new conn_t's and thus don't have any
2331 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2332 * is created to the struct that is going away.
2333 */
2334 void
ipcl_walk(pfv_t func,void * arg,ip_stack_t * ipst)2335 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2336 {
2337 int i;
2338 conn_t *connp;
2339 conn_t *prev_connp;
2340
2341 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2342 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2343 prev_connp = NULL;
2344 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2345 while (connp != NULL) {
2346 mutex_enter(&connp->conn_lock);
2347 if (connp->conn_state_flags &
2348 (CONN_CONDEMNED | CONN_INCIPIENT)) {
2349 mutex_exit(&connp->conn_lock);
2350 connp = connp->conn_g_next;
2351 continue;
2352 }
2353 CONN_INC_REF_LOCKED(connp);
2354 mutex_exit(&connp->conn_lock);
2355 mutex_exit(
2356 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357 (*func)(connp, arg);
2358 if (prev_connp != NULL)
2359 CONN_DEC_REF(prev_connp);
2360 mutex_enter(
2361 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2362 prev_connp = connp;
2363 connp = connp->conn_g_next;
2364 }
2365 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2366 if (prev_connp != NULL)
2367 CONN_DEC_REF(prev_connp);
2368 }
2369 }
2370
2371 /*
2372 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2374 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2375 * (peer tcp in ESTABLISHED state).
2376 */
2377 conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t * connp,ipha_t * ipha,tcpha_t * tcpha,ip_stack_t * ipst)2378 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379 ip_stack_t *ipst)
2380 {
2381 uint32_t ports;
2382 uint16_t *pports = (uint16_t *)&ports;
2383 connf_t *connfp;
2384 conn_t *tconnp;
2385 boolean_t zone_chk;
2386
2387 /*
2388 * If either the source of destination address is loopback, then
2389 * both endpoints must be in the same Zone. Otherwise, both of
2390 * the addresses are system-wide unique (tcp is in ESTABLISHED
2391 * state) and the endpoints may reside in different Zones.
2392 */
2393 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394 ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395
2396 pports[0] = tcpha->tha_fport;
2397 pports[1] = tcpha->tha_lport;
2398
2399 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400 ports, ipst)];
2401
2402 mutex_enter(&connfp->connf_lock);
2403 for (tconnp = connfp->connf_head; tconnp != NULL;
2404 tconnp = tconnp->conn_next) {
2405
2406 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407 ipha->ipha_dst, ipha->ipha_src, ports) &&
2408 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410
2411 ASSERT(tconnp != connp);
2412 CONN_INC_REF(tconnp);
2413 mutex_exit(&connfp->connf_lock);
2414 return (tconnp);
2415 }
2416 }
2417 mutex_exit(&connfp->connf_lock);
2418 return (NULL);
2419 }
2420
2421 /*
2422 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2424 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2425 * (peer tcp in ESTABLISHED state).
2426 */
2427 conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t * connp,ip6_t * ip6h,tcpha_t * tcpha,ip_stack_t * ipst)2428 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429 ip_stack_t *ipst)
2430 {
2431 uint32_t ports;
2432 uint16_t *pports = (uint16_t *)&ports;
2433 connf_t *connfp;
2434 conn_t *tconnp;
2435 boolean_t zone_chk;
2436
2437 /*
2438 * If either the source of destination address is loopback, then
2439 * both endpoints must be in the same Zone. Otherwise, both of
2440 * the addresses are system-wide unique (tcp is in ESTABLISHED
2441 * state) and the endpoints may reside in different Zones. We
2442 * don't do Zone check for link local address(es) because the
2443 * current Zone implementation treats each link local address as
2444 * being unique per system node, i.e. they belong to global Zone.
2445 */
2446 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448
2449 pports[0] = tcpha->tha_fport;
2450 pports[1] = tcpha->tha_lport;
2451
2452 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453 ports, ipst)];
2454
2455 mutex_enter(&connfp->connf_lock);
2456 for (tconnp = connfp->connf_head; tconnp != NULL;
2457 tconnp = tconnp->conn_next) {
2458
2459 /* We skip conn_bound_if check here as this is loopback tcp */
2460 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464
2465 ASSERT(tconnp != connp);
2466 CONN_INC_REF(tconnp);
2467 mutex_exit(&connfp->connf_lock);
2468 return (tconnp);
2469 }
2470 }
2471 mutex_exit(&connfp->connf_lock);
2472 return (NULL);
2473 }
2474
2475 /*
2476 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478 * Only checks for connected entries i.e. no INADDR_ANY checks.
2479 */
2480 conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t * ipha,tcpha_t * tcpha,int min_state,ip_stack_t * ipst)2481 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482 ip_stack_t *ipst)
2483 {
2484 uint32_t ports;
2485 uint16_t *pports;
2486 connf_t *connfp;
2487 conn_t *tconnp;
2488
2489 pports = (uint16_t *)&ports;
2490 pports[0] = tcpha->tha_fport;
2491 pports[1] = tcpha->tha_lport;
2492
2493 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 ports, ipst)];
2495
2496 mutex_enter(&connfp->connf_lock);
2497 for (tconnp = connfp->connf_head; tconnp != NULL;
2498 tconnp = tconnp->conn_next) {
2499
2500 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 tconnp->conn_tcp->tcp_state >= min_state) {
2503
2504 CONN_INC_REF(tconnp);
2505 mutex_exit(&connfp->connf_lock);
2506 return (tconnp);
2507 }
2508 }
2509 mutex_exit(&connfp->connf_lock);
2510 return (NULL);
2511 }
2512
2513 /*
2514 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516 * Only checks for connected entries i.e. no INADDR_ANY checks.
2517 * Match on ifindex in addition to addresses.
2518 */
2519 conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t * ip6h,tcpha_t * tcpha,int min_state,uint_t ifindex,ip_stack_t * ipst)2520 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521 uint_t ifindex, ip_stack_t *ipst)
2522 {
2523 tcp_t *tcp;
2524 uint32_t ports;
2525 uint16_t *pports;
2526 connf_t *connfp;
2527 conn_t *tconnp;
2528
2529 pports = (uint16_t *)&ports;
2530 pports[0] = tcpha->tha_fport;
2531 pports[1] = tcpha->tha_lport;
2532
2533 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534 ports, ipst)];
2535
2536 mutex_enter(&connfp->connf_lock);
2537 for (tconnp = connfp->connf_head; tconnp != NULL;
2538 tconnp = tconnp->conn_next) {
2539
2540 tcp = tconnp->conn_tcp;
2541 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543 tcp->tcp_state >= min_state &&
2544 (tconnp->conn_bound_if == 0 ||
2545 tconnp->conn_bound_if == ifindex)) {
2546
2547 CONN_INC_REF(tconnp);
2548 mutex_exit(&connfp->connf_lock);
2549 return (tconnp);
2550 }
2551 }
2552 mutex_exit(&connfp->connf_lock);
2553 return (NULL);
2554 }
2555
2556 /*
2557 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558 * a listener when changing state.
2559 */
2560 conn_t *
ipcl_lookup_listener_v4(uint16_t lport,ipaddr_t laddr,zoneid_t zoneid,ip_stack_t * ipst)2561 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562 ip_stack_t *ipst)
2563 {
2564 connf_t *bind_connfp;
2565 conn_t *connp;
2566 tcp_t *tcp;
2567
2568 /*
2569 * Avoid false matches for packets sent to an IP destination of
2570 * all zeros.
2571 */
2572 if (laddr == 0)
2573 return (NULL);
2574
2575 ASSERT(zoneid != ALL_ZONES);
2576
2577 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 mutex_enter(&bind_connfp->connf_lock);
2579 for (connp = bind_connfp->connf_head; connp != NULL;
2580 connp = connp->conn_next) {
2581 tcp = connp->conn_tcp;
2582 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 IPCL_ZONE_MATCH(connp, zoneid) &&
2584 (tcp->tcp_listener == NULL)) {
2585 CONN_INC_REF(connp);
2586 mutex_exit(&bind_connfp->connf_lock);
2587 return (connp);
2588 }
2589 }
2590 mutex_exit(&bind_connfp->connf_lock);
2591 return (NULL);
2592 }
2593
2594 /*
2595 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596 * a listener when changing state.
2597 */
2598 conn_t *
ipcl_lookup_listener_v6(uint16_t lport,in6_addr_t * laddr,uint_t ifindex,zoneid_t zoneid,ip_stack_t * ipst)2599 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600 zoneid_t zoneid, ip_stack_t *ipst)
2601 {
2602 connf_t *bind_connfp;
2603 conn_t *connp = NULL;
2604 tcp_t *tcp;
2605
2606 /*
2607 * Avoid false matches for packets sent to an IP destination of
2608 * all zeros.
2609 */
2610 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611 return (NULL);
2612
2613 ASSERT(zoneid != ALL_ZONES);
2614
2615 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616 mutex_enter(&bind_connfp->connf_lock);
2617 for (connp = bind_connfp->connf_head; connp != NULL;
2618 connp = connp->conn_next) {
2619 tcp = connp->conn_tcp;
2620 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621 IPCL_ZONE_MATCH(connp, zoneid) &&
2622 (connp->conn_bound_if == 0 ||
2623 connp->conn_bound_if == ifindex) &&
2624 tcp->tcp_listener == NULL) {
2625 CONN_INC_REF(connp);
2626 mutex_exit(&bind_connfp->connf_lock);
2627 return (connp);
2628 }
2629 }
2630 mutex_exit(&bind_connfp->connf_lock);
2631 return (NULL);
2632 }
2633
2634 /*
2635 * ipcl_get_next_conn
2636 * get the next entry in the conn global list
2637 * and put a reference on the next_conn.
2638 * decrement the reference on the current conn.
2639 *
2640 * This is an iterator based walker function that also provides for
2641 * some selection by the caller. It walks through the conn_hash bucket
2642 * searching for the next valid connp in the list, and selects connections
2643 * that are neither closed nor condemned. It also REFHOLDS the conn
2644 * thus ensuring that the conn exists when the caller uses the conn.
2645 */
2646 conn_t *
ipcl_get_next_conn(connf_t * connfp,conn_t * connp,uint32_t conn_flags)2647 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2648 {
2649 conn_t *next_connp;
2650
2651 if (connfp == NULL)
2652 return (NULL);
2653
2654 mutex_enter(&connfp->connf_lock);
2655
2656 next_connp = (connp == NULL) ?
2657 connfp->connf_head : connp->conn_g_next;
2658
2659 while (next_connp != NULL) {
2660 mutex_enter(&next_connp->conn_lock);
2661 if (!(next_connp->conn_flags & conn_flags) ||
2662 (next_connp->conn_state_flags &
2663 (CONN_CONDEMNED | CONN_INCIPIENT))) {
2664 /*
2665 * This conn has been condemned or
2666 * is closing, or the flags don't match
2667 */
2668 mutex_exit(&next_connp->conn_lock);
2669 next_connp = next_connp->conn_g_next;
2670 continue;
2671 }
2672 CONN_INC_REF_LOCKED(next_connp);
2673 mutex_exit(&next_connp->conn_lock);
2674 break;
2675 }
2676
2677 mutex_exit(&connfp->connf_lock);
2678
2679 if (connp != NULL)
2680 CONN_DEC_REF(connp);
2681
2682 return (next_connp);
2683 }
2684
2685 #ifdef CONN_DEBUG
2686 /*
2687 * Trace of the last NBUF refhold/refrele
2688 */
2689 int
conn_trace_ref(conn_t * connp)2690 conn_trace_ref(conn_t *connp)
2691 {
2692 int last;
2693 conn_trace_t *ctb;
2694
2695 ASSERT(MUTEX_HELD(&connp->conn_lock));
2696 last = connp->conn_trace_last;
2697 last++;
2698 if (last == CONN_TRACE_MAX)
2699 last = 0;
2700
2701 ctb = &connp->conn_trace_buf[last];
2702 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703 connp->conn_trace_last = last;
2704 return (1);
2705 }
2706
2707 int
conn_untrace_ref(conn_t * connp)2708 conn_untrace_ref(conn_t *connp)
2709 {
2710 int last;
2711 conn_trace_t *ctb;
2712
2713 ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 last = connp->conn_trace_last;
2715 last++;
2716 if (last == CONN_TRACE_MAX)
2717 last = 0;
2718
2719 ctb = &connp->conn_trace_buf[last];
2720 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 connp->conn_trace_last = last;
2722 return (1);
2723 }
2724 #endif
2725