1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
A reference is incremented when connection 50 * is placed in a list or table, when incoming packet for the connection arrives 51 * and when connection is processed via squeue (squeue processing may be 52 * asynchronous and the reference protects the connection from being destroyed 53 * before its processing is finished). 54 * 55 * send and receive functions are currently used for TCP only. The send function 56 * determines the IP entry point for the packet once it leaves TCP to be sent to 57 * the destination address. The receive function is used by IP when the packet 58 * should be passed for TCP processing. When a new connection is created these 59 * are set to ip_output() and tcp_input() respectively. During the lifetime of 60 * the connection the send and receive functions may change depending on the 61 * changes in the connection state. For example, Once the connection is bound to 62 * an addresse, the receive function for this connection is set to 63 * tcp_conn_request(). This allows incoming SYNs to go directly into the 64 * listener SYN processing function without going to tcp_input() first. 65 * 66 * Classifier uses several hash tables: 67 * 68 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 69 * ipcl_bind_fanout: contains all connections in BOUND state 70 * ipcl_proto_fanout: IPv4 protocol fanout 71 * ipcl_proto_fanout_v6: IPv6 protocol fanout 72 * ipcl_udp_fanout: contains all UDP connections 73 * ipcl_globalhash_fanout: contains all connections 74 * 75 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 76 * which need to view all existing connections. 77 * 78 * All tables are protected by per-bucket locks. When both per-bucket lock and 79 * connection lock need to be held, the per-bucket lock should be acquired 80 * first, followed by the connection lock. 81 * 82 * All functions doing search in one of these tables increment a reference 83 * counter on the connection found (if any). 
This reference should be dropped 84 * when the caller has finished processing the connection. 85 * 86 * 87 * INTERFACES: 88 * =========== 89 * 90 * Connection Lookup: 91 * ------------------ 92 * 93 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) 94 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) 95 * 96 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if 97 * it can't find any associated connection. If the connection is found, its 98 * reference counter is incremented. 99 * 100 * mp: mblock, containing packet header. The full header should fit 101 * into a single mblock. It should also contain at least full IP 102 * and TCP or UDP header. 103 * 104 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 105 * 106 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 107 * the packet. 108 * 109 * zoneid: The zone in which the returned connection must be; the zoneid 110 * corresponding to the ire_zoneid on the IRE located for the 111 * packet's destination address. 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packets belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 
130 * 131 * - If it's a multilevel port, then only the zone registered to receive 132 * packets on that port matches. 133 * 134 * Also, in a labeled environment, packet labels need to be checked. For fully 135 * bound TCP connections, we can assume that the packet label was checked 136 * during connection establishment, and doesn't need to be checked on each 137 * packet. For others, though, we need to check for strict equality or, for 138 * multilevel ports, membership in the range or set. This part currently does 139 * a tnrh lookup on each packet, but could be optimized to use cached results 140 * if that were necessary. (SCTP doesn't come through here, but if it did, 141 * we would apply the same rules as TCP.) 142 * 143 * An implication of the above is that fully-bound TCP sockets must always use 144 * distinct 4-tuples; they can't be discriminated by label alone. 145 * 146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets, 147 * as there's no connection set-up handshake and no shared state. 148 * 149 * Labels on looped-back packets within a single zone do not need to be 150 * checked, as all processes in the same zone have the same label. 151 * 152 * Finally, for unlabeled packets received by a labeled system, special rules 153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a 154 * socket in the zone whose label matches the default label of the sender, if 155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the 156 * receiver's label must dominate the sender's default label. 157 * 158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack); 159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, 160 * ip_stack); 161 * 162 * Lookup routine to find a exact match for {src, dst, local port, 163 * remote port) for TCP connections in ipcl_conn_fanout. The address and 164 * ports are read from the IP and TCP header respectively. 
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
219 * 220 * Connection Creation/Destruction 221 * ------------------------------- 222 * 223 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 224 * 225 * Creates a new conn based on the type flag, inserts it into 226 * globalhash table. 227 * 228 * type: This flag determines the type of conn_t which needs to be 229 * created i.e., which kmem_cache it comes from. 230 * IPCL_TCPCONN indicates a TCP connection 231 * IPCL_SCTPCONN indicates a SCTP connection 232 * IPCL_UDPCONN indicates a UDP conn_t. 233 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 234 * IPCL_RTSCONN indicates a RTS conn_t. 235 * IPCL_IPCCONN indicates all other connections. 236 * 237 * void ipcl_conn_destroy(connp) 238 * 239 * Destroys the connection state, removes it from the global 240 * connection hash table and frees its memory. 241 */ 242 243 #include <sys/types.h> 244 #include <sys/stream.h> 245 #include <sys/stropts.h> 246 #include <sys/sysmacros.h> 247 #include <sys/strsubr.h> 248 #include <sys/strsun.h> 249 #define _SUN_TPI_VERSION 2 250 #include <sys/ddi.h> 251 #include <sys/cmn_err.h> 252 #include <sys/debug.h> 253 254 #include <sys/systm.h> 255 #include <sys/param.h> 256 #include <sys/kmem.h> 257 #include <sys/isa_defs.h> 258 #include <inet/common.h> 259 #include <netinet/ip6.h> 260 #include <netinet/icmp6.h> 261 262 #include <inet/ip.h> 263 #include <inet/ip6.h> 264 #include <inet/ip_ndp.h> 265 #include <inet/ip_impl.h> 266 #include <inet/udp_impl.h> 267 #include <inet/sctp_ip.h> 268 #include <inet/sctp/sctp_impl.h> 269 #include <inet/rawip_impl.h> 270 #include <inet/rts_impl.h> 271 272 #include <sys/cpuvar.h> 273 274 #include <inet/ipclassifier.h> 275 #include <inet/tcp.h> 276 #include <inet/ipsec_impl.h> 277 278 #include <sys/tsol/tnet.h> 279 #include <sys/sockio.h> 280 281 #ifdef DEBUG 282 #define IPCL_DEBUG 283 #else 284 #undef IPCL_DEBUG 285 #endif 286 287 #ifdef IPCL_DEBUG 288 int ipcl_debug_level = 0; 289 #define IPCL_DEBUG_LVL(level, args) \ 290 if (ipcl_debug_level & 
level) { printf args; } 291 #else 292 #define IPCL_DEBUG_LVL(level, args) {; } 293 #endif 294 /* Old value for compatibility. Setable in /etc/system */ 295 uint_t tcp_conn_hash_size = 0; 296 297 /* New value. Zero means choose automatically. Setable in /etc/system */ 298 uint_t ipcl_conn_hash_size = 0; 299 uint_t ipcl_conn_hash_memfactor = 8192; 300 uint_t ipcl_conn_hash_maxsize = 82500; 301 302 /* bind/udp fanout table size */ 303 uint_t ipcl_bind_fanout_size = 512; 304 uint_t ipcl_udp_fanout_size = 16384; 305 306 /* Raw socket fanout size. Must be a power of 2. */ 307 uint_t ipcl_raw_fanout_size = 256; 308 309 /* 310 * Power of 2^N Primes useful for hashing for N of 0-28, 311 * these primes are the nearest prime <= 2^N - 2^(N-2). 312 */ 313 314 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 315 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 316 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 317 50331599, 100663291, 201326557, 0} 318 319 /* 320 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 321 * are aligned on cache lines. 
322 */ 323 typedef union itc_s { 324 conn_t itc_conn; 325 char itcu_filler[CACHE_ALIGN(conn_s)]; 326 } itc_t; 327 328 struct kmem_cache *tcp_conn_cache; 329 struct kmem_cache *ip_conn_cache; 330 struct kmem_cache *ip_helper_stream_cache; 331 extern struct kmem_cache *sctp_conn_cache; 332 extern struct kmem_cache *tcp_sack_info_cache; 333 extern struct kmem_cache *tcp_iphc_cache; 334 struct kmem_cache *udp_conn_cache; 335 struct kmem_cache *rawip_conn_cache; 336 struct kmem_cache *rts_conn_cache; 337 338 extern void tcp_timermp_free(tcp_t *); 339 extern mblk_t *tcp_timermp_alloc(int); 340 341 static int ip_conn_constructor(void *, void *, int); 342 static void ip_conn_destructor(void *, void *); 343 344 static int tcp_conn_constructor(void *, void *, int); 345 static void tcp_conn_destructor(void *, void *); 346 347 static int udp_conn_constructor(void *, void *, int); 348 static void udp_conn_destructor(void *, void *); 349 350 static int rawip_conn_constructor(void *, void *, int); 351 static void rawip_conn_destructor(void *, void *); 352 353 static int rts_conn_constructor(void *, void *, int); 354 static void rts_conn_destructor(void *, void *); 355 356 static int ip_helper_stream_constructor(void *, void *, int); 357 static void ip_helper_stream_destructor(void *, void *); 358 359 boolean_t ip_use_helper_cache = B_TRUE; 360 361 /* 362 * Hook functions to enable cluster networking 363 * On non-clustered systems these vectors must always be NULL. 
364 */ 365 extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, 366 uint8_t *, in_port_t, void *); 367 extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, 368 uint8_t *, in_port_t, void *); 369 370 #ifdef IPCL_DEBUG 371 #define INET_NTOA_BUFSIZE 18 372 373 static char * 374 inet_ntoa_r(uint32_t in, char *b) 375 { 376 unsigned char *p; 377 378 p = (unsigned char *)∈ 379 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 380 return (b); 381 } 382 #endif 383 384 /* 385 * Global (for all stack instances) init routine 386 */ 387 void 388 ipcl_g_init(void) 389 { 390 ip_conn_cache = kmem_cache_create("ip_conn_cache", 391 sizeof (conn_t), CACHE_ALIGN_SIZE, 392 ip_conn_constructor, ip_conn_destructor, 393 NULL, NULL, NULL, 0); 394 395 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 396 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 397 tcp_conn_constructor, tcp_conn_destructor, 398 NULL, NULL, NULL, 0); 399 400 udp_conn_cache = kmem_cache_create("udp_conn_cache", 401 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 402 udp_conn_constructor, udp_conn_destructor, 403 NULL, NULL, NULL, 0); 404 405 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 406 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 407 rawip_conn_constructor, rawip_conn_destructor, 408 NULL, NULL, NULL, 0); 409 410 rts_conn_cache = kmem_cache_create("rts_conn_cache", 411 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 412 rts_conn_constructor, rts_conn_destructor, 413 NULL, NULL, NULL, 0); 414 415 if (ip_use_helper_cache) { 416 ip_helper_stream_cache = kmem_cache_create 417 ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), 418 CACHE_ALIGN_SIZE, ip_helper_stream_constructor, 419 ip_helper_stream_destructor, NULL, NULL, NULL, 0); 420 } else { 421 ip_helper_stream_cache = NULL; 422 } 423 } 424 425 /* 426 * ipclassifier intialization routine, sets up hash tables. 
427 */ 428 void 429 ipcl_init(ip_stack_t *ipst) 430 { 431 int i; 432 int sizes[] = P2Ps(); 433 434 /* 435 * Calculate size of conn fanout table from /etc/system settings 436 */ 437 if (ipcl_conn_hash_size != 0) { 438 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 439 } else if (tcp_conn_hash_size != 0) { 440 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 441 } else { 442 extern pgcnt_t freemem; 443 444 ipst->ips_ipcl_conn_fanout_size = 445 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 446 447 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 448 ipst->ips_ipcl_conn_fanout_size = 449 ipcl_conn_hash_maxsize; 450 } 451 } 452 453 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 454 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 455 break; 456 } 457 } 458 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 459 /* Out of range, use the 2^16 value */ 460 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 461 } 462 463 /* Take values from /etc/system */ 464 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 465 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 466 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 467 468 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 469 470 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 471 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 472 473 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 474 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 479 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 480 481 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 482 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 483 MUTEX_DEFAULT, NULL); 484 } 485 486 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 487 sizeof (connf_t), KM_SLEEP); 488 for (i = 0; i < IPPROTO_MAX; i++) { 489 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 490 
MUTEX_DEFAULT, NULL); 491 } 492 493 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 494 sizeof (connf_t), KM_SLEEP); 495 for (i = 0; i < IPPROTO_MAX; i++) { 496 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 497 MUTEX_DEFAULT, NULL); 498 } 499 500 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 501 mutex_init(&ipst->ips_rts_clients->connf_lock, 502 NULL, MUTEX_DEFAULT, NULL); 503 504 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 505 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 506 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 507 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 508 MUTEX_DEFAULT, NULL); 509 } 510 511 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 512 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 513 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 514 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 515 MUTEX_DEFAULT, NULL); 516 } 517 518 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 519 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 520 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 521 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 522 NULL, MUTEX_DEFAULT, NULL); 523 } 524 } 525 526 void 527 ipcl_g_destroy(void) 528 { 529 kmem_cache_destroy(ip_conn_cache); 530 kmem_cache_destroy(tcp_conn_cache); 531 kmem_cache_destroy(udp_conn_cache); 532 kmem_cache_destroy(rawip_conn_cache); 533 kmem_cache_destroy(rts_conn_cache); 534 } 535 536 /* 537 * All user-level and kernel use of the stack must be gone 538 * by now. 
539 */ 540 void 541 ipcl_destroy(ip_stack_t *ipst) 542 { 543 int i; 544 545 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 546 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 547 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 548 } 549 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 550 sizeof (connf_t)); 551 ipst->ips_ipcl_conn_fanout = NULL; 552 553 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 554 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 555 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 556 } 557 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 558 sizeof (connf_t)); 559 ipst->ips_ipcl_bind_fanout = NULL; 560 561 for (i = 0; i < IPPROTO_MAX; i++) { 562 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 563 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 564 } 565 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 566 ipst->ips_ipcl_proto_fanout = NULL; 567 568 for (i = 0; i < IPPROTO_MAX; i++) { 569 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 570 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 571 } 572 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 573 IPPROTO_MAX * sizeof (connf_t)); 574 ipst->ips_ipcl_proto_fanout_v6 = NULL; 575 576 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 577 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 578 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 579 } 580 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 581 sizeof (connf_t)); 582 ipst->ips_ipcl_udp_fanout = NULL; 583 584 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 585 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 586 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 587 } 588 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 589 sizeof (connf_t)); 590 ipst->ips_ipcl_raw_fanout = NULL; 591 592 for (i 
= 0; i < CONN_G_HASH_SIZE; i++) { 593 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 594 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 595 } 596 kmem_free(ipst->ips_ipcl_globalhash_fanout, 597 sizeof (connf_t) * CONN_G_HASH_SIZE); 598 ipst->ips_ipcl_globalhash_fanout = NULL; 599 600 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 601 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 602 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 603 ipst->ips_rts_clients = NULL; 604 } 605 606 /* 607 * conn creation routine. initialize the conn, sets the reference 608 * and inserts it in the global hash table. 609 */ 610 conn_t * 611 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 612 { 613 conn_t *connp; 614 sctp_stack_t *sctps; 615 struct kmem_cache *conn_cache; 616 617 switch (type) { 618 case IPCL_SCTPCONN: 619 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 620 return (NULL); 621 sctp_conn_init(connp); 622 sctps = ns->netstack_sctp; 623 SCTP_G_Q_REFHOLD(sctps); 624 netstack_hold(ns); 625 connp->conn_netstack = ns; 626 return (connp); 627 628 case IPCL_TCPCONN: 629 conn_cache = tcp_conn_cache; 630 break; 631 632 case IPCL_UDPCONN: 633 conn_cache = udp_conn_cache; 634 break; 635 636 case IPCL_RAWIPCONN: 637 conn_cache = rawip_conn_cache; 638 break; 639 640 case IPCL_RTSCONN: 641 conn_cache = rts_conn_cache; 642 break; 643 644 case IPCL_IPCCONN: 645 conn_cache = ip_conn_cache; 646 break; 647 648 default: 649 connp = NULL; 650 ASSERT(0); 651 } 652 653 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 654 return (NULL); 655 656 connp->conn_ref = 1; 657 netstack_hold(ns); 658 connp->conn_netstack = ns; 659 ipcl_globalhash_insert(connp); 660 return (connp); 661 } 662 663 void 664 ipcl_conn_destroy(conn_t *connp) 665 { 666 mblk_t *mp; 667 netstack_t *ns = connp->conn_netstack; 668 669 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 670 ASSERT(connp->conn_ref == 0); 671 ASSERT(connp->conn_ire_cache == NULL); 672 
673 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 674 675 if (connp->conn_effective_cred != NULL) { 676 crfree(connp->conn_effective_cred); 677 connp->conn_effective_cred = NULL; 678 } 679 680 if (connp->conn_cred != NULL) { 681 crfree(connp->conn_cred); 682 connp->conn_cred = NULL; 683 } 684 685 ipcl_globalhash_remove(connp); 686 687 /* FIXME: add separate tcp_conn_free()? */ 688 if (connp->conn_flags & IPCL_TCPCONN) { 689 tcp_t *tcp = connp->conn_tcp; 690 tcp_stack_t *tcps; 691 692 ASSERT(tcp != NULL); 693 tcps = tcp->tcp_tcps; 694 if (tcps != NULL) { 695 if (connp->conn_latch != NULL) { 696 IPLATCH_REFRELE(connp->conn_latch, ns); 697 connp->conn_latch = NULL; 698 } 699 if (connp->conn_policy != NULL) { 700 IPPH_REFRELE(connp->conn_policy, ns); 701 connp->conn_policy = NULL; 702 } 703 tcp->tcp_tcps = NULL; 704 TCPS_REFRELE(tcps); 705 } 706 707 tcp_free(tcp); 708 mp = tcp->tcp_timercache; 709 tcp->tcp_cred = NULL; 710 711 if (tcp->tcp_sack_info != NULL) { 712 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 713 kmem_cache_free(tcp_sack_info_cache, 714 tcp->tcp_sack_info); 715 } 716 if (tcp->tcp_iphc != NULL) { 717 if (tcp->tcp_hdr_grown) { 718 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 719 } else { 720 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 721 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 722 } 723 tcp->tcp_iphc_len = 0; 724 } 725 ASSERT(tcp->tcp_iphc_len == 0); 726 727 /* 728 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 729 * the mblk. 
730 */ 731 if (tcp->tcp_rsrv_mp != NULL) { 732 freeb(tcp->tcp_rsrv_mp); 733 tcp->tcp_rsrv_mp = NULL; 734 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 735 } 736 737 ASSERT(connp->conn_latch == NULL); 738 ASSERT(connp->conn_policy == NULL); 739 740 if (ns != NULL) { 741 ASSERT(tcp->tcp_tcps == NULL); 742 connp->conn_netstack = NULL; 743 netstack_rele(ns); 744 } 745 746 ipcl_conn_cleanup(connp); 747 connp->conn_flags = IPCL_TCPCONN; 748 bzero(tcp, sizeof (tcp_t)); 749 750 tcp->tcp_timercache = mp; 751 tcp->tcp_connp = connp; 752 kmem_cache_free(tcp_conn_cache, connp); 753 return; 754 } 755 if (connp->conn_latch != NULL) { 756 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 757 connp->conn_latch = NULL; 758 } 759 if (connp->conn_policy != NULL) { 760 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 761 connp->conn_policy = NULL; 762 } 763 if (connp->conn_ipsec_opt_mp != NULL) { 764 freemsg(connp->conn_ipsec_opt_mp); 765 connp->conn_ipsec_opt_mp = NULL; 766 } 767 768 if (connp->conn_flags & IPCL_SCTPCONN) { 769 ASSERT(ns != NULL); 770 sctp_free(connp); 771 return; 772 } 773 774 if (ns != NULL) { 775 connp->conn_netstack = NULL; 776 netstack_rele(ns); 777 } 778 779 ipcl_conn_cleanup(connp); 780 781 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 782 if (connp->conn_flags & IPCL_UDPCONN) { 783 connp->conn_flags = IPCL_UDPCONN; 784 kmem_cache_free(udp_conn_cache, connp); 785 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 786 787 connp->conn_flags = IPCL_RAWIPCONN; 788 connp->conn_ulp = IPPROTO_ICMP; 789 kmem_cache_free(rawip_conn_cache, connp); 790 } else if (connp->conn_flags & IPCL_RTSCONN) { 791 connp->conn_flags = IPCL_RTSCONN; 792 kmem_cache_free(rts_conn_cache, connp); 793 } else { 794 connp->conn_flags = IPCL_IPCCONN; 795 ASSERT(connp->conn_flags & IPCL_IPCCONN); 796 ASSERT(connp->conn_priv == NULL); 797 kmem_cache_free(ip_conn_cache, connp); 798 } 799 } 800 801 /* 802 * Running in cluster mode - deregister listener information 803 */ 804 805 static void 806 ipcl_conn_unlisten(conn_t *connp) 807 { 808 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 809 ASSERT(connp->conn_lport != 0); 810 811 if (cl_inet_unlisten != NULL) { 812 sa_family_t addr_family; 813 uint8_t *laddrp; 814 815 if (connp->conn_pkt_isv6) { 816 addr_family = AF_INET6; 817 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 818 } else { 819 addr_family = AF_INET; 820 laddrp = (uint8_t *)&connp->conn_bound_source; 821 } 822 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 823 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 824 } 825 connp->conn_flags &= ~IPCL_CL_LISTENER; 826 } 827 828 /* 829 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 830 * which table the conn belonged to). So for debugging we can see which hash 831 * table this connection was in. 
832 */ 833 #define IPCL_HASH_REMOVE(connp) { \ 834 connf_t *connfp = (connp)->conn_fanout; \ 835 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 836 if (connfp != NULL) { \ 837 IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ 838 (void *)(connp))); \ 839 mutex_enter(&connfp->connf_lock); \ 840 if ((connp)->conn_next != NULL) \ 841 (connp)->conn_next->conn_prev = \ 842 (connp)->conn_prev; \ 843 if ((connp)->conn_prev != NULL) \ 844 (connp)->conn_prev->conn_next = \ 845 (connp)->conn_next; \ 846 else \ 847 connfp->connf_head = (connp)->conn_next; \ 848 (connp)->conn_fanout = NULL; \ 849 (connp)->conn_next = NULL; \ 850 (connp)->conn_prev = NULL; \ 851 (connp)->conn_flags |= IPCL_REMOVED; \ 852 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 853 ipcl_conn_unlisten((connp)); \ 854 CONN_DEC_REF((connp)); \ 855 mutex_exit(&connfp->connf_lock); \ 856 } \ 857 } 858 859 void 860 ipcl_hash_remove(conn_t *connp) 861 { 862 IPCL_HASH_REMOVE(connp); 863 } 864 865 /* 866 * The whole purpose of this function is allow removal of 867 * a conn_t from the connected hash for timewait reclaim. 868 * This is essentially a TW reclaim fastpath where timewait 869 * collector checks under fanout lock (so no one else can 870 * get access to the conn_t) that refcnt is 2 i.e. one for 871 * TCP and one for the classifier hash list. If ref count 872 * is indeed 2, we can just remove the conn under lock and 873 * avoid cleaning up the conn under squeue. This gives us 874 * improved performance. 
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		/* connp was at the head of its fanout bucket. */
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * Drop the reference the fanout table held on the connection.
	 * NOTE(review): assumes exactly one table ref plus the caller's
	 * ref are outstanding at this point -- confirm against callers.
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}

/*
 * Insert connp at the head of the connected fanout bucket.  The bucket
 * lock must already be held; a reference is taken for the table and the
 * connection is marked IPCL_CONNECTED.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

/*
 * Remove connp from whatever table it is currently on, then insert it
 * into the connected fanout bucket under the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a bound connection.  The walk skips entries until the first
 * v4 wildcard (match-any) entry so that specifically-bound connections
 * stay ahead of wildcard ones in the bucket; connp is linked in just
 * before that first wildcard entry (or at the tail).
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard (unbound-address) connection.  A v4-mapped wildcard
 * is placed ahead of the first IPv6-unspecified entry of the same zone;
 * otherwise connp goes at the tail of the bucket.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD for callers that
 * cannot use the macro directly.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * Insert connp into the IPv4 protocol fanout for `protocol'.  Only AH and
 * ESP may be inserted by MAC-exempt connections.
 */
void
ipcl_proto_insert(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * IPv6 counterpart of ipcl_proto_insert().
 */
void
ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the v6 protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * No conflict: classify the socket as connected, bound or
	 * wildcard based on its remote and local addresses.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}

/*
 * Check for a MAC exemption conflict on a labeled system.  Note that for
 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
 * transport layer.  This check is for binding all other protocols.
 *
 * Returns true if there's a conflict.
1080 */ 1081 static boolean_t 1082 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1083 { 1084 connf_t *connfp; 1085 conn_t *tconn; 1086 1087 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1088 mutex_enter(&connfp->connf_lock); 1089 for (tconn = connfp->connf_head; tconn != NULL; 1090 tconn = tconn->conn_next) { 1091 /* We don't allow v4 fallback for v6 raw socket */ 1092 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1093 continue; 1094 /* If neither is exempt, then there's no conflict */ 1095 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1096 continue; 1097 /* We are only concerned about sockets for a different zone */ 1098 if (connp->conn_zoneid == tconn->conn_zoneid) 1099 continue; 1100 /* If both are bound to different specific addrs, ok */ 1101 if (connp->conn_src != INADDR_ANY && 1102 tconn->conn_src != INADDR_ANY && 1103 connp->conn_src != tconn->conn_src) 1104 continue; 1105 /* These two conflict; fail */ 1106 break; 1107 } 1108 mutex_exit(&connfp->connf_lock); 1109 return (tconn != NULL); 1110 } 1111 1112 static boolean_t 1113 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1114 { 1115 connf_t *connfp; 1116 conn_t *tconn; 1117 1118 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1119 mutex_enter(&connfp->connf_lock); 1120 for (tconn = connfp->connf_head; tconn != NULL; 1121 tconn = tconn->conn_next) { 1122 /* We don't allow v4 fallback for v6 raw socket */ 1123 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1124 continue; 1125 /* If neither is exempt, then there's no conflict */ 1126 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1127 continue; 1128 /* We are only concerned about sockets for a different zone */ 1129 if (connp->conn_zoneid == tconn->conn_zoneid) 1130 continue; 1131 /* If both are bound to different addrs, ok */ 1132 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1133 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1134 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, 
&tconn->conn_srcv6)) 1135 continue; 1136 /* These two conflict; fail */ 1137 break; 1138 } 1139 mutex_exit(&connfp->connf_lock); 1140 return (tconn != NULL); 1141 } 1142 1143 /* 1144 * (v4, v6) bind hash insertion routines 1145 */ 1146 int 1147 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) 1148 { 1149 connf_t *connfp; 1150 #ifdef IPCL_DEBUG 1151 char buf[INET_NTOA_BUFSIZE]; 1152 #endif 1153 int ret = 0; 1154 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1155 1156 ASSERT(connp); 1157 1158 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1159 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1160 1161 connp->conn_ulp = protocol; 1162 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1163 connp->conn_lport = lport; 1164 1165 switch (protocol) { 1166 default: 1167 if (is_system_labeled() && 1168 check_exempt_conflict_v4(connp, ipst)) 1169 return (EADDRINUSE); 1170 /* FALLTHROUGH */ 1171 case IPPROTO_UDP: 1172 if (protocol == IPPROTO_UDP) { 1173 IPCL_DEBUG_LVL(64, 1174 ("ipcl_bind_insert: connp %p - udp\n", 1175 (void *)connp)); 1176 connfp = &ipst->ips_ipcl_udp_fanout[ 1177 IPCL_UDP_HASH(lport, ipst)]; 1178 } else { 1179 IPCL_DEBUG_LVL(64, 1180 ("ipcl_bind_insert: connp %p - protocol\n", 1181 (void *)connp)); 1182 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1183 } 1184 1185 if (connp->conn_rem != INADDR_ANY) { 1186 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1187 } else if (connp->conn_src != INADDR_ANY) { 1188 IPCL_HASH_INSERT_BOUND(connfp, connp); 1189 } else { 1190 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1191 } 1192 break; 1193 1194 case IPPROTO_TCP: 1195 1196 /* Insert it in the Bind Hash */ 1197 ASSERT(connp->conn_zoneid != ALL_ZONES); 1198 connfp = &ipst->ips_ipcl_bind_fanout[ 1199 IPCL_BIND_HASH(lport, ipst)]; 1200 if (connp->conn_src != INADDR_ANY) { 1201 IPCL_HASH_INSERT_BOUND(connfp, connp); 1202 } else { 1203 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1204 } 1205 if (cl_inet_listen != 
NULL) { 1206 ASSERT(!connp->conn_pkt_isv6); 1207 connp->conn_flags |= IPCL_CL_LISTENER; 1208 (*cl_inet_listen)( 1209 connp->conn_netstack->netstack_stackid, 1210 IPPROTO_TCP, AF_INET, 1211 (uint8_t *)&connp->conn_bound_source, lport, NULL); 1212 } 1213 break; 1214 1215 case IPPROTO_SCTP: 1216 ret = ipcl_sctp_hash_insert(connp, lport); 1217 break; 1218 } 1219 1220 return (ret); 1221 } 1222 1223 int 1224 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1225 uint16_t lport) 1226 { 1227 connf_t *connfp; 1228 int ret = 0; 1229 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1230 1231 ASSERT(connp); 1232 1233 connp->conn_ulp = protocol; 1234 connp->conn_srcv6 = *src; 1235 connp->conn_lport = lport; 1236 1237 switch (protocol) { 1238 default: 1239 if (is_system_labeled() && 1240 check_exempt_conflict_v6(connp, ipst)) 1241 return (EADDRINUSE); 1242 /* FALLTHROUGH */ 1243 case IPPROTO_UDP: 1244 if (protocol == IPPROTO_UDP) { 1245 IPCL_DEBUG_LVL(128, 1246 ("ipcl_bind_insert_v6: connp %p - udp\n", 1247 (void *)connp)); 1248 connfp = &ipst->ips_ipcl_udp_fanout[ 1249 IPCL_UDP_HASH(lport, ipst)]; 1250 } else { 1251 IPCL_DEBUG_LVL(128, 1252 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1253 (void *)connp)); 1254 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1255 } 1256 1257 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1258 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1259 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1260 IPCL_HASH_INSERT_BOUND(connfp, connp); 1261 } else { 1262 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1263 } 1264 break; 1265 1266 case IPPROTO_TCP: 1267 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
 */

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the clustering subsystem of the new listener. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_pkt_isv6) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_source_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_source;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}

/*
 * ipcl_conn_hash insertion routines.
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for an eager connection, i.e. connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * The bucket lock must be dropped before
			 * IPCL_HASH_REMOVE, which takes the connection's
			 * current bucket lock itself.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

/*
 * IPv6 counterpart of ipcl_conn_insert().  ifindex additionally scopes
 * the TCP duplicate-tuple check to the interface a peer is bound to.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex) &&
			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		IPCL_HASH_REMOVE(connp);
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

/*
 * v4 packet
classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied. Labels are also checked as described
 * above. If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-qualified (connected) match. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES. In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it. Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		/* Fall back to a listener in the bind fanout. */
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(
			    tx__ip__log__info__classify__tcp,
			    char *,
			    "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			unlabeled = (crgetlabel(cr)->tsl_flags &
			    TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES. In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it. Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * IPv6 counterpart of ipcl_classify_v4(): returns the matching conn with
 * a reference held, or NULL.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		/* First try a fully-qualified (connected) match. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES. In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it. Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		/* Fall back to a listener in the bind fanout. */
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = msg_getcred(mp, NULL);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES. In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it. Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt &&
			    shared_addr)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
{
	uint16_t	hdr_len;
	ipha_t		*ipha;
	uint8_t		*nexthdrp;

	if (MBLKL(mp) < sizeof (ipha_t))
		return (NULL);

	switch (IPH_HDR_VERSION(mp->b_rptr)) {
	case IPV4_VERSION:
		ipha = (ipha_t *)mp->b_rptr;
		hdr_len = IPH_HDR_LENGTH(ipha);
		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
		    zoneid, ipst));
	case IPV6_VERSION:
		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
		    &hdr_len, &nexthdrp))
			return (NULL);

		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
	}

	return (NULL);
}

/*
 * Classify a raw (e.g. SCTP raw socket) packet: first look for a match on
 * the packet's local port, then fall back to the port-0 wildcard bucket.
 * Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = msg_getcred(mp, NULL);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES. In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it. Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully bound: match the complete 5-tuple. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Bound only: match local address and port. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt && shared_addr))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Reference returned to the caller; bucket lock still held here. */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * kmem cache constructor for TCP conn_t's: the tcp_t is co-allocated
 * immediately after the itc_t.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_ulp = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	return (0);
}

/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
}

/*
 * kmem cache constructor/destructor for plain IP conn_t's (no transport
 * structure attached).
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;

	return (0);
}

/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor/destructor for UDP conn_t's: the udp_t follows
 * the itc_t in the same allocation.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_ulp = IPPROTO_UDP;
	udp->udp_connp = connp;
	return (0);
}

/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor/destructor for raw-IP (ICMP) conn_t's.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_ulp = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	return (0);
}

/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor/destructor for routing-socket (rts) conn_t's.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	return (0);
}

/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t
*)&itc[1]; 2251 2252 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2253 ASSERT(rts->rts_connp == connp); 2254 ASSERT(connp->conn_rts == rts); 2255 mutex_destroy(&connp->conn_lock); 2256 cv_destroy(&connp->conn_cv); 2257 } 2258 2259 /* ARGSUSED */ 2260 int 2261 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) 2262 { 2263 int error; 2264 netstack_t *ns; 2265 int ret; 2266 tcp_stack_t *tcps; 2267 ip_helper_stream_info_t *ip_helper_str; 2268 ip_stack_t *ipst; 2269 2270 ns = netstack_find_by_cred(kcred); 2271 ASSERT(ns != NULL); 2272 tcps = ns->netstack_tcp; 2273 ipst = ns->netstack_ip; 2274 ASSERT(tcps != NULL); 2275 ip_helper_str = (ip_helper_stream_info_t *)buf; 2276 2277 do { 2278 error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, 2279 &ip_helper_str->iphs_handle, ipst->ips_ldi_ident); 2280 } while (error == EINTR); 2281 2282 if (error == 0) { 2283 do { 2284 error = ldi_ioctl( 2285 ip_helper_str->iphs_handle, SIOCSQPTR, 2286 (intptr_t)buf, FKIOCTL, kcred, &ret); 2287 } while (error == EINTR); 2288 2289 if (error != 0) { 2290 (void) ldi_close( 2291 ip_helper_str->iphs_handle, 0, kcred); 2292 } 2293 } 2294 2295 netstack_rele(ipst->ips_netstack); 2296 2297 return (error); 2298 } 2299 2300 /* ARGSUSED */ 2301 static void 2302 ip_helper_stream_destructor(void *buf, void *cdrarg) 2303 { 2304 ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; 2305 2306 ip_helper_str->iphs_rq->q_ptr = 2307 ip_helper_str->iphs_wq->q_ptr = 2308 ip_helper_str->iphs_minfo; 2309 (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred); 2310 } 2311 2312 2313 /* 2314 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2315 * in the conn_t. 
2316 */ 2317 void 2318 ipcl_conn_cleanup(conn_t *connp) 2319 { 2320 ASSERT(connp->conn_ire_cache == NULL); 2321 ASSERT(connp->conn_latch == NULL); 2322 #ifdef notdef 2323 ASSERT(connp->conn_rq == NULL); 2324 ASSERT(connp->conn_wq == NULL); 2325 #endif 2326 ASSERT(connp->conn_cred == NULL); 2327 ASSERT(connp->conn_g_fanout == NULL); 2328 ASSERT(connp->conn_g_next == NULL); 2329 ASSERT(connp->conn_g_prev == NULL); 2330 ASSERT(connp->conn_policy == NULL); 2331 ASSERT(connp->conn_fanout == NULL); 2332 ASSERT(connp->conn_next == NULL); 2333 ASSERT(connp->conn_prev == NULL); 2334 #ifdef notdef 2335 /* 2336 * The ill and ipif pointers are not cleared before the conn_t 2337 * goes away since they do not hold a reference on the ill/ipif. 2338 * We should replace these pointers with ifindex/ipaddr_t to 2339 * make the code less complex. 2340 */ 2341 ASSERT(connp->conn_outgoing_ill == NULL); 2342 ASSERT(connp->conn_incoming_ill == NULL); 2343 ASSERT(connp->conn_multicast_ipif == NULL); 2344 ASSERT(connp->conn_multicast_ill == NULL); 2345 #endif 2346 ASSERT(connp->conn_oper_pending_ill == NULL); 2347 ASSERT(connp->conn_ilg == NULL); 2348 ASSERT(connp->conn_drain_next == NULL); 2349 ASSERT(connp->conn_drain_prev == NULL); 2350 #ifdef notdef 2351 /* conn_idl is not cleared when removed from idl list */ 2352 ASSERT(connp->conn_idl == NULL); 2353 #endif 2354 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2355 ASSERT(connp->conn_effective_cred == NULL); 2356 ASSERT(connp->conn_netstack == NULL); 2357 2358 ASSERT(connp->conn_helper_info == NULL); 2359 /* Clear out the conn_t fields that are not preserved */ 2360 bzero(&connp->conn_start_clr, 2361 sizeof (conn_t) - 2362 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2363 } 2364 2365 /* 2366 * All conns are inserted in a global multi-list for the benefit of 2367 * walkers. The walk is guaranteed to walk all open conns at the time 2368 * of the start of the walk exactly once. 
This property is needed to 2369 * achieve some cleanups during unplumb of interfaces. This is achieved 2370 * as follows. 2371 * 2372 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2373 * call the insert and delete functions below at creation and deletion 2374 * time respectively. The conn never moves or changes its position in this 2375 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2376 * won't increase due to walkers, once the conn deletion has started. Note 2377 * that we can't remove the conn from the global list and then wait for 2378 * the refcnt to drop to zero, since walkers would then see a truncated 2379 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2380 * conns until ip_open is ready to make them globally visible. 2381 * The global round robin multi-list locks are held only to get the 2382 * next member/insertion/deletion and contention should be negligible 2383 * if the multi-list is much greater than the number of cpus. 2384 */ 2385 void 2386 ipcl_globalhash_insert(conn_t *connp) 2387 { 2388 int index; 2389 struct connf_s *connfp; 2390 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2391 2392 /* 2393 * No need for atomic here. Approximate even distribution 2394 * in the global lists is sufficient. 2395 */ 2396 ipst->ips_conn_g_index++; 2397 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2398 2399 connp->conn_g_prev = NULL; 2400 /* 2401 * Mark as INCIPIENT, so that walkers will ignore this 2402 * for now, till ip_open is ready to make it visible globally. 
2403 */ 2404 connp->conn_state_flags |= CONN_INCIPIENT; 2405 2406 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2407 /* Insert at the head of the list */ 2408 mutex_enter(&connfp->connf_lock); 2409 connp->conn_g_next = connfp->connf_head; 2410 if (connp->conn_g_next != NULL) 2411 connp->conn_g_next->conn_g_prev = connp; 2412 connfp->connf_head = connp; 2413 2414 /* The fanout bucket this conn points to */ 2415 connp->conn_g_fanout = connfp; 2416 2417 mutex_exit(&connfp->connf_lock); 2418 } 2419 2420 void 2421 ipcl_globalhash_remove(conn_t *connp) 2422 { 2423 struct connf_s *connfp; 2424 2425 /* 2426 * We were never inserted in the global multi list. 2427 * IPCL_NONE variety is never inserted in the global multilist 2428 * since it is presumed to not need any cleanup and is transient. 2429 */ 2430 if (connp->conn_g_fanout == NULL) 2431 return; 2432 2433 connfp = connp->conn_g_fanout; 2434 mutex_enter(&connfp->connf_lock); 2435 if (connp->conn_g_prev != NULL) 2436 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2437 else 2438 connfp->connf_head = connp->conn_g_next; 2439 if (connp->conn_g_next != NULL) 2440 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2441 mutex_exit(&connfp->connf_lock); 2442 2443 /* Better to stumble on a null pointer than to corrupt memory */ 2444 connp->conn_g_next = NULL; 2445 connp->conn_g_prev = NULL; 2446 connp->conn_g_fanout = NULL; 2447 } 2448 2449 /* 2450 * Walk the list of all conn_t's in the system, calling the function provided 2451 * with the specified argument for each. 2452 * Applies to both IPv4 and IPv6. 2453 * 2454 * IPCs may hold pointers to ipif/ill. To guard against stale pointers 2455 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2456 * unplumbed or removed. New conn_t's that are created while we are walking 2457 * may be missed by this walk, because they are not necessarily inserted 2458 * at the tail of the list. 
They are new conn_t's and thus don't have any 2459 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2460 * is created to the struct that is going away. 2461 */ 2462 void 2463 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2464 { 2465 int i; 2466 conn_t *connp; 2467 conn_t *prev_connp; 2468 2469 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2470 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2471 prev_connp = NULL; 2472 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2473 while (connp != NULL) { 2474 mutex_enter(&connp->conn_lock); 2475 if (connp->conn_state_flags & 2476 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2477 mutex_exit(&connp->conn_lock); 2478 connp = connp->conn_g_next; 2479 continue; 2480 } 2481 CONN_INC_REF_LOCKED(connp); 2482 mutex_exit(&connp->conn_lock); 2483 mutex_exit( 2484 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2485 (*func)(connp, arg); 2486 if (prev_connp != NULL) 2487 CONN_DEC_REF(prev_connp); 2488 mutex_enter( 2489 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2490 prev_connp = connp; 2491 connp = connp->conn_g_next; 2492 } 2493 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2494 if (prev_connp != NULL) 2495 CONN_DEC_REF(prev_connp); 2496 } 2497 } 2498 2499 /* 2500 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2501 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2502 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2503 * (peer tcp in ESTABLISHED state). 2504 */ 2505 conn_t * 2506 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, 2507 ip_stack_t *ipst) 2508 { 2509 uint32_t ports; 2510 uint16_t *pports = (uint16_t *)&ports; 2511 connf_t *connfp; 2512 conn_t *tconnp; 2513 boolean_t zone_chk; 2514 2515 /* 2516 * If either the source of destination address is loopback, then 2517 * both endpoints must be in the same Zone. 
Otherwise, both of 2518 * the addresses are system-wide unique (tcp is in ESTABLISHED 2519 * state) and the endpoints may reside in different Zones. 2520 */ 2521 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2522 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2523 2524 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2525 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2526 2527 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2528 ports, ipst)]; 2529 2530 mutex_enter(&connfp->connf_lock); 2531 for (tconnp = connfp->connf_head; tconnp != NULL; 2532 tconnp = tconnp->conn_next) { 2533 2534 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2535 ipha->ipha_dst, ipha->ipha_src, ports) && 2536 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2537 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2538 2539 ASSERT(tconnp != connp); 2540 CONN_INC_REF(tconnp); 2541 mutex_exit(&connfp->connf_lock); 2542 return (tconnp); 2543 } 2544 } 2545 mutex_exit(&connfp->connf_lock); 2546 return (NULL); 2547 } 2548 2549 /* 2550 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2551 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2552 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2553 * (peer tcp in ESTABLISHED state). 2554 */ 2555 conn_t * 2556 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, 2557 ip_stack_t *ipst) 2558 { 2559 uint32_t ports; 2560 uint16_t *pports = (uint16_t *)&ports; 2561 connf_t *connfp; 2562 conn_t *tconnp; 2563 boolean_t zone_chk; 2564 2565 /* 2566 * If either the source of destination address is loopback, then 2567 * both endpoints must be in the same Zone. Otherwise, both of 2568 * the addresses are system-wide unique (tcp is in ESTABLISHED 2569 * state) and the endpoints may reside in different Zones. 
We 2570 * don't do Zone check for link local address(es) because the 2571 * current Zone implementation treats each link local address as 2572 * being unique per system node, i.e. they belong to global Zone. 2573 */ 2574 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2575 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2576 2577 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2578 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2579 2580 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2581 ports, ipst)]; 2582 2583 mutex_enter(&connfp->connf_lock); 2584 for (tconnp = connfp->connf_head; tconnp != NULL; 2585 tconnp = tconnp->conn_next) { 2586 2587 /* We skip tcp_bound_if check here as this is loopback tcp */ 2588 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2589 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2590 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2591 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2592 2593 ASSERT(tconnp != connp); 2594 CONN_INC_REF(tconnp); 2595 mutex_exit(&connfp->connf_lock); 2596 return (tconnp); 2597 } 2598 } 2599 mutex_exit(&connfp->connf_lock); 2600 return (NULL); 2601 } 2602 2603 /* 2604 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2605 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2606 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2607 */ 2608 conn_t * 2609 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, 2610 ip_stack_t *ipst) 2611 { 2612 uint32_t ports; 2613 uint16_t *pports; 2614 connf_t *connfp; 2615 conn_t *tconnp; 2616 2617 pports = (uint16_t *)&ports; 2618 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2619 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2620 2621 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2622 ports, ipst)]; 2623 2624 mutex_enter(&connfp->connf_lock); 2625 for (tconnp = connfp->connf_head; tconnp != NULL; 2626 tconnp = tconnp->conn_next) { 2627 2628 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2629 ipha->ipha_dst, ipha->ipha_src, ports) && 2630 tconnp->conn_tcp->tcp_state >= min_state) { 2631 2632 CONN_INC_REF(tconnp); 2633 mutex_exit(&connfp->connf_lock); 2634 return (tconnp); 2635 } 2636 } 2637 mutex_exit(&connfp->connf_lock); 2638 return (NULL); 2639 } 2640 2641 /* 2642 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2643 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2644 * Only checks for connected entries i.e. no INADDR_ANY checks. 2645 * Match on ifindex in addition to addresses. 
2646 */ 2647 conn_t * 2648 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2649 uint_t ifindex, ip_stack_t *ipst) 2650 { 2651 tcp_t *tcp; 2652 uint32_t ports; 2653 uint16_t *pports; 2654 connf_t *connfp; 2655 conn_t *tconnp; 2656 2657 pports = (uint16_t *)&ports; 2658 pports[0] = tcpha->tha_fport; 2659 pports[1] = tcpha->tha_lport; 2660 2661 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2662 ports, ipst)]; 2663 2664 mutex_enter(&connfp->connf_lock); 2665 for (tconnp = connfp->connf_head; tconnp != NULL; 2666 tconnp = tconnp->conn_next) { 2667 2668 tcp = tconnp->conn_tcp; 2669 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2670 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2671 tcp->tcp_state >= min_state && 2672 (tcp->tcp_bound_if == 0 || 2673 tcp->tcp_bound_if == ifindex)) { 2674 2675 CONN_INC_REF(tconnp); 2676 mutex_exit(&connfp->connf_lock); 2677 return (tconnp); 2678 } 2679 } 2680 mutex_exit(&connfp->connf_lock); 2681 return (NULL); 2682 } 2683 2684 /* 2685 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2686 * a listener when changing state. 2687 */ 2688 conn_t * 2689 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2690 ip_stack_t *ipst) 2691 { 2692 connf_t *bind_connfp; 2693 conn_t *connp; 2694 tcp_t *tcp; 2695 2696 /* 2697 * Avoid false matches for packets sent to an IP destination of 2698 * all zeros. 
2699 */ 2700 if (laddr == 0) 2701 return (NULL); 2702 2703 ASSERT(zoneid != ALL_ZONES); 2704 2705 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2706 mutex_enter(&bind_connfp->connf_lock); 2707 for (connp = bind_connfp->connf_head; connp != NULL; 2708 connp = connp->conn_next) { 2709 tcp = connp->conn_tcp; 2710 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2711 IPCL_ZONE_MATCH(connp, zoneid) && 2712 (tcp->tcp_listener == NULL)) { 2713 CONN_INC_REF(connp); 2714 mutex_exit(&bind_connfp->connf_lock); 2715 return (connp); 2716 } 2717 } 2718 mutex_exit(&bind_connfp->connf_lock); 2719 return (NULL); 2720 } 2721 2722 /* 2723 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2724 * a listener when changing state. 2725 */ 2726 conn_t * 2727 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2728 zoneid_t zoneid, ip_stack_t *ipst) 2729 { 2730 connf_t *bind_connfp; 2731 conn_t *connp = NULL; 2732 tcp_t *tcp; 2733 2734 /* 2735 * Avoid false matches for packets sent to an IP destination of 2736 * all zeros. 2737 */ 2738 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2739 return (NULL); 2740 2741 ASSERT(zoneid != ALL_ZONES); 2742 2743 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2744 mutex_enter(&bind_connfp->connf_lock); 2745 for (connp = bind_connfp->connf_head; connp != NULL; 2746 connp = connp->conn_next) { 2747 tcp = connp->conn_tcp; 2748 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2749 IPCL_ZONE_MATCH(connp, zoneid) && 2750 (tcp->tcp_bound_if == 0 || 2751 tcp->tcp_bound_if == ifindex) && 2752 tcp->tcp_listener == NULL) { 2753 CONN_INC_REF(connp); 2754 mutex_exit(&bind_connfp->connf_lock); 2755 return (connp); 2756 } 2757 } 2758 mutex_exit(&bind_connfp->connf_lock); 2759 return (NULL); 2760 } 2761 2762 /* 2763 * ipcl_get_next_conn 2764 * get the next entry in the conn global list 2765 * and put a reference on the next_conn. 
2766 * decrement the reference on the current conn. 2767 * 2768 * This is an iterator based walker function that also provides for 2769 * some selection by the caller. It walks through the conn_hash bucket 2770 * searching for the next valid connp in the list, and selects connections 2771 * that are neither closed nor condemned. It also REFHOLDS the conn 2772 * thus ensuring that the conn exists when the caller uses the conn. 2773 */ 2774 conn_t * 2775 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2776 { 2777 conn_t *next_connp; 2778 2779 if (connfp == NULL) 2780 return (NULL); 2781 2782 mutex_enter(&connfp->connf_lock); 2783 2784 next_connp = (connp == NULL) ? 2785 connfp->connf_head : connp->conn_g_next; 2786 2787 while (next_connp != NULL) { 2788 mutex_enter(&next_connp->conn_lock); 2789 if (!(next_connp->conn_flags & conn_flags) || 2790 (next_connp->conn_state_flags & 2791 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2792 /* 2793 * This conn has been condemned or 2794 * is closing, or the flags don't match 2795 */ 2796 mutex_exit(&next_connp->conn_lock); 2797 next_connp = next_connp->conn_g_next; 2798 continue; 2799 } 2800 CONN_INC_REF_LOCKED(next_connp); 2801 mutex_exit(&next_connp->conn_lock); 2802 break; 2803 } 2804 2805 mutex_exit(&connfp->connf_lock); 2806 2807 if (connp != NULL) 2808 CONN_DEC_REF(connp); 2809 2810 return (next_connp); 2811 } 2812 2813 #ifdef CONN_DEBUG 2814 /* 2815 * Trace of the last NBUF refhold/refrele 2816 */ 2817 int 2818 conn_trace_ref(conn_t *connp) 2819 { 2820 int last; 2821 conn_trace_t *ctb; 2822 2823 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2824 last = connp->conn_trace_last; 2825 last++; 2826 if (last == CONN_TRACE_MAX) 2827 last = 0; 2828 2829 ctb = &connp->conn_trace_buf[last]; 2830 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2831 connp->conn_trace_last = last; 2832 return (1); 2833 } 2834 2835 int 2836 conn_untrace_ref(conn_t *connp) 2837 { 2838 int last; 2839 conn_trace_t *ctb; 2840 2841 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2842 last = connp->conn_trace_last; 2843 last++; 2844 if (last == CONN_TRACE_MAX) 2845 last = 0; 2846 2847 ctb = &connp->conn_trace_buf[last]; 2848 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2849 connp->conn_trace_last = last; 2850 return (1); 2851 } 2852 #endif 2853