1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
 * A reference is incremented when a connection
 * is placed in a list or table, when incoming packet for the connection arrives
 * and when connection is processed via squeue (squeue processing may be
 * asynchronous and the reference protects the connection from being destroyed
 * before its processing is finished).
 *
 * send and receive functions are currently used for TCP only. The send function
 * determines the IP entry point for the packet once it leaves TCP to be sent to
 * the destination address. The receive function is used by IP when the packet
 * should be passed for TCP processing. When a new connection is created these
 * are set to ip_output() and tcp_input() respectively. During the lifetime of
 * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
 * tcp_conn_request().  This allows incoming SYNs to go directly into the
 * listener SYN processing function without going to tcp_input() first.
 *
 * Classifier uses several hash tables:
 *
 *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
 *	ipcl_bind_fanout:	contains all connections in BOUND state
 *	ipcl_proto_fanout:	IPv4 protocol fanout
 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
 *	ipcl_udp_fanout:	contains all UDP connections
 *	ipcl_globalhash_fanout:	contains all connections
 *
 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
 * which need to view all existing connections.
 *
 * All tables are protected by per-bucket locks. When both per-bucket lock and
 * connection lock need to be held, the per-bucket lock should be acquired
 * first, followed by the connection lock.
 *
 * All functions doing search in one of these tables increment a reference
 * counter on the connection found (if any).
This reference should be dropped 84 * when the caller has finished processing the connection. 85 * 86 * 87 * INTERFACES: 88 * =========== 89 * 90 * Connection Lookup: 91 * ------------------ 92 * 93 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) 94 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) 95 * 96 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if 97 * it can't find any associated connection. If the connection is found, its 98 * reference counter is incremented. 99 * 100 * mp: mblock, containing packet header. The full header should fit 101 * into a single mblock. It should also contain at least full IP 102 * and TCP or UDP header. 103 * 104 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 105 * 106 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 107 * the packet. 108 * 109 * zoneid: The zone in which the returned connection must be; the zoneid 110 * corresponding to the ire_zoneid on the IRE located for the 111 * packet's destination address. 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packets belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 
 *
 * - If it's a multilevel port, then only the zone registered to receive
 *   packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.  For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet.  For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set.  This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary.  (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
 *		IPCL_TCPCONN	indicates a TCP connection
 *		IPCL_SCTPCONN	indicates a SCTP connection
 *		IPCL_UDPCONN	indicates a UDP conn_t.
 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
 *		IPCL_RTSCONN	indicates a RTS conn_t.
 *		IPCL_IPCCONN	indicates all other connections.
 *
 * void ipcl_conn_destroy(connp)
 *
 *	Destroys the connection state, removes it from the global
 *	connection hash table and frees its memory.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#define	_SUN_TPI_VERSION 2
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/isa_defs.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/ip_impl.h>
#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>

#include <sys/cpuvar.h>

#include <inet/ipclassifier.h>
#include <inet/tcp.h>
#include <inet/ipsec_impl.h>

#include <sys/tsol/tnet.h>
#include <sys/sockio.h>

/* Classifier debug tracing is compiled in only for DEBUG builds. */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args): when tracing is compiled in, calls
 * printf with the parenthesized argument list 'args' if any bit of 'level'
 * is set in ipcl_debug_level; otherwise expands to an empty block.
 *
 * NOTE(review): the debug form is a bare 'if' statement rather than a
 * do { } while (0) wrapper, so it is only safe where a dangling 'else'
 * cannot attach to it, and 'level' is not parenthesized in the expansion.
 */
#ifdef IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Divisor applied to free memory (in bytes) when sizing automatically. */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound applied to the automatically chosen size. */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The table is indexed by N; the trailing 0 entry is a sentinel that
 * ipcl_init() treats as "requested size out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
/*
 * The filler member pads the union out to CACHE_ALIGN(conn_s); the per-protocol
 * caches created in ipcl_g_init() size their objects as
 * sizeof (itc_t) + sizeof (<protocol struct>).
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;

/*
 * kmem caches backing the different conn_t flavours.  The ones defined here
 * are created in ipcl_g_init(); the extern ones are defined elsewhere and
 * are only used by this file (sctp_conn_cache in ipcl_conn_create(),
 * tcp_sack_info_cache and tcp_iphc_cache in ipcl_conn_destroy()).
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
struct kmem_cache  *ip_helper_stream_cache;
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* kmem cache constructor/destructor pairs registered by ipcl_g_init(). */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);

static int	ip_helper_stream_constructor(void *, void *, int);
static void	ip_helper_stream_destructor(void *, void *);

/* When B_TRUE, ipcl_g_init() creates ip_helper_stream_cache. */
boolean_t	ip_use_helper_cache = B_TRUE;

/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
364 */ 365 extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, 366 uint8_t *, in_port_t, void *); 367 extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, 368 uint8_t *, in_port_t, void *); 369 370 #ifdef IPCL_DEBUG 371 #define INET_NTOA_BUFSIZE 18 372 373 static char * 374 inet_ntoa_r(uint32_t in, char *b) 375 { 376 unsigned char *p; 377 378 p = (unsigned char *)∈ 379 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 380 return (b); 381 } 382 #endif 383 384 /* 385 * Global (for all stack instances) init routine 386 */ 387 void 388 ipcl_g_init(void) 389 { 390 ip_conn_cache = kmem_cache_create("ip_conn_cache", 391 sizeof (conn_t), CACHE_ALIGN_SIZE, 392 ip_conn_constructor, ip_conn_destructor, 393 NULL, NULL, NULL, 0); 394 395 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 396 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 397 tcp_conn_constructor, tcp_conn_destructor, 398 NULL, NULL, NULL, 0); 399 400 udp_conn_cache = kmem_cache_create("udp_conn_cache", 401 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 402 udp_conn_constructor, udp_conn_destructor, 403 NULL, NULL, NULL, 0); 404 405 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 406 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 407 rawip_conn_constructor, rawip_conn_destructor, 408 NULL, NULL, NULL, 0); 409 410 rts_conn_cache = kmem_cache_create("rts_conn_cache", 411 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 412 rts_conn_constructor, rts_conn_destructor, 413 NULL, NULL, NULL, 0); 414 415 if (ip_use_helper_cache) { 416 ip_helper_stream_cache = kmem_cache_create 417 ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), 418 CACHE_ALIGN_SIZE, ip_helper_stream_constructor, 419 ip_helper_stream_destructor, NULL, NULL, NULL, 0); 420 } else { 421 ip_helper_stream_cache = NULL; 422 } 423 } 424 425 /* 426 * ipclassifier intialization routine, sets up hash tables. 
427 */ 428 void 429 ipcl_init(ip_stack_t *ipst) 430 { 431 int i; 432 int sizes[] = P2Ps(); 433 434 /* 435 * Calculate size of conn fanout table from /etc/system settings 436 */ 437 if (ipcl_conn_hash_size != 0) { 438 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 439 } else if (tcp_conn_hash_size != 0) { 440 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 441 } else { 442 extern pgcnt_t freemem; 443 444 ipst->ips_ipcl_conn_fanout_size = 445 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 446 447 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 448 ipst->ips_ipcl_conn_fanout_size = 449 ipcl_conn_hash_maxsize; 450 } 451 } 452 453 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 454 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 455 break; 456 } 457 } 458 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 459 /* Out of range, use the 2^16 value */ 460 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 461 } 462 463 /* Take values from /etc/system */ 464 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 465 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 466 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 467 468 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 469 470 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 471 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 472 473 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 474 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 479 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 480 481 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 482 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 483 MUTEX_DEFAULT, NULL); 484 } 485 486 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 487 sizeof (connf_t), KM_SLEEP); 488 for (i = 0; i < IPPROTO_MAX; i++) { 489 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 490 
MUTEX_DEFAULT, NULL); 491 } 492 493 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 494 sizeof (connf_t), KM_SLEEP); 495 for (i = 0; i < IPPROTO_MAX; i++) { 496 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 497 MUTEX_DEFAULT, NULL); 498 } 499 500 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 501 mutex_init(&ipst->ips_rts_clients->connf_lock, 502 NULL, MUTEX_DEFAULT, NULL); 503 504 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 505 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 506 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 507 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 508 MUTEX_DEFAULT, NULL); 509 } 510 511 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 512 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 513 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 514 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 515 MUTEX_DEFAULT, NULL); 516 } 517 518 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 519 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 520 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 521 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 522 NULL, MUTEX_DEFAULT, NULL); 523 } 524 } 525 526 void 527 ipcl_g_destroy(void) 528 { 529 kmem_cache_destroy(ip_conn_cache); 530 kmem_cache_destroy(tcp_conn_cache); 531 kmem_cache_destroy(udp_conn_cache); 532 kmem_cache_destroy(rawip_conn_cache); 533 kmem_cache_destroy(rts_conn_cache); 534 } 535 536 /* 537 * All user-level and kernel use of the stack must be gone 538 * by now. 
539 */ 540 void 541 ipcl_destroy(ip_stack_t *ipst) 542 { 543 int i; 544 545 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 546 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 547 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 548 } 549 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 550 sizeof (connf_t)); 551 ipst->ips_ipcl_conn_fanout = NULL; 552 553 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 554 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 555 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 556 } 557 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 558 sizeof (connf_t)); 559 ipst->ips_ipcl_bind_fanout = NULL; 560 561 for (i = 0; i < IPPROTO_MAX; i++) { 562 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 563 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 564 } 565 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 566 ipst->ips_ipcl_proto_fanout = NULL; 567 568 for (i = 0; i < IPPROTO_MAX; i++) { 569 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 570 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 571 } 572 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 573 IPPROTO_MAX * sizeof (connf_t)); 574 ipst->ips_ipcl_proto_fanout_v6 = NULL; 575 576 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 577 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 578 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 579 } 580 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 581 sizeof (connf_t)); 582 ipst->ips_ipcl_udp_fanout = NULL; 583 584 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 585 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 586 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 587 } 588 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 589 sizeof (connf_t)); 590 ipst->ips_ipcl_raw_fanout = NULL; 591 592 for (i 
= 0; i < CONN_G_HASH_SIZE; i++) { 593 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 594 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 595 } 596 kmem_free(ipst->ips_ipcl_globalhash_fanout, 597 sizeof (connf_t) * CONN_G_HASH_SIZE); 598 ipst->ips_ipcl_globalhash_fanout = NULL; 599 600 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 601 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 602 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 603 ipst->ips_rts_clients = NULL; 604 } 605 606 /* 607 * conn creation routine. initialize the conn, sets the reference 608 * and inserts it in the global hash table. 609 */ 610 conn_t * 611 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 612 { 613 conn_t *connp; 614 sctp_stack_t *sctps; 615 struct kmem_cache *conn_cache; 616 617 switch (type) { 618 case IPCL_SCTPCONN: 619 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 620 return (NULL); 621 sctp_conn_init(connp); 622 sctps = ns->netstack_sctp; 623 SCTP_G_Q_REFHOLD(sctps); 624 netstack_hold(ns); 625 connp->conn_netstack = ns; 626 return (connp); 627 628 case IPCL_TCPCONN: 629 conn_cache = tcp_conn_cache; 630 break; 631 632 case IPCL_UDPCONN: 633 conn_cache = udp_conn_cache; 634 break; 635 636 case IPCL_RAWIPCONN: 637 conn_cache = rawip_conn_cache; 638 break; 639 640 case IPCL_RTSCONN: 641 conn_cache = rts_conn_cache; 642 break; 643 644 case IPCL_IPCCONN: 645 conn_cache = ip_conn_cache; 646 break; 647 648 default: 649 connp = NULL; 650 ASSERT(0); 651 } 652 653 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 654 return (NULL); 655 656 connp->conn_ref = 1; 657 netstack_hold(ns); 658 connp->conn_netstack = ns; 659 ipcl_globalhash_insert(connp); 660 return (connp); 661 } 662 663 void 664 ipcl_conn_destroy(conn_t *connp) 665 { 666 mblk_t *mp; 667 netstack_t *ns = connp->conn_netstack; 668 669 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 670 ASSERT(connp->conn_ref == 0); 671 ASSERT(connp->conn_ire_cache == NULL); 672 
673 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 674 675 if (connp->conn_peercred != NULL) { 676 crfree(connp->conn_peercred); 677 connp->conn_peercred = NULL; 678 } 679 680 if (connp->conn_cred != NULL) { 681 crfree(connp->conn_cred); 682 connp->conn_cred = NULL; 683 } 684 685 ipcl_globalhash_remove(connp); 686 687 /* FIXME: add separate tcp_conn_free()? */ 688 if (connp->conn_flags & IPCL_TCPCONN) { 689 tcp_t *tcp = connp->conn_tcp; 690 tcp_stack_t *tcps; 691 692 ASSERT(tcp != NULL); 693 tcps = tcp->tcp_tcps; 694 if (tcps != NULL) { 695 if (connp->conn_latch != NULL) { 696 IPLATCH_REFRELE(connp->conn_latch, ns); 697 connp->conn_latch = NULL; 698 } 699 if (connp->conn_policy != NULL) { 700 IPPH_REFRELE(connp->conn_policy, ns); 701 connp->conn_policy = NULL; 702 } 703 tcp->tcp_tcps = NULL; 704 TCPS_REFRELE(tcps); 705 } 706 707 tcp_free(tcp); 708 mp = tcp->tcp_timercache; 709 tcp->tcp_cred = NULL; 710 711 if (tcp->tcp_sack_info != NULL) { 712 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 713 kmem_cache_free(tcp_sack_info_cache, 714 tcp->tcp_sack_info); 715 } 716 if (tcp->tcp_iphc != NULL) { 717 if (tcp->tcp_hdr_grown) { 718 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 719 } else { 720 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 721 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 722 } 723 tcp->tcp_iphc_len = 0; 724 } 725 ASSERT(tcp->tcp_iphc_len == 0); 726 727 /* 728 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 729 * the mblk. 
730 */ 731 if (tcp->tcp_rsrv_mp != NULL) { 732 freeb(tcp->tcp_rsrv_mp); 733 tcp->tcp_rsrv_mp = NULL; 734 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 735 } 736 737 ASSERT(connp->conn_latch == NULL); 738 ASSERT(connp->conn_policy == NULL); 739 740 if (ns != NULL) { 741 ASSERT(tcp->tcp_tcps == NULL); 742 connp->conn_netstack = NULL; 743 netstack_rele(ns); 744 } 745 746 ipcl_conn_cleanup(connp); 747 connp->conn_flags = IPCL_TCPCONN; 748 bzero(tcp, sizeof (tcp_t)); 749 750 tcp->tcp_timercache = mp; 751 tcp->tcp_connp = connp; 752 kmem_cache_free(tcp_conn_cache, connp); 753 return; 754 } 755 if (connp->conn_latch != NULL) { 756 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 757 connp->conn_latch = NULL; 758 } 759 if (connp->conn_policy != NULL) { 760 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 761 connp->conn_policy = NULL; 762 } 763 if (connp->conn_ipsec_opt_mp != NULL) { 764 freemsg(connp->conn_ipsec_opt_mp); 765 connp->conn_ipsec_opt_mp = NULL; 766 } 767 768 if (connp->conn_flags & IPCL_SCTPCONN) { 769 ASSERT(ns != NULL); 770 sctp_free(connp); 771 return; 772 } 773 774 if (ns != NULL) { 775 connp->conn_netstack = NULL; 776 netstack_rele(ns); 777 } 778 779 ipcl_conn_cleanup(connp); 780 781 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 782 if (connp->conn_flags & IPCL_UDPCONN) { 783 connp->conn_flags = IPCL_UDPCONN; 784 kmem_cache_free(udp_conn_cache, connp); 785 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 786 787 connp->conn_flags = IPCL_RAWIPCONN; 788 connp->conn_ulp = IPPROTO_ICMP; 789 kmem_cache_free(rawip_conn_cache, connp); 790 } else if (connp->conn_flags & IPCL_RTSCONN) { 791 connp->conn_flags = IPCL_RTSCONN; 792 kmem_cache_free(rts_conn_cache, connp); 793 } else { 794 connp->conn_flags = IPCL_IPCCONN; 795 ASSERT(connp->conn_flags & IPCL_IPCCONN); 796 ASSERT(connp->conn_priv == NULL); 797 kmem_cache_free(ip_conn_cache, connp); 798 } 799 } 800 801 /* 802 * Running in cluster mode - deregister listener information 803 */ 804 805 static void 806 ipcl_conn_unlisten(conn_t *connp) 807 { 808 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 809 ASSERT(connp->conn_lport != 0); 810 811 if (cl_inet_unlisten != NULL) { 812 sa_family_t addr_family; 813 uint8_t *laddrp; 814 815 if (connp->conn_pkt_isv6) { 816 addr_family = AF_INET6; 817 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 818 } else { 819 addr_family = AF_INET; 820 laddrp = (uint8_t *)&connp->conn_bound_source; 821 } 822 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 823 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 824 } 825 connp->conn_flags &= ~IPCL_CL_LISTENER; 826 } 827 828 /* 829 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 830 * which table the conn belonged to). So for debugging we can see which hash 831 * table this connection was in. 
 */
/*
 * IPCL_HASH_REMOVE(connp): unlink 'connp' from whatever fanout bucket it is
 * currently on (conn_fanout); a no-op when it is on none.
 *
 * The caller must not hold conn_lock.  Under the bucket lock the macro
 * unlinks the conn from the doubly linked list, clears its linkage, sets
 * IPCL_REMOVED, deregisters a cluster listener if IPCL_CL_LISTENER is set,
 * and drops the reference the table held via CONN_DEC_REF().
 *
 * The macro declares its own local named 'connfp' inside its braces, so it
 * must be used as a statement and its argument must not be an expression
 * that refers to a caller variable of that name.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/* Function wrapper around IPCL_HASH_REMOVE() for callers outside this file. */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}

/*
 * The whole purpose of this function is to allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
875 */ 876 void 877 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 878 { 879 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 880 ASSERT(MUTEX_HELD(&connp->conn_lock)); 881 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 882 883 if ((connp)->conn_next != NULL) { 884 (connp)->conn_next->conn_prev = (connp)->conn_prev; 885 } 886 if ((connp)->conn_prev != NULL) { 887 (connp)->conn_prev->conn_next = (connp)->conn_next; 888 } else { 889 connfp->connf_head = (connp)->conn_next; 890 } 891 (connp)->conn_fanout = NULL; 892 (connp)->conn_next = NULL; 893 (connp)->conn_prev = NULL; 894 (connp)->conn_flags |= IPCL_REMOVED; 895 ASSERT((connp)->conn_ref == 2); 896 (connp)->conn_ref--; 897 } 898 899 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 900 ASSERT((connp)->conn_fanout == NULL); \ 901 ASSERT((connp)->conn_next == NULL); \ 902 ASSERT((connp)->conn_prev == NULL); \ 903 if ((connfp)->connf_head != NULL) { \ 904 (connfp)->connf_head->conn_prev = (connp); \ 905 (connp)->conn_next = (connfp)->connf_head; \ 906 } \ 907 (connp)->conn_fanout = (connfp); \ 908 (connfp)->connf_head = (connp); \ 909 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 910 IPCL_CONNECTED; \ 911 CONN_INC_REF(connp); \ 912 } 913 914 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 915 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 916 "connp %p", (void *)(connfp), (void *)(connp))); \ 917 IPCL_HASH_REMOVE((connp)); \ 918 mutex_enter(&(connfp)->connf_lock); \ 919 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 920 mutex_exit(&(connfp)->connf_lock); \ 921 } 922 923 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 924 conn_t *pconnp = NULL, *nconnp; \ 925 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 926 "connp %p", (void *)connfp, (void *)(connp))); \ 927 IPCL_HASH_REMOVE((connp)); \ 928 mutex_enter(&(connfp)->connf_lock); \ 929 nconnp = (connfp)->connf_head; \ 930 while (nconnp != NULL && \ 931 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { 
\ 932 pconnp = nconnp; \ 933 nconnp = nconnp->conn_next; \ 934 } \ 935 if (pconnp != NULL) { \ 936 pconnp->conn_next = (connp); \ 937 (connp)->conn_prev = pconnp; \ 938 } else { \ 939 (connfp)->connf_head = (connp); \ 940 } \ 941 if (nconnp != NULL) { \ 942 (connp)->conn_next = nconnp; \ 943 nconnp->conn_prev = (connp); \ 944 } \ 945 (connp)->conn_fanout = (connfp); \ 946 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 947 IPCL_BOUND; \ 948 CONN_INC_REF(connp); \ 949 mutex_exit(&(connfp)->connf_lock); \ 950 } 951 952 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 953 conn_t **list, *prev, *next; \ 954 boolean_t isv4mapped = \ 955 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 956 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 957 "connp %p", (void *)(connfp), (void *)(connp))); \ 958 IPCL_HASH_REMOVE((connp)); \ 959 mutex_enter(&(connfp)->connf_lock); \ 960 list = &(connfp)->connf_head; \ 961 prev = NULL; \ 962 while ((next = *list) != NULL) { \ 963 if (isv4mapped && \ 964 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 965 connp->conn_zoneid == next->conn_zoneid) { \ 966 (connp)->conn_next = next; \ 967 if (prev != NULL) \ 968 prev = next->conn_prev; \ 969 next->conn_prev = (connp); \ 970 break; \ 971 } \ 972 list = &next->conn_next; \ 973 prev = next; \ 974 } \ 975 (connp)->conn_prev = prev; \ 976 *list = (connp); \ 977 (connp)->conn_fanout = (connfp); \ 978 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 979 IPCL_BOUND; \ 980 CONN_INC_REF((connp)); \ 981 mutex_exit(&(connfp)->connf_lock); \ 982 } 983 984 void 985 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 986 { 987 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 988 } 989 990 void 991 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 992 { 993 connf_t *connfp; 994 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 995 996 ASSERT(connp != NULL); 997 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 998 protocol == IPPROTO_ESP); 999 1000 
connp->conn_ulp = protocol; 1001 1002 /* Insert it in the protocol hash */ 1003 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1004 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1005 } 1006 1007 void 1008 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 1009 { 1010 connf_t *connfp; 1011 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1012 1013 ASSERT(connp != NULL); 1014 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 1015 protocol == IPPROTO_ESP); 1016 1017 connp->conn_ulp = protocol; 1018 1019 /* Insert it in the Bind Hash */ 1020 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1021 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1022 } 1023 1024 /* 1025 * This function is used only for inserting SCTP raw socket now. 1026 * This may change later. 1027 * 1028 * Note that only one raw socket can be bound to a port. The param 1029 * lport is in network byte order. 1030 */ 1031 static int 1032 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1033 { 1034 connf_t *connfp; 1035 conn_t *oconnp; 1036 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1037 1038 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1039 1040 /* Check for existing raw socket already bound to the port. 
*/ 1041 mutex_enter(&connfp->connf_lock); 1042 for (oconnp = connfp->connf_head; oconnp != NULL; 1043 oconnp = oconnp->conn_next) { 1044 if (oconnp->conn_lport == lport && 1045 oconnp->conn_zoneid == connp->conn_zoneid && 1046 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1047 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1048 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1049 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1050 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1051 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1052 &connp->conn_srcv6))) { 1053 break; 1054 } 1055 } 1056 mutex_exit(&connfp->connf_lock); 1057 if (oconnp != NULL) 1058 return (EADDRNOTAVAIL); 1059 1060 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1061 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1062 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1063 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1064 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1065 } else { 1066 IPCL_HASH_INSERT_BOUND(connfp, connp); 1067 } 1068 } else { 1069 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1070 } 1071 return (0); 1072 } 1073 1074 /* 1075 * Check for a MAC exemption conflict on a labeled system. Note that for 1076 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1077 * transport layer. This check is for binding all other protocols. 1078 * 1079 * Returns true if there's a conflict. 
1080 */ 1081 static boolean_t 1082 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1083 { 1084 connf_t *connfp; 1085 conn_t *tconn; 1086 1087 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1088 mutex_enter(&connfp->connf_lock); 1089 for (tconn = connfp->connf_head; tconn != NULL; 1090 tconn = tconn->conn_next) { 1091 /* We don't allow v4 fallback for v6 raw socket */ 1092 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1093 continue; 1094 /* If neither is exempt, then there's no conflict */ 1095 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1096 continue; 1097 /* If both are bound to different specific addrs, ok */ 1098 if (connp->conn_src != INADDR_ANY && 1099 tconn->conn_src != INADDR_ANY && 1100 connp->conn_src != tconn->conn_src) 1101 continue; 1102 /* These two conflict; fail */ 1103 break; 1104 } 1105 mutex_exit(&connfp->connf_lock); 1106 return (tconn != NULL); 1107 } 1108 1109 static boolean_t 1110 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1111 { 1112 connf_t *connfp; 1113 conn_t *tconn; 1114 1115 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1116 mutex_enter(&connfp->connf_lock); 1117 for (tconn = connfp->connf_head; tconn != NULL; 1118 tconn = tconn->conn_next) { 1119 /* We don't allow v4 fallback for v6 raw socket */ 1120 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1121 continue; 1122 /* If neither is exempt, then there's no conflict */ 1123 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1124 continue; 1125 /* If both are bound to different addrs, ok */ 1126 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1127 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1128 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1129 continue; 1130 /* These two conflict; fail */ 1131 break; 1132 } 1133 mutex_exit(&connfp->connf_lock); 1134 return (tconn != NULL); 1135 } 1136 1137 /* 1138 * (v4, v6) bind hash insertion routines 1139 */ 1140 int 1141 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1142 { 1143 connf_t *connfp; 1144 #ifdef IPCL_DEBUG 1145 char buf[INET_NTOA_BUFSIZE]; 1146 #endif 1147 int ret = 0; 1148 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1149 1150 ASSERT(connp); 1151 1152 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1153 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1154 1155 connp->conn_ulp = protocol; 1156 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1157 connp->conn_lport = lport; 1158 1159 switch (protocol) { 1160 default: 1161 if (is_system_labeled() && 1162 check_exempt_conflict_v4(connp, ipst)) 1163 return (EADDRINUSE); 1164 /* FALLTHROUGH */ 1165 case IPPROTO_UDP: 1166 if (protocol == IPPROTO_UDP) { 1167 IPCL_DEBUG_LVL(64, 1168 ("ipcl_bind_insert: connp %p - udp\n", 1169 (void *)connp)); 1170 connfp = &ipst->ips_ipcl_udp_fanout[ 1171 IPCL_UDP_HASH(lport, ipst)]; 1172 } else { 1173 IPCL_DEBUG_LVL(64, 1174 ("ipcl_bind_insert: connp %p - protocol\n", 1175 (void *)connp)); 1176 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1177 } 1178 1179 if (connp->conn_rem != INADDR_ANY) { 1180 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1181 } else if (connp->conn_src != INADDR_ANY) { 1182 IPCL_HASH_INSERT_BOUND(connfp, connp); 1183 } else { 1184 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1185 } 1186 break; 1187 1188 case IPPROTO_TCP: 1189 1190 /* Insert it in the Bind Hash */ 1191 ASSERT(connp->conn_zoneid != ALL_ZONES); 1192 connfp = &ipst->ips_ipcl_bind_fanout[ 1193 IPCL_BIND_HASH(lport, ipst)]; 1194 if (connp->conn_src != INADDR_ANY) { 1195 IPCL_HASH_INSERT_BOUND(connfp, connp); 1196 } else { 1197 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1198 } 1199 if (cl_inet_listen != NULL) { 1200 ASSERT(!connp->conn_pkt_isv6); 1201 connp->conn_flags |= IPCL_CL_LISTENER; 1202 (*cl_inet_listen)( 1203 connp->conn_netstack->netstack_stackid, 1204 IPPROTO_TCP, AF_INET, 1205 (uint8_t *)&connp->conn_bound_source, lport, NULL); 1206 } 1207 break; 1208 1209 case IPPROTO_SCTP: 
1210 ret = ipcl_sctp_hash_insert(connp, lport); 1211 break; 1212 } 1213 1214 return (ret); 1215 } 1216 1217 int 1218 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1219 uint16_t lport) 1220 { 1221 connf_t *connfp; 1222 int ret = 0; 1223 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1224 1225 ASSERT(connp); 1226 1227 connp->conn_ulp = protocol; 1228 connp->conn_srcv6 = *src; 1229 connp->conn_lport = lport; 1230 1231 switch (protocol) { 1232 default: 1233 if (is_system_labeled() && 1234 check_exempt_conflict_v6(connp, ipst)) 1235 return (EADDRINUSE); 1236 /* FALLTHROUGH */ 1237 case IPPROTO_UDP: 1238 if (protocol == IPPROTO_UDP) { 1239 IPCL_DEBUG_LVL(128, 1240 ("ipcl_bind_insert_v6: connp %p - udp\n", 1241 (void *)connp)); 1242 connfp = &ipst->ips_ipcl_udp_fanout[ 1243 IPCL_UDP_HASH(lport, ipst)]; 1244 } else { 1245 IPCL_DEBUG_LVL(128, 1246 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1247 (void *)connp)); 1248 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1249 } 1250 1251 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1252 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1253 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1254 IPCL_HASH_INSERT_BOUND(connfp, connp); 1255 } else { 1256 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1257 } 1258 break; 1259 1260 case IPPROTO_TCP: 1261 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1262 1263 /* Insert it in the Bind Hash */ 1264 ASSERT(connp->conn_zoneid != ALL_ZONES); 1265 connfp = &ipst->ips_ipcl_bind_fanout[ 1266 IPCL_BIND_HASH(lport, ipst)]; 1267 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1268 IPCL_HASH_INSERT_BOUND(connfp, connp); 1269 } else { 1270 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1271 } 1272 if (cl_inet_listen != NULL) { 1273 sa_family_t addr_family; 1274 uint8_t *laddrp; 1275 1276 if (connp->conn_pkt_isv6) { 1277 addr_family = AF_INET6; 1278 laddrp = 1279 (uint8_t *)&connp->conn_bound_source_v6; 1280 } else { 1281 addr_family = AF_INET; 1282 laddrp = (uint8_t *)&connp->conn_bound_source; 1283 } 1284 connp->conn_flags |= IPCL_CL_LISTENER; 1285 (*cl_inet_listen)( 1286 connp->conn_netstack->netstack_stackid, 1287 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1288 } 1289 break; 1290 1291 case IPPROTO_SCTP: 1292 ret = ipcl_sctp_hash_insert(connp, lport); 1293 break; 1294 } 1295 1296 return (ret); 1297 } 1298 1299 /* 1300 * ipcl_conn_hash insertion routines. 1301 */ 1302 int 1303 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1304 ipaddr_t rem, uint32_t ports) 1305 { 1306 connf_t *connfp; 1307 uint16_t *up; 1308 conn_t *tconnp; 1309 #ifdef IPCL_DEBUG 1310 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1311 #endif 1312 in_port_t lport; 1313 int ret = 0; 1314 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1315 1316 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1317 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1318 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1319 ports, protocol)); 1320 1321 switch (protocol) { 1322 case IPPROTO_TCP: 1323 if (!(connp->conn_flags & IPCL_EAGER)) { 1324 /* 1325 * for a eager connection, i.e connections which 1326 * have just been created, the initialization is 1327 * already done in ip at conn_creation time, so 1328 * we can skip the checks here. 
1329 */ 1330 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1331 } 1332 1333 /* 1334 * For tcp, we check whether the connection tuple already 1335 * exists before allowing the connection to proceed. We 1336 * also allow indexing on the zoneid. This is to allow 1337 * multiple shared stack zones to have the same tcp 1338 * connection tuple. In practice this only happens for 1339 * INADDR_LOOPBACK as it's the only local address which 1340 * doesn't have to be unique. 1341 */ 1342 connfp = &ipst->ips_ipcl_conn_fanout[ 1343 IPCL_CONN_HASH(connp->conn_rem, 1344 connp->conn_ports, ipst)]; 1345 mutex_enter(&connfp->connf_lock); 1346 for (tconnp = connfp->connf_head; tconnp != NULL; 1347 tconnp = tconnp->conn_next) { 1348 if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1349 connp->conn_rem, connp->conn_src, 1350 connp->conn_ports)) && 1351 (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { 1352 1353 /* Already have a conn. bail out */ 1354 mutex_exit(&connfp->connf_lock); 1355 return (EADDRINUSE); 1356 } 1357 } 1358 if (connp->conn_fanout != NULL) { 1359 /* 1360 * Probably a XTI/TLI application trying to do a 1361 * rebind. Let it happen. 1362 */ 1363 mutex_exit(&connfp->connf_lock); 1364 IPCL_HASH_REMOVE(connp); 1365 mutex_enter(&connfp->connf_lock); 1366 } 1367 1368 ASSERT(connp->conn_recv != NULL); 1369 1370 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1371 mutex_exit(&connfp->connf_lock); 1372 break; 1373 1374 case IPPROTO_SCTP: 1375 /* 1376 * The raw socket may have already been bound, remove it 1377 * from the hash first. 1378 */ 1379 IPCL_HASH_REMOVE(connp); 1380 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1381 ret = ipcl_sctp_hash_insert(connp, lport); 1382 break; 1383 1384 default: 1385 /* 1386 * Check for conflicts among MAC exempt bindings. For 1387 * transports with port numbers, this is done by the upper 1388 * level per-transport binding logic. For all others, it's 1389 * done here. 
1390 */ 1391 if (is_system_labeled() && 1392 check_exempt_conflict_v4(connp, ipst)) 1393 return (EADDRINUSE); 1394 /* FALLTHROUGH */ 1395 1396 case IPPROTO_UDP: 1397 up = (uint16_t *)&ports; 1398 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1399 if (protocol == IPPROTO_UDP) { 1400 connfp = &ipst->ips_ipcl_udp_fanout[ 1401 IPCL_UDP_HASH(up[1], ipst)]; 1402 } else { 1403 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1404 } 1405 1406 if (connp->conn_rem != INADDR_ANY) { 1407 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1408 } else if (connp->conn_src != INADDR_ANY) { 1409 IPCL_HASH_INSERT_BOUND(connfp, connp); 1410 } else { 1411 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1412 } 1413 break; 1414 } 1415 1416 return (ret); 1417 } 1418 1419 int 1420 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1421 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1422 { 1423 connf_t *connfp; 1424 uint16_t *up; 1425 conn_t *tconnp; 1426 in_port_t lport; 1427 int ret = 0; 1428 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1429 1430 switch (protocol) { 1431 case IPPROTO_TCP: 1432 /* Just need to insert a conn struct */ 1433 if (!(connp->conn_flags & IPCL_EAGER)) { 1434 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1435 } 1436 1437 /* 1438 * For tcp, we check whether the connection tuple already 1439 * exists before allowing the connection to proceed. We 1440 * also allow indexing on the zoneid. This is to allow 1441 * multiple shared stack zones to have the same tcp 1442 * connection tuple. In practice this only happens for 1443 * ipv6_loopback as it's the only local address which 1444 * doesn't have to be unique. 
1445 */ 1446 connfp = &ipst->ips_ipcl_conn_fanout[ 1447 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1448 ipst)]; 1449 mutex_enter(&connfp->connf_lock); 1450 for (tconnp = connfp->connf_head; tconnp != NULL; 1451 tconnp = tconnp->conn_next) { 1452 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1453 connp->conn_remv6, connp->conn_srcv6, 1454 connp->conn_ports) && 1455 (tconnp->conn_tcp->tcp_bound_if == 0 || 1456 tconnp->conn_tcp->tcp_bound_if == ifindex) && 1457 (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { 1458 /* Already have a conn. bail out */ 1459 mutex_exit(&connfp->connf_lock); 1460 return (EADDRINUSE); 1461 } 1462 } 1463 if (connp->conn_fanout != NULL) { 1464 /* 1465 * Probably a XTI/TLI application trying to do a 1466 * rebind. Let it happen. 1467 */ 1468 mutex_exit(&connfp->connf_lock); 1469 IPCL_HASH_REMOVE(connp); 1470 mutex_enter(&connfp->connf_lock); 1471 } 1472 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1473 mutex_exit(&connfp->connf_lock); 1474 break; 1475 1476 case IPPROTO_SCTP: 1477 IPCL_HASH_REMOVE(connp); 1478 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1479 ret = ipcl_sctp_hash_insert(connp, lport); 1480 break; 1481 1482 default: 1483 if (is_system_labeled() && 1484 check_exempt_conflict_v6(connp, ipst)) 1485 return (EADDRINUSE); 1486 /* FALLTHROUGH */ 1487 case IPPROTO_UDP: 1488 up = (uint16_t *)&ports; 1489 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1490 if (protocol == IPPROTO_UDP) { 1491 connfp = &ipst->ips_ipcl_udp_fanout[ 1492 IPCL_UDP_HASH(up[1], ipst)]; 1493 } else { 1494 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1495 } 1496 1497 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1498 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1499 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1500 IPCL_HASH_INSERT_BOUND(connfp, connp); 1501 } else { 1502 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1503 } 1504 break; 1505 } 1506 1507 return (ret); 1508 } 1509 1510 /* 1511 * v4 packet 
classifying function. looks up the fanout table to 1512 * find the conn, the packet belongs to. returns the conn with 1513 * the reference held, null otherwise. 1514 * 1515 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1516 * Lookup" comment block are applied. Labels are also checked as described 1517 * above. If the packet is from the inside (looped back), and is from the same 1518 * zone, then label checks are omitted. 1519 */ 1520 conn_t * 1521 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1522 ip_stack_t *ipst) 1523 { 1524 ipha_t *ipha; 1525 connf_t *connfp, *bind_connfp; 1526 uint16_t lport; 1527 uint16_t fport; 1528 uint32_t ports; 1529 conn_t *connp; 1530 uint16_t *up; 1531 boolean_t shared_addr; 1532 boolean_t unlabeled; 1533 1534 ipha = (ipha_t *)mp->b_rptr; 1535 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1536 1537 switch (protocol) { 1538 case IPPROTO_TCP: 1539 ports = *(uint32_t *)up; 1540 connfp = 1541 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1542 ports, ipst)]; 1543 mutex_enter(&connfp->connf_lock); 1544 for (connp = connfp->connf_head; connp != NULL; 1545 connp = connp->conn_next) { 1546 if ((IPCL_CONN_MATCH(connp, protocol, 1547 ipha->ipha_src, ipha->ipha_dst, ports)) && 1548 (IPCL_ZONE_MATCH(connp, zoneid))) { 1549 break; 1550 } 1551 } 1552 1553 if (connp != NULL) { 1554 /* 1555 * We have a fully-bound TCP connection. 1556 * 1557 * For labeled systems, there's no need to check the 1558 * label here. It's known to be good as we checked 1559 * before allowing the connection to become bound. 
1560 */ 1561 CONN_INC_REF(connp); 1562 mutex_exit(&connfp->connf_lock); 1563 return (connp); 1564 } 1565 1566 mutex_exit(&connfp->connf_lock); 1567 1568 lport = up[1]; 1569 unlabeled = B_FALSE; 1570 /* Cred cannot be null on IPv4 */ 1571 if (is_system_labeled()) { 1572 cred_t *cr = msg_getcred(mp, NULL); 1573 ASSERT(cr != NULL); 1574 unlabeled = (crgetlabel(cr)->tsl_flags & 1575 TSLF_UNLABELED) != 0; 1576 } 1577 shared_addr = (zoneid == ALL_ZONES); 1578 if (shared_addr) { 1579 /* 1580 * No need to handle exclusive-stack zones since 1581 * ALL_ZONES only applies to the shared stack. 1582 */ 1583 zoneid = tsol_mlp_findzone(protocol, lport); 1584 /* 1585 * If no shared MLP is found, tsol_mlp_findzone returns 1586 * ALL_ZONES. In that case, we assume it's SLP, and 1587 * search for the zone based on the packet label. 1588 * 1589 * If there is such a zone, we prefer to find a 1590 * connection in it. Otherwise, we look for a 1591 * MAC-exempt connection in any zone whose label 1592 * dominates the default label on the packet. 1593 */ 1594 if (zoneid == ALL_ZONES) 1595 zoneid = tsol_packet_to_zoneid(mp); 1596 else 1597 unlabeled = B_FALSE; 1598 } 1599 1600 bind_connfp = 1601 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1602 mutex_enter(&bind_connfp->connf_lock); 1603 for (connp = bind_connfp->connf_head; connp != NULL; 1604 connp = connp->conn_next) { 1605 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1606 lport) && (IPCL_ZONE_MATCH(connp, zoneid) || 1607 (unlabeled && connp->conn_mac_exempt))) 1608 break; 1609 } 1610 1611 /* 1612 * If the matching connection is SLP on a private address, then 1613 * the label on the packet must match the local zone's label. 1614 * Otherwise, it must be in the label range defined by tnrh. 1615 * This is ensured by tsol_receive_label. 
1616 */ 1617 if (connp != NULL && is_system_labeled() && 1618 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1619 shared_addr, connp)) { 1620 DTRACE_PROBE3( 1621 tx__ip__log__info__classify__tcp, 1622 char *, 1623 "connp(1) could not receive mp(2)", 1624 conn_t *, connp, mblk_t *, mp); 1625 connp = NULL; 1626 } 1627 1628 if (connp != NULL) { 1629 /* Have a listener at least */ 1630 CONN_INC_REF(connp); 1631 mutex_exit(&bind_connfp->connf_lock); 1632 return (connp); 1633 } 1634 1635 mutex_exit(&bind_connfp->connf_lock); 1636 1637 IPCL_DEBUG_LVL(512, 1638 ("ipcl_classify: couldn't classify mp = %p\n", 1639 (void *)mp)); 1640 break; 1641 1642 case IPPROTO_UDP: 1643 lport = up[1]; 1644 unlabeled = B_FALSE; 1645 /* Cred cannot be null on IPv4 */ 1646 if (is_system_labeled()) { 1647 cred_t *cr = msg_getcred(mp, NULL); 1648 ASSERT(cr != NULL); 1649 unlabeled = (crgetlabel(cr)->tsl_flags & 1650 TSLF_UNLABELED) != 0; 1651 } 1652 shared_addr = (zoneid == ALL_ZONES); 1653 if (shared_addr) { 1654 /* 1655 * No need to handle exclusive-stack zones since 1656 * ALL_ZONES only applies to the shared stack. 1657 */ 1658 zoneid = tsol_mlp_findzone(protocol, lport); 1659 /* 1660 * If no shared MLP is found, tsol_mlp_findzone returns 1661 * ALL_ZONES. In that case, we assume it's SLP, and 1662 * search for the zone based on the packet label. 1663 * 1664 * If there is such a zone, we prefer to find a 1665 * connection in it. Otherwise, we look for a 1666 * MAC-exempt connection in any zone whose label 1667 * dominates the default label on the packet. 
1668 */ 1669 if (zoneid == ALL_ZONES) 1670 zoneid = tsol_packet_to_zoneid(mp); 1671 else 1672 unlabeled = B_FALSE; 1673 } 1674 fport = up[0]; 1675 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); 1676 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1677 mutex_enter(&connfp->connf_lock); 1678 for (connp = connfp->connf_head; connp != NULL; 1679 connp = connp->conn_next) { 1680 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1681 fport, ipha->ipha_src) && 1682 (IPCL_ZONE_MATCH(connp, zoneid) || 1683 (unlabeled && connp->conn_mac_exempt))) 1684 break; 1685 } 1686 1687 if (connp != NULL && is_system_labeled() && 1688 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1689 shared_addr, connp)) { 1690 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1691 char *, "connp(1) could not receive mp(2)", 1692 conn_t *, connp, mblk_t *, mp); 1693 connp = NULL; 1694 } 1695 1696 if (connp != NULL) { 1697 CONN_INC_REF(connp); 1698 mutex_exit(&connfp->connf_lock); 1699 return (connp); 1700 } 1701 1702 /* 1703 * We shouldn't come here for multicast/broadcast packets 1704 */ 1705 mutex_exit(&connfp->connf_lock); 1706 IPCL_DEBUG_LVL(512, 1707 ("ipcl_classify: cant find udp conn_t for ports : %x %x", 1708 lport, fport)); 1709 break; 1710 } 1711 1712 return (NULL); 1713 } 1714 1715 conn_t * 1716 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1717 ip_stack_t *ipst) 1718 { 1719 ip6_t *ip6h; 1720 connf_t *connfp, *bind_connfp; 1721 uint16_t lport; 1722 uint16_t fport; 1723 tcph_t *tcph; 1724 uint32_t ports; 1725 conn_t *connp; 1726 uint16_t *up; 1727 boolean_t shared_addr; 1728 boolean_t unlabeled; 1729 1730 ip6h = (ip6_t *)mp->b_rptr; 1731 1732 switch (protocol) { 1733 case IPPROTO_TCP: 1734 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 1735 up = (uint16_t *)tcph->th_lport; 1736 ports = *(uint32_t *)up; 1737 1738 connfp = 1739 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1740 ports, ipst)]; 1741 
mutex_enter(&connfp->connf_lock); 1742 for (connp = connfp->connf_head; connp != NULL; 1743 connp = connp->conn_next) { 1744 if ((IPCL_CONN_MATCH_V6(connp, protocol, 1745 ip6h->ip6_src, ip6h->ip6_dst, ports)) && 1746 (IPCL_ZONE_MATCH(connp, zoneid))) { 1747 break; 1748 } 1749 } 1750 1751 if (connp != NULL) { 1752 /* 1753 * We have a fully-bound TCP connection. 1754 * 1755 * For labeled systems, there's no need to check the 1756 * label here. It's known to be good as we checked 1757 * before allowing the connection to become bound. 1758 */ 1759 CONN_INC_REF(connp); 1760 mutex_exit(&connfp->connf_lock); 1761 return (connp); 1762 } 1763 1764 mutex_exit(&connfp->connf_lock); 1765 1766 lport = up[1]; 1767 unlabeled = B_FALSE; 1768 /* Cred can be null on IPv6 */ 1769 if (is_system_labeled()) { 1770 cred_t *cr = msg_getcred(mp, NULL); 1771 1772 unlabeled = (cr != NULL && 1773 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1774 } 1775 shared_addr = (zoneid == ALL_ZONES); 1776 if (shared_addr) { 1777 /* 1778 * No need to handle exclusive-stack zones since 1779 * ALL_ZONES only applies to the shared stack. 1780 */ 1781 zoneid = tsol_mlp_findzone(protocol, lport); 1782 /* 1783 * If no shared MLP is found, tsol_mlp_findzone returns 1784 * ALL_ZONES. In that case, we assume it's SLP, and 1785 * search for the zone based on the packet label. 1786 * 1787 * If there is such a zone, we prefer to find a 1788 * connection in it. Otherwise, we look for a 1789 * MAC-exempt connection in any zone whose label 1790 * dominates the default label on the packet. 
1791 */ 1792 if (zoneid == ALL_ZONES) 1793 zoneid = tsol_packet_to_zoneid(mp); 1794 else 1795 unlabeled = B_FALSE; 1796 } 1797 1798 bind_connfp = 1799 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1800 mutex_enter(&bind_connfp->connf_lock); 1801 for (connp = bind_connfp->connf_head; connp != NULL; 1802 connp = connp->conn_next) { 1803 if (IPCL_BIND_MATCH_V6(connp, protocol, 1804 ip6h->ip6_dst, lport) && 1805 (IPCL_ZONE_MATCH(connp, zoneid) || 1806 (unlabeled && connp->conn_mac_exempt))) 1807 break; 1808 } 1809 1810 if (connp != NULL && is_system_labeled() && 1811 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1812 shared_addr, connp)) { 1813 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1814 char *, "connp(1) could not receive mp(2)", 1815 conn_t *, connp, mblk_t *, mp); 1816 connp = NULL; 1817 } 1818 1819 if (connp != NULL) { 1820 /* Have a listner at least */ 1821 CONN_INC_REF(connp); 1822 mutex_exit(&bind_connfp->connf_lock); 1823 IPCL_DEBUG_LVL(512, 1824 ("ipcl_classify_v6: found listner " 1825 "connp = %p\n", (void *)connp)); 1826 1827 return (connp); 1828 } 1829 1830 mutex_exit(&bind_connfp->connf_lock); 1831 1832 IPCL_DEBUG_LVL(512, 1833 ("ipcl_classify_v6: couldn't classify mp = %p\n", 1834 (void *)mp)); 1835 break; 1836 1837 case IPPROTO_UDP: 1838 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1839 lport = up[1]; 1840 unlabeled = B_FALSE; 1841 /* Cred can be null on IPv6 */ 1842 if (is_system_labeled()) { 1843 cred_t *cr = msg_getcred(mp, NULL); 1844 1845 unlabeled = (cr != NULL && 1846 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1847 } 1848 shared_addr = (zoneid == ALL_ZONES); 1849 if (shared_addr) { 1850 /* 1851 * No need to handle exclusive-stack zones since 1852 * ALL_ZONES only applies to the shared stack. 1853 */ 1854 zoneid = tsol_mlp_findzone(protocol, lport); 1855 /* 1856 * If no shared MLP is found, tsol_mlp_findzone returns 1857 * ALL_ZONES. 
In that case, we assume it's SLP, and 1858 * search for the zone based on the packet label. 1859 * 1860 * If there is such a zone, we prefer to find a 1861 * connection in it. Otherwise, we look for a 1862 * MAC-exempt connection in any zone whose label 1863 * dominates the default label on the packet. 1864 */ 1865 if (zoneid == ALL_ZONES) 1866 zoneid = tsol_packet_to_zoneid(mp); 1867 else 1868 unlabeled = B_FALSE; 1869 } 1870 1871 fport = up[0]; 1872 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, 1873 fport)); 1874 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1875 mutex_enter(&connfp->connf_lock); 1876 for (connp = connfp->connf_head; connp != NULL; 1877 connp = connp->conn_next) { 1878 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1879 fport, ip6h->ip6_src) && 1880 (IPCL_ZONE_MATCH(connp, zoneid) || 1881 (unlabeled && connp->conn_mac_exempt))) 1882 break; 1883 } 1884 1885 if (connp != NULL && is_system_labeled() && 1886 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1887 shared_addr, connp)) { 1888 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1889 char *, "connp(1) could not receive mp(2)", 1890 conn_t *, connp, mblk_t *, mp); 1891 connp = NULL; 1892 } 1893 1894 if (connp != NULL) { 1895 CONN_INC_REF(connp); 1896 mutex_exit(&connfp->connf_lock); 1897 return (connp); 1898 } 1899 1900 /* 1901 * We shouldn't come here for multicast/broadcast packets 1902 */ 1903 mutex_exit(&connfp->connf_lock); 1904 IPCL_DEBUG_LVL(512, 1905 ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x", 1906 lport, fport)); 1907 break; 1908 } 1909 1910 return (NULL); 1911 } 1912 1913 /* 1914 * wrapper around ipcl_classify_(v4,v6) routines. 
1915 */ 1916 conn_t * 1917 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1918 { 1919 uint16_t hdr_len; 1920 ipha_t *ipha; 1921 uint8_t *nexthdrp; 1922 1923 if (MBLKL(mp) < sizeof (ipha_t)) 1924 return (NULL); 1925 1926 switch (IPH_HDR_VERSION(mp->b_rptr)) { 1927 case IPV4_VERSION: 1928 ipha = (ipha_t *)mp->b_rptr; 1929 hdr_len = IPH_HDR_LENGTH(ipha); 1930 return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, 1931 zoneid, ipst)); 1932 case IPV6_VERSION: 1933 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 1934 &hdr_len, &nexthdrp)) 1935 return (NULL); 1936 1937 return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); 1938 } 1939 1940 return (NULL); 1941 } 1942 1943 conn_t * 1944 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, 1945 uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) 1946 { 1947 connf_t *connfp; 1948 conn_t *connp; 1949 in_port_t lport; 1950 int af; 1951 boolean_t shared_addr; 1952 boolean_t unlabeled; 1953 const void *dst; 1954 1955 lport = ((uint16_t *)&ports)[1]; 1956 1957 unlabeled = B_FALSE; 1958 /* Cred can be null on IPv6 */ 1959 if (is_system_labeled()) { 1960 cred_t *cr = msg_getcred(mp, NULL); 1961 1962 unlabeled = (cr != NULL && 1963 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1964 } 1965 shared_addr = (zoneid == ALL_ZONES); 1966 if (shared_addr) { 1967 /* 1968 * No need to handle exclusive-stack zones since ALL_ZONES 1969 * only applies to the shared stack. 1970 */ 1971 zoneid = tsol_mlp_findzone(protocol, lport); 1972 /* 1973 * If no shared MLP is found, tsol_mlp_findzone returns 1974 * ALL_ZONES. In that case, we assume it's SLP, and search for 1975 * the zone based on the packet label. 1976 * 1977 * If there is such a zone, we prefer to find a connection in 1978 * it. Otherwise, we look for a MAC-exempt connection in any 1979 * zone whose label dominates the default label on the packet. 
1980 */ 1981 if (zoneid == ALL_ZONES) 1982 zoneid = tsol_packet_to_zoneid(mp); 1983 else 1984 unlabeled = B_FALSE; 1985 } 1986 1987 af = IPH_HDR_VERSION(hdr); 1988 dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : 1989 (const void *)&((ip6_t *)hdr)->ip6_dst; 1990 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1991 1992 mutex_enter(&connfp->connf_lock); 1993 for (connp = connfp->connf_head; connp != NULL; 1994 connp = connp->conn_next) { 1995 /* We don't allow v4 fallback for v6 raw socket. */ 1996 if (af == (connp->conn_af_isv6 ? IPV4_VERSION : 1997 IPV6_VERSION)) 1998 continue; 1999 if (connp->conn_fully_bound) { 2000 if (af == IPV4_VERSION) { 2001 if (!IPCL_CONN_MATCH(connp, protocol, 2002 hdr->ipha_src, hdr->ipha_dst, ports)) 2003 continue; 2004 } else { 2005 if (!IPCL_CONN_MATCH_V6(connp, protocol, 2006 ((ip6_t *)hdr)->ip6_src, 2007 ((ip6_t *)hdr)->ip6_dst, ports)) 2008 continue; 2009 } 2010 } else { 2011 if (af == IPV4_VERSION) { 2012 if (!IPCL_BIND_MATCH(connp, protocol, 2013 hdr->ipha_dst, lport)) 2014 continue; 2015 } else { 2016 if (!IPCL_BIND_MATCH_V6(connp, protocol, 2017 ((ip6_t *)hdr)->ip6_dst, lport)) 2018 continue; 2019 } 2020 } 2021 2022 if (IPCL_ZONE_MATCH(connp, zoneid) || 2023 (unlabeled && connp->conn_mac_exempt)) 2024 break; 2025 } 2026 /* 2027 * If the connection is fully-bound and connection-oriented (TCP or 2028 * SCTP), then we've already validated the remote system's label. 2029 * There's no need to do it again for every packet. 
2030 */ 2031 if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || 2032 !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && 2033 !tsol_receive_local(mp, dst, af, shared_addr, connp)) { 2034 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 2035 char *, "connp(1) could not receive mp(2)", 2036 conn_t *, connp, mblk_t *, mp); 2037 connp = NULL; 2038 } 2039 2040 if (connp != NULL) 2041 goto found; 2042 mutex_exit(&connfp->connf_lock); 2043 2044 /* Try to look for a wildcard match. */ 2045 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 2046 mutex_enter(&connfp->connf_lock); 2047 for (connp = connfp->connf_head; connp != NULL; 2048 connp = connp->conn_next) { 2049 /* We don't allow v4 fallback for v6 raw socket. */ 2050 if ((af == (connp->conn_af_isv6 ? IPV4_VERSION : 2051 IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { 2052 continue; 2053 } 2054 if (af == IPV4_VERSION) { 2055 if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) 2056 break; 2057 } else { 2058 if (IPCL_RAW_MATCH_V6(connp, protocol, 2059 ((ip6_t *)hdr)->ip6_dst)) { 2060 break; 2061 } 2062 } 2063 } 2064 2065 if (connp != NULL) 2066 goto found; 2067 2068 mutex_exit(&connfp->connf_lock); 2069 return (NULL); 2070 2071 found: 2072 ASSERT(connp != NULL); 2073 CONN_INC_REF(connp); 2074 mutex_exit(&connfp->connf_lock); 2075 return (connp); 2076 } 2077 2078 /* ARGSUSED */ 2079 static int 2080 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2081 { 2082 itc_t *itc = (itc_t *)buf; 2083 conn_t *connp = &itc->itc_conn; 2084 tcp_t *tcp = (tcp_t *)&itc[1]; 2085 2086 bzero(connp, sizeof (conn_t)); 2087 bzero(tcp, sizeof (tcp_t)); 2088 2089 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2090 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2091 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 2092 tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); 2093 connp->conn_tcp = tcp; 2094 connp->conn_flags = IPCL_TCPCONN; 2095 connp->conn_ulp = IPPROTO_TCP; 2096 
tcp->tcp_connp = connp; 2097 return (0); 2098 } 2099 2100 /* ARGSUSED */ 2101 static void 2102 tcp_conn_destructor(void *buf, void *cdrarg) 2103 { 2104 itc_t *itc = (itc_t *)buf; 2105 conn_t *connp = &itc->itc_conn; 2106 tcp_t *tcp = (tcp_t *)&itc[1]; 2107 2108 ASSERT(connp->conn_flags & IPCL_TCPCONN); 2109 ASSERT(tcp->tcp_connp == connp); 2110 ASSERT(connp->conn_tcp == tcp); 2111 tcp_timermp_free(tcp); 2112 mutex_destroy(&connp->conn_lock); 2113 cv_destroy(&connp->conn_cv); 2114 cv_destroy(&connp->conn_sq_cv); 2115 } 2116 2117 /* ARGSUSED */ 2118 static int 2119 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2120 { 2121 itc_t *itc = (itc_t *)buf; 2122 conn_t *connp = &itc->itc_conn; 2123 2124 bzero(connp, sizeof (conn_t)); 2125 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2126 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2127 connp->conn_flags = IPCL_IPCCONN; 2128 2129 return (0); 2130 } 2131 2132 /* ARGSUSED */ 2133 static void 2134 ip_conn_destructor(void *buf, void *cdrarg) 2135 { 2136 itc_t *itc = (itc_t *)buf; 2137 conn_t *connp = &itc->itc_conn; 2138 2139 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2140 ASSERT(connp->conn_priv == NULL); 2141 mutex_destroy(&connp->conn_lock); 2142 cv_destroy(&connp->conn_cv); 2143 } 2144 2145 /* ARGSUSED */ 2146 static int 2147 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2148 { 2149 itc_t *itc = (itc_t *)buf; 2150 conn_t *connp = &itc->itc_conn; 2151 udp_t *udp = (udp_t *)&itc[1]; 2152 2153 bzero(connp, sizeof (conn_t)); 2154 bzero(udp, sizeof (udp_t)); 2155 2156 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2157 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2158 connp->conn_udp = udp; 2159 connp->conn_flags = IPCL_UDPCONN; 2160 connp->conn_ulp = IPPROTO_UDP; 2161 udp->udp_connp = connp; 2162 return (0); 2163 } 2164 2165 /* ARGSUSED */ 2166 static void 2167 udp_conn_destructor(void *buf, void *cdrarg) 2168 { 2169 itc_t *itc = (itc_t *)buf; 2170 conn_t *connp = 
&itc->itc_conn; 2171 udp_t *udp = (udp_t *)&itc[1]; 2172 2173 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2174 ASSERT(udp->udp_connp == connp); 2175 ASSERT(connp->conn_udp == udp); 2176 mutex_destroy(&connp->conn_lock); 2177 cv_destroy(&connp->conn_cv); 2178 } 2179 2180 /* ARGSUSED */ 2181 static int 2182 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2183 { 2184 itc_t *itc = (itc_t *)buf; 2185 conn_t *connp = &itc->itc_conn; 2186 icmp_t *icmp = (icmp_t *)&itc[1]; 2187 2188 bzero(connp, sizeof (conn_t)); 2189 bzero(icmp, sizeof (icmp_t)); 2190 2191 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2192 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2193 connp->conn_icmp = icmp; 2194 connp->conn_flags = IPCL_RAWIPCONN; 2195 connp->conn_ulp = IPPROTO_ICMP; 2196 icmp->icmp_connp = connp; 2197 return (0); 2198 } 2199 2200 /* ARGSUSED */ 2201 static void 2202 rawip_conn_destructor(void *buf, void *cdrarg) 2203 { 2204 itc_t *itc = (itc_t *)buf; 2205 conn_t *connp = &itc->itc_conn; 2206 icmp_t *icmp = (icmp_t *)&itc[1]; 2207 2208 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2209 ASSERT(icmp->icmp_connp == connp); 2210 ASSERT(connp->conn_icmp == icmp); 2211 mutex_destroy(&connp->conn_lock); 2212 cv_destroy(&connp->conn_cv); 2213 } 2214 2215 /* ARGSUSED */ 2216 static int 2217 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2218 { 2219 itc_t *itc = (itc_t *)buf; 2220 conn_t *connp = &itc->itc_conn; 2221 rts_t *rts = (rts_t *)&itc[1]; 2222 2223 bzero(connp, sizeof (conn_t)); 2224 bzero(rts, sizeof (rts_t)); 2225 2226 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2227 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2228 connp->conn_rts = rts; 2229 connp->conn_flags = IPCL_RTSCONN; 2230 rts->rts_connp = connp; 2231 return (0); 2232 } 2233 2234 /* ARGSUSED */ 2235 static void 2236 rts_conn_destructor(void *buf, void *cdrarg) 2237 { 2238 itc_t *itc = (itc_t *)buf; 2239 conn_t *connp = &itc->itc_conn; 2240 rts_t *rts = (rts_t 
*)&itc[1]; 2241 2242 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2243 ASSERT(rts->rts_connp == connp); 2244 ASSERT(connp->conn_rts == rts); 2245 mutex_destroy(&connp->conn_lock); 2246 cv_destroy(&connp->conn_cv); 2247 } 2248 2249 /* ARGSUSED */ 2250 int 2251 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) 2252 { 2253 int error; 2254 netstack_t *ns; 2255 int ret; 2256 tcp_stack_t *tcps; 2257 ip_helper_stream_info_t *ip_helper_str; 2258 ip_stack_t *ipst; 2259 2260 ns = netstack_find_by_cred(kcred); 2261 ASSERT(ns != NULL); 2262 tcps = ns->netstack_tcp; 2263 ipst = ns->netstack_ip; 2264 ASSERT(tcps != NULL); 2265 ip_helper_str = (ip_helper_stream_info_t *)buf; 2266 2267 do { 2268 error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, 2269 &ip_helper_str->iphs_handle, ipst->ips_ldi_ident); 2270 } while (error == EINTR); 2271 2272 if (error == 0) { 2273 do { 2274 error = ldi_ioctl( 2275 ip_helper_str->iphs_handle, SIOCSQPTR, 2276 (intptr_t)buf, FKIOCTL, kcred, &ret); 2277 } while (error == EINTR); 2278 2279 if (error != 0) { 2280 (void) ldi_close( 2281 ip_helper_str->iphs_handle, 0, kcred); 2282 } 2283 } 2284 2285 netstack_rele(ipst->ips_netstack); 2286 2287 return (error); 2288 } 2289 2290 /* ARGSUSED */ 2291 static void 2292 ip_helper_stream_destructor(void *buf, void *cdrarg) 2293 { 2294 ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; 2295 2296 ip_helper_str->iphs_rq->q_ptr = 2297 ip_helper_str->iphs_wq->q_ptr = 2298 ip_helper_str->iphs_minfo; 2299 (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred); 2300 } 2301 2302 2303 /* 2304 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2305 * in the conn_t. 
2306 */ 2307 void 2308 ipcl_conn_cleanup(conn_t *connp) 2309 { 2310 ASSERT(connp->conn_ire_cache == NULL); 2311 ASSERT(connp->conn_latch == NULL); 2312 #ifdef notdef 2313 ASSERT(connp->conn_rq == NULL); 2314 ASSERT(connp->conn_wq == NULL); 2315 #endif 2316 ASSERT(connp->conn_cred == NULL); 2317 ASSERT(connp->conn_g_fanout == NULL); 2318 ASSERT(connp->conn_g_next == NULL); 2319 ASSERT(connp->conn_g_prev == NULL); 2320 ASSERT(connp->conn_policy == NULL); 2321 ASSERT(connp->conn_fanout == NULL); 2322 ASSERT(connp->conn_next == NULL); 2323 ASSERT(connp->conn_prev == NULL); 2324 #ifdef notdef 2325 /* 2326 * The ill and ipif pointers are not cleared before the conn_t 2327 * goes away since they do not hold a reference on the ill/ipif. 2328 * We should replace these pointers with ifindex/ipaddr_t to 2329 * make the code less complex. 2330 */ 2331 ASSERT(connp->conn_outgoing_ill == NULL); 2332 ASSERT(connp->conn_incoming_ill == NULL); 2333 ASSERT(connp->conn_multicast_ipif == NULL); 2334 ASSERT(connp->conn_multicast_ill == NULL); 2335 #endif 2336 ASSERT(connp->conn_oper_pending_ill == NULL); 2337 ASSERT(connp->conn_ilg == NULL); 2338 ASSERT(connp->conn_drain_next == NULL); 2339 ASSERT(connp->conn_drain_prev == NULL); 2340 #ifdef notdef 2341 /* conn_idl is not cleared when removed from idl list */ 2342 ASSERT(connp->conn_idl == NULL); 2343 #endif 2344 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2345 ASSERT(connp->conn_peercred == NULL); 2346 ASSERT(connp->conn_netstack == NULL); 2347 2348 ASSERT(connp->conn_helper_info == NULL); 2349 /* Clear out the conn_t fields that are not preserved */ 2350 bzero(&connp->conn_start_clr, 2351 sizeof (conn_t) - 2352 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2353 } 2354 2355 /* 2356 * All conns are inserted in a global multi-list for the benefit of 2357 * walkers. The walk is guaranteed to walk all open conns at the time 2358 * of the start of the walk exactly once. 
This property is needed to 2359 * achieve some cleanups during unplumb of interfaces. This is achieved 2360 * as follows. 2361 * 2362 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2363 * call the insert and delete functions below at creation and deletion 2364 * time respectively. The conn never moves or changes its position in this 2365 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2366 * won't increase due to walkers, once the conn deletion has started. Note 2367 * that we can't remove the conn from the global list and then wait for 2368 * the refcnt to drop to zero, since walkers would then see a truncated 2369 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2370 * conns until ip_open is ready to make them globally visible. 2371 * The global round robin multi-list locks are held only to get the 2372 * next member/insertion/deletion and contention should be negligible 2373 * if the multi-list is much greater than the number of cpus. 2374 */ 2375 void 2376 ipcl_globalhash_insert(conn_t *connp) 2377 { 2378 int index; 2379 struct connf_s *connfp; 2380 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2381 2382 /* 2383 * No need for atomic here. Approximate even distribution 2384 * in the global lists is sufficient. 2385 */ 2386 ipst->ips_conn_g_index++; 2387 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2388 2389 connp->conn_g_prev = NULL; 2390 /* 2391 * Mark as INCIPIENT, so that walkers will ignore this 2392 * for now, till ip_open is ready to make it visible globally. 
2393 */ 2394 connp->conn_state_flags |= CONN_INCIPIENT; 2395 2396 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2397 /* Insert at the head of the list */ 2398 mutex_enter(&connfp->connf_lock); 2399 connp->conn_g_next = connfp->connf_head; 2400 if (connp->conn_g_next != NULL) 2401 connp->conn_g_next->conn_g_prev = connp; 2402 connfp->connf_head = connp; 2403 2404 /* The fanout bucket this conn points to */ 2405 connp->conn_g_fanout = connfp; 2406 2407 mutex_exit(&connfp->connf_lock); 2408 } 2409 2410 void 2411 ipcl_globalhash_remove(conn_t *connp) 2412 { 2413 struct connf_s *connfp; 2414 2415 /* 2416 * We were never inserted in the global multi list. 2417 * IPCL_NONE variety is never inserted in the global multilist 2418 * since it is presumed to not need any cleanup and is transient. 2419 */ 2420 if (connp->conn_g_fanout == NULL) 2421 return; 2422 2423 connfp = connp->conn_g_fanout; 2424 mutex_enter(&connfp->connf_lock); 2425 if (connp->conn_g_prev != NULL) 2426 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2427 else 2428 connfp->connf_head = connp->conn_g_next; 2429 if (connp->conn_g_next != NULL) 2430 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2431 mutex_exit(&connfp->connf_lock); 2432 2433 /* Better to stumble on a null pointer than to corrupt memory */ 2434 connp->conn_g_next = NULL; 2435 connp->conn_g_prev = NULL; 2436 connp->conn_g_fanout = NULL; 2437 } 2438 2439 /* 2440 * Walk the list of all conn_t's in the system, calling the function provided 2441 * with the specified argument for each. 2442 * Applies to both IPv4 and IPv6. 2443 * 2444 * IPCs may hold pointers to ipif/ill. To guard against stale pointers 2445 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2446 * unplumbed or removed. New conn_t's that are created while we are walking 2447 * may be missed by this walk, because they are not necessarily inserted 2448 * at the tail of the list. 
They are new conn_t's and thus don't have any 2449 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2450 * is created to the struct that is going away. 2451 */ 2452 void 2453 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2454 { 2455 int i; 2456 conn_t *connp; 2457 conn_t *prev_connp; 2458 2459 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2460 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2461 prev_connp = NULL; 2462 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2463 while (connp != NULL) { 2464 mutex_enter(&connp->conn_lock); 2465 if (connp->conn_state_flags & 2466 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2467 mutex_exit(&connp->conn_lock); 2468 connp = connp->conn_g_next; 2469 continue; 2470 } 2471 CONN_INC_REF_LOCKED(connp); 2472 mutex_exit(&connp->conn_lock); 2473 mutex_exit( 2474 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2475 (*func)(connp, arg); 2476 if (prev_connp != NULL) 2477 CONN_DEC_REF(prev_connp); 2478 mutex_enter( 2479 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2480 prev_connp = connp; 2481 connp = connp->conn_g_next; 2482 } 2483 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2484 if (prev_connp != NULL) 2485 CONN_DEC_REF(prev_connp); 2486 } 2487 } 2488 2489 /* 2490 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2491 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2492 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2493 * (peer tcp in ESTABLISHED state). 2494 */ 2495 conn_t * 2496 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, 2497 ip_stack_t *ipst) 2498 { 2499 uint32_t ports; 2500 uint16_t *pports = (uint16_t *)&ports; 2501 connf_t *connfp; 2502 conn_t *tconnp; 2503 boolean_t zone_chk; 2504 2505 /* 2506 * If either the source of destination address is loopback, then 2507 * both endpoints must be in the same Zone. 
Otherwise, both of 2508 * the addresses are system-wide unique (tcp is in ESTABLISHED 2509 * state) and the endpoints may reside in different Zones. 2510 */ 2511 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2512 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2513 2514 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2515 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2516 2517 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2518 ports, ipst)]; 2519 2520 mutex_enter(&connfp->connf_lock); 2521 for (tconnp = connfp->connf_head; tconnp != NULL; 2522 tconnp = tconnp->conn_next) { 2523 2524 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2525 ipha->ipha_dst, ipha->ipha_src, ports) && 2526 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2527 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2528 2529 ASSERT(tconnp != connp); 2530 CONN_INC_REF(tconnp); 2531 mutex_exit(&connfp->connf_lock); 2532 return (tconnp); 2533 } 2534 } 2535 mutex_exit(&connfp->connf_lock); 2536 return (NULL); 2537 } 2538 2539 /* 2540 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2541 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2542 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2543 * (peer tcp in ESTABLISHED state). 2544 */ 2545 conn_t * 2546 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, 2547 ip_stack_t *ipst) 2548 { 2549 uint32_t ports; 2550 uint16_t *pports = (uint16_t *)&ports; 2551 connf_t *connfp; 2552 conn_t *tconnp; 2553 boolean_t zone_chk; 2554 2555 /* 2556 * If either the source of destination address is loopback, then 2557 * both endpoints must be in the same Zone. Otherwise, both of 2558 * the addresses are system-wide unique (tcp is in ESTABLISHED 2559 * state) and the endpoints may reside in different Zones. 
We 2560 * don't do Zone check for link local address(es) because the 2561 * current Zone implementation treats each link local address as 2562 * being unique per system node, i.e. they belong to global Zone. 2563 */ 2564 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2565 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2566 2567 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2568 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2569 2570 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2571 ports, ipst)]; 2572 2573 mutex_enter(&connfp->connf_lock); 2574 for (tconnp = connfp->connf_head; tconnp != NULL; 2575 tconnp = tconnp->conn_next) { 2576 2577 /* We skip tcp_bound_if check here as this is loopback tcp */ 2578 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2579 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2580 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2581 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2582 2583 ASSERT(tconnp != connp); 2584 CONN_INC_REF(tconnp); 2585 mutex_exit(&connfp->connf_lock); 2586 return (tconnp); 2587 } 2588 } 2589 mutex_exit(&connfp->connf_lock); 2590 return (NULL); 2591 } 2592 2593 /* 2594 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2595 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2596 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2597 */ 2598 conn_t * 2599 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, 2600 ip_stack_t *ipst) 2601 { 2602 uint32_t ports; 2603 uint16_t *pports; 2604 connf_t *connfp; 2605 conn_t *tconnp; 2606 2607 pports = (uint16_t *)&ports; 2608 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2609 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2610 2611 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2612 ports, ipst)]; 2613 2614 mutex_enter(&connfp->connf_lock); 2615 for (tconnp = connfp->connf_head; tconnp != NULL; 2616 tconnp = tconnp->conn_next) { 2617 2618 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2619 ipha->ipha_dst, ipha->ipha_src, ports) && 2620 tconnp->conn_tcp->tcp_state >= min_state) { 2621 2622 CONN_INC_REF(tconnp); 2623 mutex_exit(&connfp->connf_lock); 2624 return (tconnp); 2625 } 2626 } 2627 mutex_exit(&connfp->connf_lock); 2628 return (NULL); 2629 } 2630 2631 /* 2632 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2633 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2634 * Only checks for connected entries i.e. no INADDR_ANY checks. 2635 * Match on ifindex in addition to addresses. 
2636 */ 2637 conn_t * 2638 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2639 uint_t ifindex, ip_stack_t *ipst) 2640 { 2641 tcp_t *tcp; 2642 uint32_t ports; 2643 uint16_t *pports; 2644 connf_t *connfp; 2645 conn_t *tconnp; 2646 2647 pports = (uint16_t *)&ports; 2648 pports[0] = tcpha->tha_fport; 2649 pports[1] = tcpha->tha_lport; 2650 2651 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2652 ports, ipst)]; 2653 2654 mutex_enter(&connfp->connf_lock); 2655 for (tconnp = connfp->connf_head; tconnp != NULL; 2656 tconnp = tconnp->conn_next) { 2657 2658 tcp = tconnp->conn_tcp; 2659 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2660 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2661 tcp->tcp_state >= min_state && 2662 (tcp->tcp_bound_if == 0 || 2663 tcp->tcp_bound_if == ifindex)) { 2664 2665 CONN_INC_REF(tconnp); 2666 mutex_exit(&connfp->connf_lock); 2667 return (tconnp); 2668 } 2669 } 2670 mutex_exit(&connfp->connf_lock); 2671 return (NULL); 2672 } 2673 2674 /* 2675 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2676 * a listener when changing state. 2677 */ 2678 conn_t * 2679 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2680 ip_stack_t *ipst) 2681 { 2682 connf_t *bind_connfp; 2683 conn_t *connp; 2684 tcp_t *tcp; 2685 2686 /* 2687 * Avoid false matches for packets sent to an IP destination of 2688 * all zeros. 
2689 */ 2690 if (laddr == 0) 2691 return (NULL); 2692 2693 ASSERT(zoneid != ALL_ZONES); 2694 2695 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2696 mutex_enter(&bind_connfp->connf_lock); 2697 for (connp = bind_connfp->connf_head; connp != NULL; 2698 connp = connp->conn_next) { 2699 tcp = connp->conn_tcp; 2700 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2701 IPCL_ZONE_MATCH(connp, zoneid) && 2702 (tcp->tcp_listener == NULL)) { 2703 CONN_INC_REF(connp); 2704 mutex_exit(&bind_connfp->connf_lock); 2705 return (connp); 2706 } 2707 } 2708 mutex_exit(&bind_connfp->connf_lock); 2709 return (NULL); 2710 } 2711 2712 /* 2713 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2714 * a listener when changing state. 2715 */ 2716 conn_t * 2717 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2718 zoneid_t zoneid, ip_stack_t *ipst) 2719 { 2720 connf_t *bind_connfp; 2721 conn_t *connp = NULL; 2722 tcp_t *tcp; 2723 2724 /* 2725 * Avoid false matches for packets sent to an IP destination of 2726 * all zeros. 2727 */ 2728 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2729 return (NULL); 2730 2731 ASSERT(zoneid != ALL_ZONES); 2732 2733 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2734 mutex_enter(&bind_connfp->connf_lock); 2735 for (connp = bind_connfp->connf_head; connp != NULL; 2736 connp = connp->conn_next) { 2737 tcp = connp->conn_tcp; 2738 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2739 IPCL_ZONE_MATCH(connp, zoneid) && 2740 (tcp->tcp_bound_if == 0 || 2741 tcp->tcp_bound_if == ifindex) && 2742 tcp->tcp_listener == NULL) { 2743 CONN_INC_REF(connp); 2744 mutex_exit(&bind_connfp->connf_lock); 2745 return (connp); 2746 } 2747 } 2748 mutex_exit(&bind_connfp->connf_lock); 2749 return (NULL); 2750 } 2751 2752 /* 2753 * ipcl_get_next_conn 2754 * get the next entry in the conn global list 2755 * and put a reference on the next_conn. 
2756 * decrement the reference on the current conn. 2757 * 2758 * This is an iterator based walker function that also provides for 2759 * some selection by the caller. It walks through the conn_hash bucket 2760 * searching for the next valid connp in the list, and selects connections 2761 * that are neither closed nor condemned. It also REFHOLDS the conn 2762 * thus ensuring that the conn exists when the caller uses the conn. 2763 */ 2764 conn_t * 2765 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2766 { 2767 conn_t *next_connp; 2768 2769 if (connfp == NULL) 2770 return (NULL); 2771 2772 mutex_enter(&connfp->connf_lock); 2773 2774 next_connp = (connp == NULL) ? 2775 connfp->connf_head : connp->conn_g_next; 2776 2777 while (next_connp != NULL) { 2778 mutex_enter(&next_connp->conn_lock); 2779 if (!(next_connp->conn_flags & conn_flags) || 2780 (next_connp->conn_state_flags & 2781 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2782 /* 2783 * This conn has been condemned or 2784 * is closing, or the flags don't match 2785 */ 2786 mutex_exit(&next_connp->conn_lock); 2787 next_connp = next_connp->conn_g_next; 2788 continue; 2789 } 2790 CONN_INC_REF_LOCKED(next_connp); 2791 mutex_exit(&next_connp->conn_lock); 2792 break; 2793 } 2794 2795 mutex_exit(&connfp->connf_lock); 2796 2797 if (connp != NULL) 2798 CONN_DEC_REF(connp); 2799 2800 return (next_connp); 2801 } 2802 2803 #ifdef CONN_DEBUG 2804 /* 2805 * Trace of the last NBUF refhold/refrele 2806 */ 2807 int 2808 conn_trace_ref(conn_t *connp) 2809 { 2810 int last; 2811 conn_trace_t *ctb; 2812 2813 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2814 last = connp->conn_trace_last; 2815 last++; 2816 if (last == CONN_TRACE_MAX) 2817 last = 0; 2818 2819 ctb = &connp->conn_trace_buf[last]; 2820 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2821 connp->conn_trace_last = last; 2822 return (1); 2823 } 2824 2825 int 2826 conn_untrace_ref(conn_t *connp) 2827 { 2828 int last; 2829 conn_trace_t *ctb; 2830 2831 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2832 last = connp->conn_trace_last; 2833 last++; 2834 if (last == CONN_TRACE_MAX) 2835 last = 0; 2836 2837 ctb = &connp->conn_trace_buf[last]; 2838 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2839 connp->conn_trace_last = last; 2840 return (1); 2841 } 2842 #endif 2843