1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
 * A reference is incremented when connection
 * is placed in a list or table, when incoming packet for the connection arrives
 * and when connection is processed via squeue (squeue processing may be
 * asynchronous and the reference protects the connection from being destroyed
 * before its processing is finished).
 *
 * send and receive functions are currently used for TCP only. The send function
 * determines the IP entry point for the packet once it leaves TCP to be sent to
 * the destination address. The receive function is used by IP when the packet
 * should be passed for TCP processing. When a new connection is created these
 * are set to ip_output() and tcp_input() respectively. During the lifetime of
 * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
 * tcp_conn_request(). This allows incoming SYNs to go directly into the
 * listener SYN processing function without going to tcp_input() first.
 *
 * Classifier uses several hash tables:
 *
 *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
 *	ipcl_bind_fanout:	contains all connections in BOUND state
 *	ipcl_proto_fanout:	IPv4 protocol fanout
 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
 *	ipcl_udp_fanout:	contains all UDP connections
 *	ipcl_globalhash_fanout:	contains all connections
 *
 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
 * which need to view all existing connections.
 *
 * All tables are protected by per-bucket locks. When both per-bucket lock and
 * connection lock need to be held, the per-bucket lock should be acquired
 * first, followed by the connection lock.
 *
 * All functions doing search in one of these tables increment a reference
 * counter on the connection found (if any).
 * This reference should be dropped
 * when the caller has finished processing the connection.
 *
 *
 * INTERFACES:
 * ===========
 *
 * Connection Lookup:
 * ------------------
 *
 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
 *
 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
 * it can't find any associated connection. If the connection is found, its
 * reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
 *		 the packet.
 *
 *	zoneid: The zone in which the returned connection must be; the zoneid
 *		corresponding to the ire_zoneid on the IRE located for the
 *		packet's destination address.
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment. This is used only in a
 * labeled environment. The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone. Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked. For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
219 * 220 * Connection Creation/Destruction 221 * ------------------------------- 222 * 223 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 224 * 225 * Creates a new conn based on the type flag, inserts it into 226 * globalhash table. 227 * 228 * type: This flag determines the type of conn_t which needs to be 229 * created i.e., which kmem_cache it comes from. 230 * IPCL_TCPCONN indicates a TCP connection 231 * IPCL_SCTPCONN indicates a SCTP connection 232 * IPCL_UDPCONN indicates a UDP conn_t. 233 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 234 * IPCL_RTSCONN indicates a RTS conn_t. 235 * IPCL_IPCCONN indicates all other connections. 236 * 237 * void ipcl_conn_destroy(connp) 238 * 239 * Destroys the connection state, removes it from the global 240 * connection hash table and frees its memory. 241 */ 242 243 #include <sys/types.h> 244 #include <sys/stream.h> 245 #include <sys/stropts.h> 246 #include <sys/sysmacros.h> 247 #include <sys/strsubr.h> 248 #include <sys/strsun.h> 249 #define _SUN_TPI_VERSION 2 250 #include <sys/ddi.h> 251 #include <sys/cmn_err.h> 252 #include <sys/debug.h> 253 254 #include <sys/systm.h> 255 #include <sys/param.h> 256 #include <sys/kmem.h> 257 #include <sys/isa_defs.h> 258 #include <inet/common.h> 259 #include <netinet/ip6.h> 260 #include <netinet/icmp6.h> 261 262 #include <inet/ip.h> 263 #include <inet/ip6.h> 264 #include <inet/ip_ndp.h> 265 #include <inet/ip_impl.h> 266 #include <inet/udp_impl.h> 267 #include <inet/sctp_ip.h> 268 #include <inet/sctp/sctp_impl.h> 269 #include <inet/rawip_impl.h> 270 #include <inet/rts_impl.h> 271 272 #include <sys/cpuvar.h> 273 274 #include <inet/ipclassifier.h> 275 #include <inet/tcp.h> 276 #include <inet/ipsec_impl.h> 277 278 #include <sys/tsol/tnet.h> 279 #include <sys/sockio.h> 280 281 #ifdef DEBUG 282 #define IPCL_DEBUG 283 #else 284 #undef IPCL_DEBUG 285 #endif 286 287 #ifdef IPCL_DEBUG 288 int ipcl_debug_level = 0; 289 #define IPCL_DEBUG_LVL(level, args) \ 290 if (ipcl_debug_level & 
level) { printf args; } 291 #else 292 #define IPCL_DEBUG_LVL(level, args) {; } 293 #endif 294 /* Old value for compatibility. Setable in /etc/system */ 295 uint_t tcp_conn_hash_size = 0; 296 297 /* New value. Zero means choose automatically. Setable in /etc/system */ 298 uint_t ipcl_conn_hash_size = 0; 299 uint_t ipcl_conn_hash_memfactor = 8192; 300 uint_t ipcl_conn_hash_maxsize = 82500; 301 302 /* bind/udp fanout table size */ 303 uint_t ipcl_bind_fanout_size = 512; 304 uint_t ipcl_udp_fanout_size = 16384; 305 306 /* Raw socket fanout size. Must be a power of 2. */ 307 uint_t ipcl_raw_fanout_size = 256; 308 309 /* 310 * Power of 2^N Primes useful for hashing for N of 0-28, 311 * these primes are the nearest prime <= 2^N - 2^(N-2). 312 */ 313 314 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 315 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 316 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 317 50331599, 100663291, 201326557, 0} 318 319 /* 320 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 321 * are aligned on cache lines. 
322 */ 323 typedef union itc_s { 324 conn_t itc_conn; 325 char itcu_filler[CACHE_ALIGN(conn_s)]; 326 } itc_t; 327 328 struct kmem_cache *tcp_conn_cache; 329 struct kmem_cache *ip_conn_cache; 330 struct kmem_cache *ip_helper_stream_cache; 331 extern struct kmem_cache *sctp_conn_cache; 332 extern struct kmem_cache *tcp_sack_info_cache; 333 extern struct kmem_cache *tcp_iphc_cache; 334 struct kmem_cache *udp_conn_cache; 335 struct kmem_cache *rawip_conn_cache; 336 struct kmem_cache *rts_conn_cache; 337 338 extern void tcp_timermp_free(tcp_t *); 339 extern mblk_t *tcp_timermp_alloc(int); 340 341 static int ip_conn_constructor(void *, void *, int); 342 static void ip_conn_destructor(void *, void *); 343 344 static int tcp_conn_constructor(void *, void *, int); 345 static void tcp_conn_destructor(void *, void *); 346 347 static int udp_conn_constructor(void *, void *, int); 348 static void udp_conn_destructor(void *, void *); 349 350 static int rawip_conn_constructor(void *, void *, int); 351 static void rawip_conn_destructor(void *, void *); 352 353 static int rts_conn_constructor(void *, void *, int); 354 static void rts_conn_destructor(void *, void *); 355 356 static int ip_helper_stream_constructor(void *, void *, int); 357 static void ip_helper_stream_destructor(void *, void *); 358 359 boolean_t ip_use_helper_cache = B_TRUE; 360 361 /* 362 * Hook functions to enable cluster networking 363 * On non-clustered systems these vectors must always be NULL. 
364 */ 365 extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, 366 uint8_t *, in_port_t, void *); 367 extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, 368 uint8_t *, in_port_t, void *); 369 370 #ifdef IPCL_DEBUG 371 #define INET_NTOA_BUFSIZE 18 372 373 static char * 374 inet_ntoa_r(uint32_t in, char *b) 375 { 376 unsigned char *p; 377 378 p = (unsigned char *)∈ 379 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 380 return (b); 381 } 382 #endif 383 384 /* 385 * Global (for all stack instances) init routine 386 */ 387 void 388 ipcl_g_init(void) 389 { 390 ip_conn_cache = kmem_cache_create("ip_conn_cache", 391 sizeof (conn_t), CACHE_ALIGN_SIZE, 392 ip_conn_constructor, ip_conn_destructor, 393 NULL, NULL, NULL, 0); 394 395 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 396 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 397 tcp_conn_constructor, tcp_conn_destructor, 398 NULL, NULL, NULL, 0); 399 400 udp_conn_cache = kmem_cache_create("udp_conn_cache", 401 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 402 udp_conn_constructor, udp_conn_destructor, 403 NULL, NULL, NULL, 0); 404 405 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 406 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 407 rawip_conn_constructor, rawip_conn_destructor, 408 NULL, NULL, NULL, 0); 409 410 rts_conn_cache = kmem_cache_create("rts_conn_cache", 411 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 412 rts_conn_constructor, rts_conn_destructor, 413 NULL, NULL, NULL, 0); 414 415 if (ip_use_helper_cache) { 416 ip_helper_stream_cache = kmem_cache_create 417 ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), 418 CACHE_ALIGN_SIZE, ip_helper_stream_constructor, 419 ip_helper_stream_destructor, NULL, NULL, NULL, 0); 420 } else { 421 ip_helper_stream_cache = NULL; 422 } 423 } 424 425 /* 426 * ipclassifier intialization routine, sets up hash tables. 
427 */ 428 void 429 ipcl_init(ip_stack_t *ipst) 430 { 431 int i; 432 int sizes[] = P2Ps(); 433 434 /* 435 * Calculate size of conn fanout table from /etc/system settings 436 */ 437 if (ipcl_conn_hash_size != 0) { 438 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 439 } else if (tcp_conn_hash_size != 0) { 440 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 441 } else { 442 extern pgcnt_t freemem; 443 444 ipst->ips_ipcl_conn_fanout_size = 445 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 446 447 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 448 ipst->ips_ipcl_conn_fanout_size = 449 ipcl_conn_hash_maxsize; 450 } 451 } 452 453 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 454 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 455 break; 456 } 457 } 458 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 459 /* Out of range, use the 2^16 value */ 460 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 461 } 462 463 /* Take values from /etc/system */ 464 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 465 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 466 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 467 468 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 469 470 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 471 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 472 473 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 474 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 479 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 480 481 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 482 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 483 MUTEX_DEFAULT, NULL); 484 } 485 486 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 487 sizeof (connf_t), KM_SLEEP); 488 for (i = 0; i < IPPROTO_MAX; i++) { 489 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 490 
MUTEX_DEFAULT, NULL); 491 } 492 493 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 494 sizeof (connf_t), KM_SLEEP); 495 for (i = 0; i < IPPROTO_MAX; i++) { 496 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 497 MUTEX_DEFAULT, NULL); 498 } 499 500 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 501 mutex_init(&ipst->ips_rts_clients->connf_lock, 502 NULL, MUTEX_DEFAULT, NULL); 503 504 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 505 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 506 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 507 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 508 MUTEX_DEFAULT, NULL); 509 } 510 511 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 512 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 513 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 514 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 515 MUTEX_DEFAULT, NULL); 516 } 517 518 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 519 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 520 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 521 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 522 NULL, MUTEX_DEFAULT, NULL); 523 } 524 } 525 526 void 527 ipcl_g_destroy(void) 528 { 529 kmem_cache_destroy(ip_conn_cache); 530 kmem_cache_destroy(tcp_conn_cache); 531 kmem_cache_destroy(udp_conn_cache); 532 kmem_cache_destroy(rawip_conn_cache); 533 kmem_cache_destroy(rts_conn_cache); 534 } 535 536 /* 537 * All user-level and kernel use of the stack must be gone 538 * by now. 
539 */ 540 void 541 ipcl_destroy(ip_stack_t *ipst) 542 { 543 int i; 544 545 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 546 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 547 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 548 } 549 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 550 sizeof (connf_t)); 551 ipst->ips_ipcl_conn_fanout = NULL; 552 553 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 554 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 555 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 556 } 557 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 558 sizeof (connf_t)); 559 ipst->ips_ipcl_bind_fanout = NULL; 560 561 for (i = 0; i < IPPROTO_MAX; i++) { 562 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 563 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 564 } 565 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 566 ipst->ips_ipcl_proto_fanout = NULL; 567 568 for (i = 0; i < IPPROTO_MAX; i++) { 569 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 570 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 571 } 572 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 573 IPPROTO_MAX * sizeof (connf_t)); 574 ipst->ips_ipcl_proto_fanout_v6 = NULL; 575 576 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 577 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 578 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 579 } 580 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 581 sizeof (connf_t)); 582 ipst->ips_ipcl_udp_fanout = NULL; 583 584 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 585 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 586 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 587 } 588 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 589 sizeof (connf_t)); 590 ipst->ips_ipcl_raw_fanout = NULL; 591 592 for (i 
= 0; i < CONN_G_HASH_SIZE; i++) { 593 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 594 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 595 } 596 kmem_free(ipst->ips_ipcl_globalhash_fanout, 597 sizeof (connf_t) * CONN_G_HASH_SIZE); 598 ipst->ips_ipcl_globalhash_fanout = NULL; 599 600 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 601 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 602 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 603 ipst->ips_rts_clients = NULL; 604 } 605 606 /* 607 * conn creation routine. initialize the conn, sets the reference 608 * and inserts it in the global hash table. 609 */ 610 conn_t * 611 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 612 { 613 conn_t *connp; 614 sctp_stack_t *sctps; 615 struct kmem_cache *conn_cache; 616 617 switch (type) { 618 case IPCL_SCTPCONN: 619 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 620 return (NULL); 621 sctp_conn_init(connp); 622 sctps = ns->netstack_sctp; 623 SCTP_G_Q_REFHOLD(sctps); 624 netstack_hold(ns); 625 connp->conn_netstack = ns; 626 return (connp); 627 628 case IPCL_TCPCONN: 629 conn_cache = tcp_conn_cache; 630 break; 631 632 case IPCL_UDPCONN: 633 conn_cache = udp_conn_cache; 634 break; 635 636 case IPCL_RAWIPCONN: 637 conn_cache = rawip_conn_cache; 638 break; 639 640 case IPCL_RTSCONN: 641 conn_cache = rts_conn_cache; 642 break; 643 644 case IPCL_IPCCONN: 645 conn_cache = ip_conn_cache; 646 break; 647 648 default: 649 connp = NULL; 650 ASSERT(0); 651 } 652 653 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 654 return (NULL); 655 656 connp->conn_ref = 1; 657 netstack_hold(ns); 658 connp->conn_netstack = ns; 659 ipcl_globalhash_insert(connp); 660 return (connp); 661 } 662 663 void 664 ipcl_conn_destroy(conn_t *connp) 665 { 666 mblk_t *mp; 667 netstack_t *ns = connp->conn_netstack; 668 669 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 670 ASSERT(connp->conn_ref == 0); 671 ASSERT(connp->conn_ire_cache == NULL); 672 
673 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 674 675 if (connp->conn_peercred != NULL && 676 connp->conn_peercred != connp->conn_cred) 677 crfree(connp->conn_peercred); 678 connp->conn_peercred = NULL; 679 680 if (connp->conn_cred != NULL) { 681 crfree(connp->conn_cred); 682 connp->conn_cred = NULL; 683 } 684 685 ipcl_globalhash_remove(connp); 686 687 /* FIXME: add separate tcp_conn_free()? */ 688 if (connp->conn_flags & IPCL_TCPCONN) { 689 tcp_t *tcp = connp->conn_tcp; 690 tcp_stack_t *tcps; 691 692 ASSERT(tcp != NULL); 693 tcps = tcp->tcp_tcps; 694 if (tcps != NULL) { 695 if (connp->conn_latch != NULL) { 696 IPLATCH_REFRELE(connp->conn_latch, ns); 697 connp->conn_latch = NULL; 698 } 699 if (connp->conn_policy != NULL) { 700 IPPH_REFRELE(connp->conn_policy, ns); 701 connp->conn_policy = NULL; 702 } 703 tcp->tcp_tcps = NULL; 704 TCPS_REFRELE(tcps); 705 } 706 707 tcp_free(tcp); 708 mp = tcp->tcp_timercache; 709 tcp->tcp_cred = NULL; 710 711 if (tcp->tcp_sack_info != NULL) { 712 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 713 kmem_cache_free(tcp_sack_info_cache, 714 tcp->tcp_sack_info); 715 } 716 if (tcp->tcp_iphc != NULL) { 717 if (tcp->tcp_hdr_grown) { 718 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 719 } else { 720 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 721 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 722 } 723 tcp->tcp_iphc_len = 0; 724 } 725 ASSERT(tcp->tcp_iphc_len == 0); 726 727 /* 728 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 729 * the mblk. 
730 */ 731 if (tcp->tcp_rsrv_mp != NULL) { 732 freeb(tcp->tcp_rsrv_mp); 733 tcp->tcp_rsrv_mp = NULL; 734 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 735 } 736 737 ASSERT(connp->conn_latch == NULL); 738 ASSERT(connp->conn_policy == NULL); 739 740 if (ns != NULL) { 741 ASSERT(tcp->tcp_tcps == NULL); 742 connp->conn_netstack = NULL; 743 netstack_rele(ns); 744 } 745 746 ipcl_conn_cleanup(connp); 747 connp->conn_flags = IPCL_TCPCONN; 748 bzero(tcp, sizeof (tcp_t)); 749 750 tcp->tcp_timercache = mp; 751 tcp->tcp_connp = connp; 752 kmem_cache_free(tcp_conn_cache, connp); 753 return; 754 } 755 if (connp->conn_latch != NULL) { 756 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 757 connp->conn_latch = NULL; 758 } 759 if (connp->conn_policy != NULL) { 760 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 761 connp->conn_policy = NULL; 762 } 763 if (connp->conn_ipsec_opt_mp != NULL) { 764 freemsg(connp->conn_ipsec_opt_mp); 765 connp->conn_ipsec_opt_mp = NULL; 766 } 767 768 if (connp->conn_flags & IPCL_SCTPCONN) { 769 ASSERT(ns != NULL); 770 sctp_free(connp); 771 return; 772 } 773 774 if (ns != NULL) { 775 connp->conn_netstack = NULL; 776 netstack_rele(ns); 777 } 778 779 ipcl_conn_cleanup(connp); 780 781 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 782 if (connp->conn_flags & IPCL_UDPCONN) { 783 connp->conn_flags = IPCL_UDPCONN; 784 kmem_cache_free(udp_conn_cache, connp); 785 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 786 787 connp->conn_flags = IPCL_RAWIPCONN; 788 connp->conn_ulp = IPPROTO_ICMP; 789 kmem_cache_free(rawip_conn_cache, connp); 790 } else if (connp->conn_flags & IPCL_RTSCONN) { 791 connp->conn_flags = IPCL_RTSCONN; 792 kmem_cache_free(rts_conn_cache, connp); 793 } else { 794 connp->conn_flags = IPCL_IPCCONN; 795 ASSERT(connp->conn_flags & IPCL_IPCCONN); 796 ASSERT(connp->conn_priv == NULL); 797 kmem_cache_free(ip_conn_cache, connp); 798 } 799 } 800 801 /* 802 * Running in cluster mode - deregister listener information 803 */ 804 805 static void 806 ipcl_conn_unlisten(conn_t *connp) 807 { 808 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 809 ASSERT(connp->conn_lport != 0); 810 811 if (cl_inet_unlisten != NULL) { 812 sa_family_t addr_family; 813 uint8_t *laddrp; 814 815 if (connp->conn_pkt_isv6) { 816 addr_family = AF_INET6; 817 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 818 } else { 819 addr_family = AF_INET; 820 laddrp = (uint8_t *)&connp->conn_bound_source; 821 } 822 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 823 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 824 } 825 connp->conn_flags &= ~IPCL_CL_LISTENER; 826 } 827 828 /* 829 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 830 * which table the conn belonged to). So for debugging we can see which hash 831 * table this connection was in. 
832 */ 833 #define IPCL_HASH_REMOVE(connp) { \ 834 connf_t *connfp = (connp)->conn_fanout; \ 835 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 836 if (connfp != NULL) { \ 837 IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ 838 (void *)(connp))); \ 839 mutex_enter(&connfp->connf_lock); \ 840 if ((connp)->conn_next != NULL) \ 841 (connp)->conn_next->conn_prev = \ 842 (connp)->conn_prev; \ 843 if ((connp)->conn_prev != NULL) \ 844 (connp)->conn_prev->conn_next = \ 845 (connp)->conn_next; \ 846 else \ 847 connfp->connf_head = (connp)->conn_next; \ 848 (connp)->conn_fanout = NULL; \ 849 (connp)->conn_next = NULL; \ 850 (connp)->conn_prev = NULL; \ 851 (connp)->conn_flags |= IPCL_REMOVED; \ 852 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 853 ipcl_conn_unlisten((connp)); \ 854 CONN_DEC_REF((connp)); \ 855 mutex_exit(&connfp->connf_lock); \ 856 } \ 857 } 858 859 void 860 ipcl_hash_remove(conn_t *connp) 861 { 862 IPCL_HASH_REMOVE(connp); 863 } 864 865 /* 866 * The whole purpose of this function is allow removal of 867 * a conn_t from the connected hash for timewait reclaim. 868 * This is essentially a TW reclaim fastpath where timewait 869 * collector checks under fanout lock (so no one else can 870 * get access to the conn_t) that refcnt is 2 i.e. one for 871 * TCP and one for the classifier hash list. If ref count 872 * is indeed 2, we can just remove the conn under lock and 873 * avoid cleaning up the conn under squeue. This gives us 874 * improved performance. 
875 */ 876 void 877 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 878 { 879 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 880 ASSERT(MUTEX_HELD(&connp->conn_lock)); 881 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 882 883 if ((connp)->conn_next != NULL) { 884 (connp)->conn_next->conn_prev = (connp)->conn_prev; 885 } 886 if ((connp)->conn_prev != NULL) { 887 (connp)->conn_prev->conn_next = (connp)->conn_next; 888 } else { 889 connfp->connf_head = (connp)->conn_next; 890 } 891 (connp)->conn_fanout = NULL; 892 (connp)->conn_next = NULL; 893 (connp)->conn_prev = NULL; 894 (connp)->conn_flags |= IPCL_REMOVED; 895 ASSERT((connp)->conn_ref == 2); 896 (connp)->conn_ref--; 897 } 898 899 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 900 ASSERT((connp)->conn_fanout == NULL); \ 901 ASSERT((connp)->conn_next == NULL); \ 902 ASSERT((connp)->conn_prev == NULL); \ 903 if ((connfp)->connf_head != NULL) { \ 904 (connfp)->connf_head->conn_prev = (connp); \ 905 (connp)->conn_next = (connfp)->connf_head; \ 906 } \ 907 (connp)->conn_fanout = (connfp); \ 908 (connfp)->connf_head = (connp); \ 909 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 910 IPCL_CONNECTED; \ 911 CONN_INC_REF(connp); \ 912 } 913 914 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 915 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 916 "connp %p", (void *)(connfp), (void *)(connp))); \ 917 IPCL_HASH_REMOVE((connp)); \ 918 mutex_enter(&(connfp)->connf_lock); \ 919 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 920 mutex_exit(&(connfp)->connf_lock); \ 921 } 922 923 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 924 conn_t *pconnp = NULL, *nconnp; \ 925 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 926 "connp %p", (void *)connfp, (void *)(connp))); \ 927 IPCL_HASH_REMOVE((connp)); \ 928 mutex_enter(&(connfp)->connf_lock); \ 929 nconnp = (connfp)->connf_head; \ 930 while (nconnp != NULL && \ 931 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { 
\ 932 pconnp = nconnp; \ 933 nconnp = nconnp->conn_next; \ 934 } \ 935 if (pconnp != NULL) { \ 936 pconnp->conn_next = (connp); \ 937 (connp)->conn_prev = pconnp; \ 938 } else { \ 939 (connfp)->connf_head = (connp); \ 940 } \ 941 if (nconnp != NULL) { \ 942 (connp)->conn_next = nconnp; \ 943 nconnp->conn_prev = (connp); \ 944 } \ 945 (connp)->conn_fanout = (connfp); \ 946 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 947 IPCL_BOUND; \ 948 CONN_INC_REF(connp); \ 949 mutex_exit(&(connfp)->connf_lock); \ 950 } 951 952 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 953 conn_t **list, *prev, *next; \ 954 boolean_t isv4mapped = \ 955 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 956 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 957 "connp %p", (void *)(connfp), (void *)(connp))); \ 958 IPCL_HASH_REMOVE((connp)); \ 959 mutex_enter(&(connfp)->connf_lock); \ 960 list = &(connfp)->connf_head; \ 961 prev = NULL; \ 962 while ((next = *list) != NULL) { \ 963 if (isv4mapped && \ 964 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 965 connp->conn_zoneid == next->conn_zoneid) { \ 966 (connp)->conn_next = next; \ 967 if (prev != NULL) \ 968 prev = next->conn_prev; \ 969 next->conn_prev = (connp); \ 970 break; \ 971 } \ 972 list = &next->conn_next; \ 973 prev = next; \ 974 } \ 975 (connp)->conn_prev = prev; \ 976 *list = (connp); \ 977 (connp)->conn_fanout = (connfp); \ 978 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 979 IPCL_BOUND; \ 980 CONN_INC_REF((connp)); \ 981 mutex_exit(&(connfp)->connf_lock); \ 982 } 983 984 void 985 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 986 { 987 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 988 } 989 990 void 991 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 992 { 993 connf_t *connfp; 994 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 995 996 ASSERT(connp != NULL); 997 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 998 protocol == IPPROTO_ESP); 999 1000 
connp->conn_ulp = protocol; 1001 1002 /* Insert it in the protocol hash */ 1003 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1004 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1005 } 1006 1007 void 1008 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 1009 { 1010 connf_t *connfp; 1011 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1012 1013 ASSERT(connp != NULL); 1014 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 1015 protocol == IPPROTO_ESP); 1016 1017 connp->conn_ulp = protocol; 1018 1019 /* Insert it in the Bind Hash */ 1020 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1021 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1022 } 1023 1024 /* 1025 * This function is used only for inserting SCTP raw socket now. 1026 * This may change later. 1027 * 1028 * Note that only one raw socket can be bound to a port. The param 1029 * lport is in network byte order. 1030 */ 1031 static int 1032 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1033 { 1034 connf_t *connfp; 1035 conn_t *oconnp; 1036 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1037 1038 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1039 1040 /* Check for existing raw socket already bound to the port. 
*/ 1041 mutex_enter(&connfp->connf_lock); 1042 for (oconnp = connfp->connf_head; oconnp != NULL; 1043 oconnp = oconnp->conn_next) { 1044 if (oconnp->conn_lport == lport && 1045 oconnp->conn_zoneid == connp->conn_zoneid && 1046 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1047 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1048 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1049 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1050 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1051 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1052 &connp->conn_srcv6))) { 1053 break; 1054 } 1055 } 1056 mutex_exit(&connfp->connf_lock); 1057 if (oconnp != NULL) 1058 return (EADDRNOTAVAIL); 1059 1060 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1061 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1062 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1063 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1064 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1065 } else { 1066 IPCL_HASH_INSERT_BOUND(connfp, connp); 1067 } 1068 } else { 1069 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1070 } 1071 return (0); 1072 } 1073 1074 /* 1075 * Check for a MAC exemption conflict on a labeled system. Note that for 1076 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1077 * transport layer. This check is for binding all other protocols. 1078 * 1079 * Returns true if there's a conflict. 
1080 */ 1081 static boolean_t 1082 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1083 { 1084 connf_t *connfp; 1085 conn_t *tconn; 1086 1087 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1088 mutex_enter(&connfp->connf_lock); 1089 for (tconn = connfp->connf_head; tconn != NULL; 1090 tconn = tconn->conn_next) { 1091 /* We don't allow v4 fallback for v6 raw socket */ 1092 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1093 continue; 1094 /* If neither is exempt, then there's no conflict */ 1095 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1096 continue; 1097 /* If both are bound to different specific addrs, ok */ 1098 if (connp->conn_src != INADDR_ANY && 1099 tconn->conn_src != INADDR_ANY && 1100 connp->conn_src != tconn->conn_src) 1101 continue; 1102 /* These two conflict; fail */ 1103 break; 1104 } 1105 mutex_exit(&connfp->connf_lock); 1106 return (tconn != NULL); 1107 } 1108 1109 static boolean_t 1110 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1111 { 1112 connf_t *connfp; 1113 conn_t *tconn; 1114 1115 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1116 mutex_enter(&connfp->connf_lock); 1117 for (tconn = connfp->connf_head; tconn != NULL; 1118 tconn = tconn->conn_next) { 1119 /* We don't allow v4 fallback for v6 raw socket */ 1120 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1121 continue; 1122 /* If neither is exempt, then there's no conflict */ 1123 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1124 continue; 1125 /* If both are bound to different addrs, ok */ 1126 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1127 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1128 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1129 continue; 1130 /* These two conflict; fail */ 1131 break; 1132 } 1133 mutex_exit(&connfp->connf_lock); 1134 return (tconn != NULL); 1135 } 1136 1137 /* 1138 * (v4, v6) bind hash insertion routines 1139 */ 1140 int 1141 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1142 { 1143 connf_t *connfp; 1144 #ifdef IPCL_DEBUG 1145 char buf[INET_NTOA_BUFSIZE]; 1146 #endif 1147 int ret = 0; 1148 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1149 1150 ASSERT(connp); 1151 1152 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1153 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1154 1155 connp->conn_ulp = protocol; 1156 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1157 connp->conn_lport = lport; 1158 1159 switch (protocol) { 1160 default: 1161 if (is_system_labeled() && 1162 check_exempt_conflict_v4(connp, ipst)) 1163 return (EADDRINUSE); 1164 /* FALLTHROUGH */ 1165 case IPPROTO_UDP: 1166 if (protocol == IPPROTO_UDP) { 1167 IPCL_DEBUG_LVL(64, 1168 ("ipcl_bind_insert: connp %p - udp\n", 1169 (void *)connp)); 1170 connfp = &ipst->ips_ipcl_udp_fanout[ 1171 IPCL_UDP_HASH(lport, ipst)]; 1172 } else { 1173 IPCL_DEBUG_LVL(64, 1174 ("ipcl_bind_insert: connp %p - protocol\n", 1175 (void *)connp)); 1176 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1177 } 1178 1179 if (connp->conn_rem != INADDR_ANY) { 1180 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1181 } else if (connp->conn_src != INADDR_ANY) { 1182 IPCL_HASH_INSERT_BOUND(connfp, connp); 1183 } else { 1184 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1185 } 1186 break; 1187 1188 case IPPROTO_TCP: 1189 1190 /* Insert it in the Bind Hash */ 1191 ASSERT(connp->conn_zoneid != ALL_ZONES); 1192 connfp = &ipst->ips_ipcl_bind_fanout[ 1193 IPCL_BIND_HASH(lport, ipst)]; 1194 if (connp->conn_src != INADDR_ANY) { 1195 IPCL_HASH_INSERT_BOUND(connfp, connp); 1196 } else { 1197 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1198 } 1199 if (cl_inet_listen != NULL) { 1200 ASSERT(!connp->conn_pkt_isv6); 1201 connp->conn_flags |= IPCL_CL_LISTENER; 1202 (*cl_inet_listen)( 1203 connp->conn_netstack->netstack_stackid, 1204 IPPROTO_TCP, AF_INET, 1205 (uint8_t *)&connp->conn_bound_source, lport, NULL); 1206 } 1207 break; 1208 1209 case IPPROTO_SCTP: 
1210 ret = ipcl_sctp_hash_insert(connp, lport); 1211 break; 1212 } 1213 1214 return (ret); 1215 } 1216 1217 int 1218 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1219 uint16_t lport) 1220 { 1221 connf_t *connfp; 1222 int ret = 0; 1223 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1224 1225 ASSERT(connp); 1226 1227 connp->conn_ulp = protocol; 1228 connp->conn_srcv6 = *src; 1229 connp->conn_lport = lport; 1230 1231 switch (protocol) { 1232 default: 1233 if (is_system_labeled() && 1234 check_exempt_conflict_v6(connp, ipst)) 1235 return (EADDRINUSE); 1236 /* FALLTHROUGH */ 1237 case IPPROTO_UDP: 1238 if (protocol == IPPROTO_UDP) { 1239 IPCL_DEBUG_LVL(128, 1240 ("ipcl_bind_insert_v6: connp %p - udp\n", 1241 (void *)connp)); 1242 connfp = &ipst->ips_ipcl_udp_fanout[ 1243 IPCL_UDP_HASH(lport, ipst)]; 1244 } else { 1245 IPCL_DEBUG_LVL(128, 1246 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1247 (void *)connp)); 1248 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1249 } 1250 1251 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1252 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1253 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1254 IPCL_HASH_INSERT_BOUND(connfp, connp); 1255 } else { 1256 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1257 } 1258 break; 1259 1260 case IPPROTO_TCP: 1261 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1262 1263 /* Insert it in the Bind Hash */ 1264 ASSERT(connp->conn_zoneid != ALL_ZONES); 1265 connfp = &ipst->ips_ipcl_bind_fanout[ 1266 IPCL_BIND_HASH(lport, ipst)]; 1267 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1268 IPCL_HASH_INSERT_BOUND(connfp, connp); 1269 } else { 1270 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1271 } 1272 if (cl_inet_listen != NULL) { 1273 sa_family_t addr_family; 1274 uint8_t *laddrp; 1275 1276 if (connp->conn_pkt_isv6) { 1277 addr_family = AF_INET6; 1278 laddrp = 1279 (uint8_t *)&connp->conn_bound_source_v6; 1280 } else { 1281 addr_family = AF_INET; 1282 laddrp = (uint8_t *)&connp->conn_bound_source; 1283 } 1284 connp->conn_flags |= IPCL_CL_LISTENER; 1285 (*cl_inet_listen)( 1286 connp->conn_netstack->netstack_stackid, 1287 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1288 } 1289 break; 1290 1291 case IPPROTO_SCTP: 1292 ret = ipcl_sctp_hash_insert(connp, lport); 1293 break; 1294 } 1295 1296 return (ret); 1297 } 1298 1299 /* 1300 * ipcl_conn_hash insertion routines. 1301 */ 1302 int 1303 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1304 ipaddr_t rem, uint32_t ports) 1305 { 1306 connf_t *connfp; 1307 uint16_t *up; 1308 conn_t *tconnp; 1309 #ifdef IPCL_DEBUG 1310 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1311 #endif 1312 in_port_t lport; 1313 int ret = 0; 1314 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1315 1316 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1317 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1318 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1319 ports, protocol)); 1320 1321 switch (protocol) { 1322 case IPPROTO_TCP: 1323 if (!(connp->conn_flags & IPCL_EAGER)) { 1324 /* 1325 * for a eager connection, i.e connections which 1326 * have just been created, the initialization is 1327 * already done in ip at conn_creation time, so 1328 * we can skip the checks here. 
1329 */ 1330 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1331 } 1332 1333 /* 1334 * For tcp, we check whether the connection tuple already 1335 * exists before allowing the connection to proceed. We 1336 * also allow indexing on the zoneid. This is to allow 1337 * multiple shared stack zones to have the same tcp 1338 * connection tuple. In practice this only happens for 1339 * INADDR_LOOPBACK as it's the only local address which 1340 * doesn't have to be unique. 1341 */ 1342 connfp = &ipst->ips_ipcl_conn_fanout[ 1343 IPCL_CONN_HASH(connp->conn_rem, 1344 connp->conn_ports, ipst)]; 1345 mutex_enter(&connfp->connf_lock); 1346 for (tconnp = connfp->connf_head; tconnp != NULL; 1347 tconnp = tconnp->conn_next) { 1348 if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1349 connp->conn_rem, connp->conn_src, 1350 connp->conn_ports)) && 1351 (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { 1352 1353 /* Already have a conn. bail out */ 1354 mutex_exit(&connfp->connf_lock); 1355 return (EADDRINUSE); 1356 } 1357 } 1358 if (connp->conn_fanout != NULL) { 1359 /* 1360 * Probably a XTI/TLI application trying to do a 1361 * rebind. Let it happen. 1362 */ 1363 mutex_exit(&connfp->connf_lock); 1364 IPCL_HASH_REMOVE(connp); 1365 mutex_enter(&connfp->connf_lock); 1366 } 1367 1368 ASSERT(connp->conn_recv != NULL); 1369 1370 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1371 mutex_exit(&connfp->connf_lock); 1372 break; 1373 1374 case IPPROTO_SCTP: 1375 /* 1376 * The raw socket may have already been bound, remove it 1377 * from the hash first. 1378 */ 1379 IPCL_HASH_REMOVE(connp); 1380 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1381 ret = ipcl_sctp_hash_insert(connp, lport); 1382 break; 1383 1384 default: 1385 /* 1386 * Check for conflicts among MAC exempt bindings. For 1387 * transports with port numbers, this is done by the upper 1388 * level per-transport binding logic. For all others, it's 1389 * done here. 
1390 */ 1391 if (is_system_labeled() && 1392 check_exempt_conflict_v4(connp, ipst)) 1393 return (EADDRINUSE); 1394 /* FALLTHROUGH */ 1395 1396 case IPPROTO_UDP: 1397 up = (uint16_t *)&ports; 1398 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1399 if (protocol == IPPROTO_UDP) { 1400 connfp = &ipst->ips_ipcl_udp_fanout[ 1401 IPCL_UDP_HASH(up[1], ipst)]; 1402 } else { 1403 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1404 } 1405 1406 if (connp->conn_rem != INADDR_ANY) { 1407 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1408 } else if (connp->conn_src != INADDR_ANY) { 1409 IPCL_HASH_INSERT_BOUND(connfp, connp); 1410 } else { 1411 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1412 } 1413 break; 1414 } 1415 1416 return (ret); 1417 } 1418 1419 int 1420 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1421 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1422 { 1423 connf_t *connfp; 1424 uint16_t *up; 1425 conn_t *tconnp; 1426 in_port_t lport; 1427 int ret = 0; 1428 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1429 1430 switch (protocol) { 1431 case IPPROTO_TCP: 1432 /* Just need to insert a conn struct */ 1433 if (!(connp->conn_flags & IPCL_EAGER)) { 1434 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1435 } 1436 1437 /* 1438 * For tcp, we check whether the connection tuple already 1439 * exists before allowing the connection to proceed. We 1440 * also allow indexing on the zoneid. This is to allow 1441 * multiple shared stack zones to have the same tcp 1442 * connection tuple. In practice this only happens for 1443 * ipv6_loopback as it's the only local address which 1444 * doesn't have to be unique. 
1445 */ 1446 connfp = &ipst->ips_ipcl_conn_fanout[ 1447 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1448 ipst)]; 1449 mutex_enter(&connfp->connf_lock); 1450 for (tconnp = connfp->connf_head; tconnp != NULL; 1451 tconnp = tconnp->conn_next) { 1452 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1453 connp->conn_remv6, connp->conn_srcv6, 1454 connp->conn_ports) && 1455 (tconnp->conn_tcp->tcp_bound_if == 0 || 1456 tconnp->conn_tcp->tcp_bound_if == ifindex) && 1457 (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { 1458 /* Already have a conn. bail out */ 1459 mutex_exit(&connfp->connf_lock); 1460 return (EADDRINUSE); 1461 } 1462 } 1463 if (connp->conn_fanout != NULL) { 1464 /* 1465 * Probably a XTI/TLI application trying to do a 1466 * rebind. Let it happen. 1467 */ 1468 mutex_exit(&connfp->connf_lock); 1469 IPCL_HASH_REMOVE(connp); 1470 mutex_enter(&connfp->connf_lock); 1471 } 1472 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1473 mutex_exit(&connfp->connf_lock); 1474 break; 1475 1476 case IPPROTO_SCTP: 1477 IPCL_HASH_REMOVE(connp); 1478 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1479 ret = ipcl_sctp_hash_insert(connp, lport); 1480 break; 1481 1482 default: 1483 if (is_system_labeled() && 1484 check_exempt_conflict_v6(connp, ipst)) 1485 return (EADDRINUSE); 1486 /* FALLTHROUGH */ 1487 case IPPROTO_UDP: 1488 up = (uint16_t *)&ports; 1489 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1490 if (protocol == IPPROTO_UDP) { 1491 connfp = &ipst->ips_ipcl_udp_fanout[ 1492 IPCL_UDP_HASH(up[1], ipst)]; 1493 } else { 1494 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1495 } 1496 1497 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1498 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1499 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1500 IPCL_HASH_INSERT_BOUND(connfp, connp); 1501 } else { 1502 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1503 } 1504 break; 1505 } 1506 1507 return (ret); 1508 } 1509 1510 /* 1511 * v4 packet 
classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied.  Labels are also checked as described
 * above.  If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 *
 * For TCP the connected fanout is searched first and the bind fanout second;
 * for UDP only the UDP fanout is searched.  Any other protocol returns NULL.
 * hdr_len is the offset from mp->b_rptr to the transport header.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* up[0] is the foreign (source) port, up[1] the local port. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both 16-bit ports read as one 32-bit hash/match key. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected match; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On labeled systems the conn must be allowed to receive mp. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * v6 packet classifying function.  Same lookup order as ipcl_classify_v4():
 * for TCP the connected fanout and then the bind fanout, for UDP the UDP
 * fanout.  Returns the matching conn with a reference held, or NULL (also
 * for any other protocol).  hdr_len is the offset from mp->b_rptr to the
 * transport header.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		/* Both 16-bit ports read as one 32-bit hash/match key. */
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if ((IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
			    (IPCL_ZONE_MATCH(connp, zoneid))) {
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected match; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On labeled systems the conn must be allowed to receive mp. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		/* up[0] is the foreign (source) port, up[1] the local port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* On labeled systems the conn must be allowed to receive mp. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
{
	uint16_t	hdr_len;
	ipha_t		*ipha;
	uint8_t		*nexthdrp;

	/* Need at least an IPv4 header's worth of data to read the version. */
	if (MBLKL(mp) < sizeof (ipha_t))
		return (NULL);

	switch (IPH_HDR_VERSION(mp->b_rptr)) {
	case IPV4_VERSION:
		ipha = (ipha_t *)mp->b_rptr;
		hdr_len = IPH_HDR_LENGTH(ipha);
		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
		    zoneid, ipst));
	case IPV6_VERSION:
		/* Obtain the header length and the next-header byte. */
		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
		    &hdr_len, &nexthdrp))
			return (NULL);

		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
	}

	/* Neither IPv4 nor IPv6. */
	return (NULL);
}

/*
 * Look up a conn in the raw fanout for the packet whose IP header is hdr
 * (IPv4 or IPv6, decided by the header's version field).  The bucket hashed
 * by the local port taken from ports is searched first; if nothing matches
 * there, the bucket for port 0 is searched for a wildcard match.
 *
 * Returns the matching conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	/* The local port is the second 16-bit half of ports. */
	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully bound: match on the whole connection tuple. */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Otherwise match on local address and port only. */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	/* The "found" label expects connfp->connf_lock to still be held. */
	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Take the caller's reference before dropping the bucket lock. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * Constructor for a TCP conn buffer: buf is an itc_t with the tcp_t placed
 * immediately after it.  Zeroes both, initializes the conn's lock and
 * condition variables, pre-allocates the timer cache and links the conn_t
 * and tcp_t to each other.  Always returns 0.
 *
 * NOTE(review): the signature suggests a kmem cache constructor -- confirm
 * at the cache-creation site.  kmflags is not consulted (the timer cache is
 * allocated with KM_NOSLEEP) and the allocation result is not checked here.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_ulp = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	return (0);
}

/*
 * Undo tcp_conn_constructor(): free the timer cache and destroy the conn's
 * lock and condition variables.
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
}

/*
 * Constructor for a plain IP conn buffer: zeroes the conn_t, initializes its
 * lock and condition variable and tags it IPCL_IPCCONN.  Always returns 0.
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;

	return (0);
}

/*
 * Undo ip_conn_constructor(): destroy the conn's lock and condition
 * variable.  conn_priv must already be NULL.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * Constructor for a UDP conn buffer: buf is an itc_t with the udp_t placed
 * immediately after it.  Zeroes both, initializes the conn's lock and
 * condition variable and links the conn_t and udp_t to each other.  Always
 * returns 0.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_ulp = IPPROTO_UDP;
	udp->udp_connp = connp;
	return (0);
}

/*
 * Undo udp_conn_constructor(): destroy the conn's lock and condition
 * variable.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * Constructor for a raw IP conn buffer: buf is an itc_t with the icmp_t
 * placed immediately after it.  Zeroes both, initializes the conn's lock and
 * condition variable and links the conn_t and icmp_t to each other.  The ULP
 * is preset to IPPROTO_ICMP.  Always returns 0.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_ulp = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	return (0);
}

/*
 * Undo rawip_conn_constructor(): destroy the conn's lock and condition
 * variable.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * Constructor for a routing-socket conn buffer: buf is an itc_t with the
 * rts_t placed immediately after it.  Zeroes both, initializes the conn's
 * lock and condition variable and links the conn_t and rts_t to each other.
 * Always returns 0.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	return (0);
}

/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t
*)&itc[1]; 2235 2236 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2237 ASSERT(rts->rts_connp == connp); 2238 ASSERT(connp->conn_rts == rts); 2239 mutex_destroy(&connp->conn_lock); 2240 cv_destroy(&connp->conn_cv); 2241 } 2242 2243 /* ARGSUSED */ 2244 int 2245 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) 2246 { 2247 int error; 2248 netstack_t *ns; 2249 int ret; 2250 tcp_stack_t *tcps; 2251 ip_helper_stream_info_t *ip_helper_str; 2252 ip_stack_t *ipst; 2253 2254 ns = netstack_find_by_cred(kcred); 2255 ASSERT(ns != NULL); 2256 tcps = ns->netstack_tcp; 2257 ipst = ns->netstack_ip; 2258 ASSERT(tcps != NULL); 2259 ip_helper_str = (ip_helper_stream_info_t *)buf; 2260 2261 do { 2262 error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, 2263 &ip_helper_str->iphs_handle, ipst->ips_ldi_ident); 2264 } while (error == EINTR); 2265 2266 if (error == 0) { 2267 do { 2268 error = ldi_ioctl( 2269 ip_helper_str->iphs_handle, SIOCSQPTR, 2270 (intptr_t)buf, FKIOCTL, kcred, &ret); 2271 } while (error == EINTR); 2272 2273 if (error != 0) { 2274 (void) ldi_close( 2275 ip_helper_str->iphs_handle, 0, kcred); 2276 } 2277 } 2278 2279 netstack_rele(ipst->ips_netstack); 2280 2281 return (error); 2282 } 2283 2284 /* ARGSUSED */ 2285 static void 2286 ip_helper_stream_destructor(void *buf, void *cdrarg) 2287 { 2288 ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; 2289 2290 ip_helper_str->iphs_rq->q_ptr = 2291 ip_helper_str->iphs_wq->q_ptr = 2292 ip_helper_str->iphs_minfo; 2293 (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred); 2294 } 2295 2296 2297 /* 2298 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2299 * in the conn_t. 
2300 */ 2301 void 2302 ipcl_conn_cleanup(conn_t *connp) 2303 { 2304 ASSERT(connp->conn_ire_cache == NULL); 2305 ASSERT(connp->conn_latch == NULL); 2306 #ifdef notdef 2307 ASSERT(connp->conn_rq == NULL); 2308 ASSERT(connp->conn_wq == NULL); 2309 #endif 2310 ASSERT(connp->conn_cred == NULL); 2311 ASSERT(connp->conn_g_fanout == NULL); 2312 ASSERT(connp->conn_g_next == NULL); 2313 ASSERT(connp->conn_g_prev == NULL); 2314 ASSERT(connp->conn_policy == NULL); 2315 ASSERT(connp->conn_fanout == NULL); 2316 ASSERT(connp->conn_next == NULL); 2317 ASSERT(connp->conn_prev == NULL); 2318 #ifdef notdef 2319 /* 2320 * The ill and ipif pointers are not cleared before the conn_t 2321 * goes away since they do not hold a reference on the ill/ipif. 2322 * We should replace these pointers with ifindex/ipaddr_t to 2323 * make the code less complex. 2324 */ 2325 ASSERT(connp->conn_outgoing_ill == NULL); 2326 ASSERT(connp->conn_incoming_ill == NULL); 2327 ASSERT(connp->conn_multicast_ipif == NULL); 2328 ASSERT(connp->conn_multicast_ill == NULL); 2329 #endif 2330 ASSERT(connp->conn_oper_pending_ill == NULL); 2331 ASSERT(connp->conn_ilg == NULL); 2332 ASSERT(connp->conn_drain_next == NULL); 2333 ASSERT(connp->conn_drain_prev == NULL); 2334 #ifdef notdef 2335 /* conn_idl is not cleared when removed from idl list */ 2336 ASSERT(connp->conn_idl == NULL); 2337 #endif 2338 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2339 ASSERT(connp->conn_peercred == NULL); 2340 ASSERT(connp->conn_netstack == NULL); 2341 2342 ASSERT(connp->conn_helper_info == NULL); 2343 /* Clear out the conn_t fields that are not preserved */ 2344 bzero(&connp->conn_start_clr, 2345 sizeof (conn_t) - 2346 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2347 } 2348 2349 /* 2350 * All conns are inserted in a global multi-list for the benefit of 2351 * walkers. The walk is guaranteed to walk all open conns at the time 2352 * of the start of the walk exactly once. 
This property is needed to 2353 * achieve some cleanups during unplumb of interfaces. This is achieved 2354 * as follows. 2355 * 2356 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2357 * call the insert and delete functions below at creation and deletion 2358 * time respectively. The conn never moves or changes its position in this 2359 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2360 * won't increase due to walkers, once the conn deletion has started. Note 2361 * that we can't remove the conn from the global list and then wait for 2362 * the refcnt to drop to zero, since walkers would then see a truncated 2363 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2364 * conns until ip_open is ready to make them globally visible. 2365 * The global round robin multi-list locks are held only to get the 2366 * next member/insertion/deletion and contention should be negligible 2367 * if the multi-list is much greater than the number of cpus. 2368 */ 2369 void 2370 ipcl_globalhash_insert(conn_t *connp) 2371 { 2372 int index; 2373 struct connf_s *connfp; 2374 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2375 2376 /* 2377 * No need for atomic here. Approximate even distribution 2378 * in the global lists is sufficient. 2379 */ 2380 ipst->ips_conn_g_index++; 2381 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2382 2383 connp->conn_g_prev = NULL; 2384 /* 2385 * Mark as INCIPIENT, so that walkers will ignore this 2386 * for now, till ip_open is ready to make it visible globally. 
2387 */ 2388 connp->conn_state_flags |= CONN_INCIPIENT; 2389 2390 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2391 /* Insert at the head of the list */ 2392 mutex_enter(&connfp->connf_lock); 2393 connp->conn_g_next = connfp->connf_head; 2394 if (connp->conn_g_next != NULL) 2395 connp->conn_g_next->conn_g_prev = connp; 2396 connfp->connf_head = connp; 2397 2398 /* The fanout bucket this conn points to */ 2399 connp->conn_g_fanout = connfp; 2400 2401 mutex_exit(&connfp->connf_lock); 2402 } 2403 2404 void 2405 ipcl_globalhash_remove(conn_t *connp) 2406 { 2407 struct connf_s *connfp; 2408 2409 /* 2410 * We were never inserted in the global multi list. 2411 * IPCL_NONE variety is never inserted in the global multilist 2412 * since it is presumed to not need any cleanup and is transient. 2413 */ 2414 if (connp->conn_g_fanout == NULL) 2415 return; 2416 2417 connfp = connp->conn_g_fanout; 2418 mutex_enter(&connfp->connf_lock); 2419 if (connp->conn_g_prev != NULL) 2420 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2421 else 2422 connfp->connf_head = connp->conn_g_next; 2423 if (connp->conn_g_next != NULL) 2424 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2425 mutex_exit(&connfp->connf_lock); 2426 2427 /* Better to stumble on a null pointer than to corrupt memory */ 2428 connp->conn_g_next = NULL; 2429 connp->conn_g_prev = NULL; 2430 connp->conn_g_fanout = NULL; 2431 } 2432 2433 /* 2434 * Walk the list of all conn_t's in the system, calling the function provided 2435 * with the specified argument for each. 2436 * Applies to both IPv4 and IPv6. 2437 * 2438 * IPCs may hold pointers to ipif/ill. To guard against stale pointers 2439 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2440 * unplumbed or removed. New conn_t's that are created while we are walking 2441 * may be missed by this walk, because they are not necessarily inserted 2442 * at the tail of the list. 
They are new conn_t's and thus don't have any 2443 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2444 * is created to the struct that is going away. 2445 */ 2446 void 2447 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2448 { 2449 int i; 2450 conn_t *connp; 2451 conn_t *prev_connp; 2452 2453 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2454 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2455 prev_connp = NULL; 2456 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2457 while (connp != NULL) { 2458 mutex_enter(&connp->conn_lock); 2459 if (connp->conn_state_flags & 2460 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2461 mutex_exit(&connp->conn_lock); 2462 connp = connp->conn_g_next; 2463 continue; 2464 } 2465 CONN_INC_REF_LOCKED(connp); 2466 mutex_exit(&connp->conn_lock); 2467 mutex_exit( 2468 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2469 (*func)(connp, arg); 2470 if (prev_connp != NULL) 2471 CONN_DEC_REF(prev_connp); 2472 mutex_enter( 2473 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2474 prev_connp = connp; 2475 connp = connp->conn_g_next; 2476 } 2477 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2478 if (prev_connp != NULL) 2479 CONN_DEC_REF(prev_connp); 2480 } 2481 } 2482 2483 /* 2484 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2485 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2486 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2487 * (peer tcp in ESTABLISHED state). 2488 */ 2489 conn_t * 2490 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, 2491 ip_stack_t *ipst) 2492 { 2493 uint32_t ports; 2494 uint16_t *pports = (uint16_t *)&ports; 2495 connf_t *connfp; 2496 conn_t *tconnp; 2497 boolean_t zone_chk; 2498 2499 /* 2500 * If either the source of destination address is loopback, then 2501 * both endpoints must be in the same Zone. 
Otherwise, both of 2502 * the addresses are system-wide unique (tcp is in ESTABLISHED 2503 * state) and the endpoints may reside in different Zones. 2504 */ 2505 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2506 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2507 2508 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2509 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2510 2511 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2512 ports, ipst)]; 2513 2514 mutex_enter(&connfp->connf_lock); 2515 for (tconnp = connfp->connf_head; tconnp != NULL; 2516 tconnp = tconnp->conn_next) { 2517 2518 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2519 ipha->ipha_dst, ipha->ipha_src, ports) && 2520 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2521 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2522 2523 ASSERT(tconnp != connp); 2524 CONN_INC_REF(tconnp); 2525 mutex_exit(&connfp->connf_lock); 2526 return (tconnp); 2527 } 2528 } 2529 mutex_exit(&connfp->connf_lock); 2530 return (NULL); 2531 } 2532 2533 /* 2534 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2535 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2536 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2537 * (peer tcp in ESTABLISHED state). 2538 */ 2539 conn_t * 2540 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, 2541 ip_stack_t *ipst) 2542 { 2543 uint32_t ports; 2544 uint16_t *pports = (uint16_t *)&ports; 2545 connf_t *connfp; 2546 conn_t *tconnp; 2547 boolean_t zone_chk; 2548 2549 /* 2550 * If either the source of destination address is loopback, then 2551 * both endpoints must be in the same Zone. Otherwise, both of 2552 * the addresses are system-wide unique (tcp is in ESTABLISHED 2553 * state) and the endpoints may reside in different Zones. 
We 2554 * don't do Zone check for link local address(es) because the 2555 * current Zone implementation treats each link local address as 2556 * being unique per system node, i.e. they belong to global Zone. 2557 */ 2558 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2559 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2560 2561 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2562 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2563 2564 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2565 ports, ipst)]; 2566 2567 mutex_enter(&connfp->connf_lock); 2568 for (tconnp = connfp->connf_head; tconnp != NULL; 2569 tconnp = tconnp->conn_next) { 2570 2571 /* We skip tcp_bound_if check here as this is loopback tcp */ 2572 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2573 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2574 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2575 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2576 2577 ASSERT(tconnp != connp); 2578 CONN_INC_REF(tconnp); 2579 mutex_exit(&connfp->connf_lock); 2580 return (tconnp); 2581 } 2582 } 2583 mutex_exit(&connfp->connf_lock); 2584 return (NULL); 2585 } 2586 2587 /* 2588 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2589 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2590 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2591 */ 2592 conn_t * 2593 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, 2594 ip_stack_t *ipst) 2595 { 2596 uint32_t ports; 2597 uint16_t *pports; 2598 connf_t *connfp; 2599 conn_t *tconnp; 2600 2601 pports = (uint16_t *)&ports; 2602 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2603 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2604 2605 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2606 ports, ipst)]; 2607 2608 mutex_enter(&connfp->connf_lock); 2609 for (tconnp = connfp->connf_head; tconnp != NULL; 2610 tconnp = tconnp->conn_next) { 2611 2612 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2613 ipha->ipha_dst, ipha->ipha_src, ports) && 2614 tconnp->conn_tcp->tcp_state >= min_state) { 2615 2616 CONN_INC_REF(tconnp); 2617 mutex_exit(&connfp->connf_lock); 2618 return (tconnp); 2619 } 2620 } 2621 mutex_exit(&connfp->connf_lock); 2622 return (NULL); 2623 } 2624 2625 /* 2626 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2627 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2628 * Only checks for connected entries i.e. no INADDR_ANY checks. 2629 * Match on ifindex in addition to addresses. 
2630 */ 2631 conn_t * 2632 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2633 uint_t ifindex, ip_stack_t *ipst) 2634 { 2635 tcp_t *tcp; 2636 uint32_t ports; 2637 uint16_t *pports; 2638 connf_t *connfp; 2639 conn_t *tconnp; 2640 2641 pports = (uint16_t *)&ports; 2642 pports[0] = tcpha->tha_fport; 2643 pports[1] = tcpha->tha_lport; 2644 2645 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2646 ports, ipst)]; 2647 2648 mutex_enter(&connfp->connf_lock); 2649 for (tconnp = connfp->connf_head; tconnp != NULL; 2650 tconnp = tconnp->conn_next) { 2651 2652 tcp = tconnp->conn_tcp; 2653 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2654 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2655 tcp->tcp_state >= min_state && 2656 (tcp->tcp_bound_if == 0 || 2657 tcp->tcp_bound_if == ifindex)) { 2658 2659 CONN_INC_REF(tconnp); 2660 mutex_exit(&connfp->connf_lock); 2661 return (tconnp); 2662 } 2663 } 2664 mutex_exit(&connfp->connf_lock); 2665 return (NULL); 2666 } 2667 2668 /* 2669 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2670 * a listener when changing state. 2671 */ 2672 conn_t * 2673 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2674 ip_stack_t *ipst) 2675 { 2676 connf_t *bind_connfp; 2677 conn_t *connp; 2678 tcp_t *tcp; 2679 2680 /* 2681 * Avoid false matches for packets sent to an IP destination of 2682 * all zeros. 
2683 */ 2684 if (laddr == 0) 2685 return (NULL); 2686 2687 ASSERT(zoneid != ALL_ZONES); 2688 2689 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2690 mutex_enter(&bind_connfp->connf_lock); 2691 for (connp = bind_connfp->connf_head; connp != NULL; 2692 connp = connp->conn_next) { 2693 tcp = connp->conn_tcp; 2694 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2695 IPCL_ZONE_MATCH(connp, zoneid) && 2696 (tcp->tcp_listener == NULL)) { 2697 CONN_INC_REF(connp); 2698 mutex_exit(&bind_connfp->connf_lock); 2699 return (connp); 2700 } 2701 } 2702 mutex_exit(&bind_connfp->connf_lock); 2703 return (NULL); 2704 } 2705 2706 /* 2707 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2708 * a listener when changing state. 2709 */ 2710 conn_t * 2711 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2712 zoneid_t zoneid, ip_stack_t *ipst) 2713 { 2714 connf_t *bind_connfp; 2715 conn_t *connp = NULL; 2716 tcp_t *tcp; 2717 2718 /* 2719 * Avoid false matches for packets sent to an IP destination of 2720 * all zeros. 2721 */ 2722 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2723 return (NULL); 2724 2725 ASSERT(zoneid != ALL_ZONES); 2726 2727 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2728 mutex_enter(&bind_connfp->connf_lock); 2729 for (connp = bind_connfp->connf_head; connp != NULL; 2730 connp = connp->conn_next) { 2731 tcp = connp->conn_tcp; 2732 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2733 IPCL_ZONE_MATCH(connp, zoneid) && 2734 (tcp->tcp_bound_if == 0 || 2735 tcp->tcp_bound_if == ifindex) && 2736 tcp->tcp_listener == NULL) { 2737 CONN_INC_REF(connp); 2738 mutex_exit(&bind_connfp->connf_lock); 2739 return (connp); 2740 } 2741 } 2742 mutex_exit(&bind_connfp->connf_lock); 2743 return (NULL); 2744 } 2745 2746 /* 2747 * ipcl_get_next_conn 2748 * get the next entry in the conn global list 2749 * and put a reference on the next_conn. 
2750 * decrement the reference on the current conn. 2751 * 2752 * This is an iterator based walker function that also provides for 2753 * some selection by the caller. It walks through the conn_hash bucket 2754 * searching for the next valid connp in the list, and selects connections 2755 * that are neither closed nor condemned. It also REFHOLDS the conn 2756 * thus ensuring that the conn exists when the caller uses the conn. 2757 */ 2758 conn_t * 2759 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2760 { 2761 conn_t *next_connp; 2762 2763 if (connfp == NULL) 2764 return (NULL); 2765 2766 mutex_enter(&connfp->connf_lock); 2767 2768 next_connp = (connp == NULL) ? 2769 connfp->connf_head : connp->conn_g_next; 2770 2771 while (next_connp != NULL) { 2772 mutex_enter(&next_connp->conn_lock); 2773 if (!(next_connp->conn_flags & conn_flags) || 2774 (next_connp->conn_state_flags & 2775 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2776 /* 2777 * This conn has been condemned or 2778 * is closing, or the flags don't match 2779 */ 2780 mutex_exit(&next_connp->conn_lock); 2781 next_connp = next_connp->conn_g_next; 2782 continue; 2783 } 2784 CONN_INC_REF_LOCKED(next_connp); 2785 mutex_exit(&next_connp->conn_lock); 2786 break; 2787 } 2788 2789 mutex_exit(&connfp->connf_lock); 2790 2791 if (connp != NULL) 2792 CONN_DEC_REF(connp); 2793 2794 return (next_connp); 2795 } 2796 2797 #ifdef CONN_DEBUG 2798 /* 2799 * Trace of the last NBUF refhold/refrele 2800 */ 2801 int 2802 conn_trace_ref(conn_t *connp) 2803 { 2804 int last; 2805 conn_trace_t *ctb; 2806 2807 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2808 last = connp->conn_trace_last; 2809 last++; 2810 if (last == CONN_TRACE_MAX) 2811 last = 0; 2812 2813 ctb = &connp->conn_trace_buf[last]; 2814 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2815 connp->conn_trace_last = last; 2816 return (1); 2817 } 2818 2819 int 2820 conn_untrace_ref(conn_t *connp) 2821 { 2822 int last; 2823 conn_trace_t *ctb; 2824 2825 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2826 last = connp->conn_trace_last; 2827 last++; 2828 if (last == CONN_TRACE_MAX) 2829 last = 0; 2830 2831 ctb = &connp->conn_trace_buf[last]; 2832 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2833 connp->conn_trace_last = last; 2834 return (1); 2835 } 2836 #endif 2837