1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
A reference is incremented when connection 50 * is placed in a list or table, when incoming packet for the connection arrives 51 * and when connection is processed via squeue (squeue processing may be 52 * asynchronous and the reference protects the connection from being destroyed 53 * before its processing is finished). 54 * 55 * conn_recv is used to pass up packets to the ULP. 56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 57 * a listener, and changes to tcp_input_listener as the listener has picked a 58 * good squeue. For other cases it is set to tcp_input_data. 59 * 60 * conn_recvicmp is used to pass up ICMP errors to the ULP. 61 * 62 * Classifier uses several hash tables: 63 * 64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 65 * ipcl_bind_fanout: contains all connections in BOUND state 66 * ipcl_proto_fanout: IPv4 protocol fanout 67 * ipcl_proto_fanout_v6: IPv6 protocol fanout 68 * ipcl_udp_fanout: contains all UDP connections 69 * ipcl_iptun_fanout: contains all IP tunnel connections 70 * ipcl_globalhash_fanout: contains all connections 71 * 72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 73 * which need to view all existing connections. 74 * 75 * All tables are protected by per-bucket locks. When both per-bucket lock and 76 * connection lock need to be held, the per-bucket lock should be acquired 77 * first, followed by the connection lock. 78 * 79 * All functions doing search in one of these tables increment a reference 80 * counter on the connection found (if any). This reference should be dropped 81 * when the caller has finished processing the connection. 82 * 83 * 84 * INTERFACES: 85 * =========== 86 * 87 * Connection Lookup: 88 * ------------------ 89 * 90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 92 * 93 * Finds connection for an incoming IPv4 or IPv6 packet. 
Returns NULL if 94 * it can't find any associated connection. If the connection is found, its 95 * reference counter is incremented. 96 * 97 * mp: mblock, containing packet header. The full header should fit 98 * into a single mblock. It should also contain at least full IP 99 * and TCP or UDP header. 100 * 101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 102 * 103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 104 * the packet. 105 * 106 * ira->ira_zoneid: The zone in which the returned connection must be; the 107 * zoneid corresponding to the ire_zoneid on the IRE located for 108 * the packet's destination address. 109 * 110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and 111 * IRAF_TX_SHARED_ADDR flags 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packets belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 130 * 131 * - If it's a multilevel port, then only the zone registered to receive 132 * packets on that port matches. 133 * 134 * Also, in a labeled environment, packet labels need to be checked. 
For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp);
 * int ipcl_conn_insert_v4(connp);
 * int ipcl_conn_insert_v6(connp);
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp);
 * int ipcl_bind_insert_v4(connp);
 * int ipcl_bind_insert_v6(connp);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
222 * IPCL_TCPCONN indicates a TCP connection 223 * IPCL_SCTPCONN indicates a SCTP connection 224 * IPCL_UDPCONN indicates a UDP conn_t. 225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 226 * IPCL_RTSCONN indicates a RTS conn_t. 227 * IPCL_IPCCONN indicates all other connections. 228 * 229 * void ipcl_conn_destroy(connp) 230 * 231 * Destroys the connection state, removes it from the global 232 * connection hash table and frees its memory. 233 */ 234 235 #include <sys/types.h> 236 #include <sys/stream.h> 237 #include <sys/stropts.h> 238 #include <sys/sysmacros.h> 239 #include <sys/strsubr.h> 240 #include <sys/strsun.h> 241 #define _SUN_TPI_VERSION 2 242 #include <sys/ddi.h> 243 #include <sys/cmn_err.h> 244 #include <sys/debug.h> 245 246 #include <sys/systm.h> 247 #include <sys/param.h> 248 #include <sys/kmem.h> 249 #include <sys/isa_defs.h> 250 #include <inet/common.h> 251 #include <netinet/ip6.h> 252 #include <netinet/icmp6.h> 253 254 #include <inet/ip.h> 255 #include <inet/ip_if.h> 256 #include <inet/ip_ire.h> 257 #include <inet/ip6.h> 258 #include <inet/ip_ndp.h> 259 #include <inet/ip_impl.h> 260 #include <inet/udp_impl.h> 261 #include <inet/sctp_ip.h> 262 #include <inet/sctp/sctp_impl.h> 263 #include <inet/rawip_impl.h> 264 #include <inet/rts_impl.h> 265 #include <inet/iptun/iptun_impl.h> 266 267 #include <sys/cpuvar.h> 268 269 #include <inet/ipclassifier.h> 270 #include <inet/tcp.h> 271 #include <inet/ipsec_impl.h> 272 273 #include <sys/tsol/tnet.h> 274 #include <sys/sockio.h> 275 276 /* Old value for compatibility. Setable in /etc/system */ 277 uint_t tcp_conn_hash_size = 0; 278 279 /* New value. Zero means choose automatically. Setable in /etc/system */ 280 uint_t ipcl_conn_hash_size = 0; 281 uint_t ipcl_conn_hash_memfactor = 8192; 282 uint_t ipcl_conn_hash_maxsize = 82500; 283 284 /* bind/udp fanout table size */ 285 uint_t ipcl_bind_fanout_size = 512; 286 uint_t ipcl_udp_fanout_size = 16384; 287 288 /* Raw socket fanout size. Must be a power of 2. 
*/ 289 uint_t ipcl_raw_fanout_size = 256; 290 291 /* 292 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 293 * expect that most large deployments would have hundreds of tunnels, and 294 * thousands in the extreme case. 295 */ 296 uint_t ipcl_iptun_fanout_size = 6143; 297 298 /* 299 * Power of 2^N Primes useful for hashing for N of 0-28, 300 * these primes are the nearest prime <= 2^N - 2^(N-2). 301 */ 302 303 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 304 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 305 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 306 50331599, 100663291, 201326557, 0} 307 308 /* 309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 310 * are aligned on cache lines. 311 */ 312 typedef union itc_s { 313 conn_t itc_conn; 314 char itcu_filler[CACHE_ALIGN(conn_s)]; 315 } itc_t; 316 317 struct kmem_cache *tcp_conn_cache; 318 struct kmem_cache *ip_conn_cache; 319 extern struct kmem_cache *sctp_conn_cache; 320 extern struct kmem_cache *tcp_sack_info_cache; 321 struct kmem_cache *udp_conn_cache; 322 struct kmem_cache *rawip_conn_cache; 323 struct kmem_cache *rts_conn_cache; 324 325 extern void tcp_timermp_free(tcp_t *); 326 extern mblk_t *tcp_timermp_alloc(int); 327 328 static int ip_conn_constructor(void *, void *, int); 329 static void ip_conn_destructor(void *, void *); 330 331 static int tcp_conn_constructor(void *, void *, int); 332 static void tcp_conn_destructor(void *, void *); 333 334 static int udp_conn_constructor(void *, void *, int); 335 static void udp_conn_destructor(void *, void *); 336 337 static int rawip_conn_constructor(void *, void *, int); 338 static void rawip_conn_destructor(void *, void *); 339 340 static int rts_conn_constructor(void *, void *, int); 341 static void rts_conn_destructor(void *, void *); 342 343 /* 344 * Global (for all stack instances) init routine 345 */ 346 void 347 ipcl_g_init(void) 348 { 349 ip_conn_cache = 
kmem_cache_create("ip_conn_cache", 350 sizeof (conn_t), CACHE_ALIGN_SIZE, 351 ip_conn_constructor, ip_conn_destructor, 352 NULL, NULL, NULL, 0); 353 354 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 355 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 356 tcp_conn_constructor, tcp_conn_destructor, 357 tcp_conn_reclaim, NULL, NULL, 0); 358 359 udp_conn_cache = kmem_cache_create("udp_conn_cache", 360 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 361 udp_conn_constructor, udp_conn_destructor, 362 NULL, NULL, NULL, 0); 363 364 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 365 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 366 rawip_conn_constructor, rawip_conn_destructor, 367 NULL, NULL, NULL, 0); 368 369 rts_conn_cache = kmem_cache_create("rts_conn_cache", 370 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 371 rts_conn_constructor, rts_conn_destructor, 372 NULL, NULL, NULL, 0); 373 } 374 375 /* 376 * ipclassifier intialization routine, sets up hash tables. 
377 */ 378 void 379 ipcl_init(ip_stack_t *ipst) 380 { 381 int i; 382 int sizes[] = P2Ps(); 383 384 /* 385 * Calculate size of conn fanout table from /etc/system settings 386 */ 387 if (ipcl_conn_hash_size != 0) { 388 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 389 } else if (tcp_conn_hash_size != 0) { 390 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 391 } else { 392 extern pgcnt_t freemem; 393 394 ipst->ips_ipcl_conn_fanout_size = 395 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 396 397 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 398 ipst->ips_ipcl_conn_fanout_size = 399 ipcl_conn_hash_maxsize; 400 } 401 } 402 403 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 404 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 405 break; 406 } 407 } 408 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 409 /* Out of range, use the 2^16 value */ 410 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 411 } 412 413 /* Take values from /etc/system */ 414 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 415 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 416 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 417 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 418 419 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 420 421 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 422 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 423 424 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 425 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 426 MUTEX_DEFAULT, NULL); 427 } 428 429 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 430 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 431 432 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 433 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 434 MUTEX_DEFAULT, NULL); 435 } 436 437 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 438 sizeof (connf_t), KM_SLEEP); 439 for (i = 0; i < IPPROTO_MAX; i++) { 440 
mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 441 MUTEX_DEFAULT, NULL); 442 } 443 444 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 445 sizeof (connf_t), KM_SLEEP); 446 for (i = 0; i < IPPROTO_MAX; i++) { 447 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 448 MUTEX_DEFAULT, NULL); 449 } 450 451 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 452 mutex_init(&ipst->ips_rts_clients->connf_lock, 453 NULL, MUTEX_DEFAULT, NULL); 454 455 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 456 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 457 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 458 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 459 MUTEX_DEFAULT, NULL); 460 } 461 462 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 463 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 464 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 465 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 466 MUTEX_DEFAULT, NULL); 467 } 468 469 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 470 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 471 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 472 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 473 MUTEX_DEFAULT, NULL); 474 } 475 476 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 477 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 478 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 479 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 480 NULL, MUTEX_DEFAULT, NULL); 481 } 482 } 483 484 void 485 ipcl_g_destroy(void) 486 { 487 kmem_cache_destroy(ip_conn_cache); 488 kmem_cache_destroy(tcp_conn_cache); 489 kmem_cache_destroy(udp_conn_cache); 490 kmem_cache_destroy(rawip_conn_cache); 491 kmem_cache_destroy(rts_conn_cache); 492 } 493 494 /* 495 * All user-level and kernel use of the stack must be gone 496 * by now. 
497 */ 498 void 499 ipcl_destroy(ip_stack_t *ipst) 500 { 501 int i; 502 503 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 504 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 505 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 506 } 507 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 508 sizeof (connf_t)); 509 ipst->ips_ipcl_conn_fanout = NULL; 510 511 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 512 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 513 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 514 } 515 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 516 sizeof (connf_t)); 517 ipst->ips_ipcl_bind_fanout = NULL; 518 519 for (i = 0; i < IPPROTO_MAX; i++) { 520 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 521 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 522 } 523 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 524 IPPROTO_MAX * sizeof (connf_t)); 525 ipst->ips_ipcl_proto_fanout_v4 = NULL; 526 527 for (i = 0; i < IPPROTO_MAX; i++) { 528 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 529 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 530 } 531 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 532 IPPROTO_MAX * sizeof (connf_t)); 533 ipst->ips_ipcl_proto_fanout_v6 = NULL; 534 535 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 536 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 537 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 538 } 539 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 540 sizeof (connf_t)); 541 ipst->ips_ipcl_udp_fanout = NULL; 542 543 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 544 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 545 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 546 } 547 kmem_free(ipst->ips_ipcl_iptun_fanout, 548 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 549 
ipst->ips_ipcl_iptun_fanout = NULL; 550 551 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 552 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 553 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 554 } 555 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 556 sizeof (connf_t)); 557 ipst->ips_ipcl_raw_fanout = NULL; 558 559 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 560 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 561 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 562 } 563 kmem_free(ipst->ips_ipcl_globalhash_fanout, 564 sizeof (connf_t) * CONN_G_HASH_SIZE); 565 ipst->ips_ipcl_globalhash_fanout = NULL; 566 567 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 568 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 569 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 570 ipst->ips_rts_clients = NULL; 571 } 572 573 /* 574 * conn creation routine. initialize the conn, sets the reference 575 * and inserts it in the global hash table. 
576 */ 577 conn_t * 578 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 579 { 580 conn_t *connp; 581 struct kmem_cache *conn_cache; 582 583 switch (type) { 584 case IPCL_SCTPCONN: 585 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 586 return (NULL); 587 sctp_conn_init(connp); 588 netstack_hold(ns); 589 connp->conn_netstack = ns; 590 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 591 ipcl_globalhash_insert(connp); 592 return (connp); 593 594 case IPCL_TCPCONN: 595 conn_cache = tcp_conn_cache; 596 break; 597 598 case IPCL_UDPCONN: 599 conn_cache = udp_conn_cache; 600 break; 601 602 case IPCL_RAWIPCONN: 603 conn_cache = rawip_conn_cache; 604 break; 605 606 case IPCL_RTSCONN: 607 conn_cache = rts_conn_cache; 608 break; 609 610 case IPCL_IPCCONN: 611 conn_cache = ip_conn_cache; 612 break; 613 614 default: 615 connp = NULL; 616 ASSERT(0); 617 } 618 619 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 620 return (NULL); 621 622 connp->conn_ref = 1; 623 netstack_hold(ns); 624 connp->conn_netstack = ns; 625 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 626 ipcl_globalhash_insert(connp); 627 return (connp); 628 } 629 630 void 631 ipcl_conn_destroy(conn_t *connp) 632 { 633 mblk_t *mp; 634 netstack_t *ns = connp->conn_netstack; 635 636 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 637 ASSERT(connp->conn_ref == 0); 638 639 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 640 641 if (connp->conn_cred != NULL) { 642 crfree(connp->conn_cred); 643 connp->conn_cred = NULL; 644 /* ixa_cred done in ipcl_conn_cleanup below */ 645 } 646 647 if (connp->conn_ht_iphc != NULL) { 648 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 649 connp->conn_ht_iphc = NULL; 650 connp->conn_ht_iphc_allocated = 0; 651 connp->conn_ht_iphc_len = 0; 652 connp->conn_ht_ulp = NULL; 653 connp->conn_ht_ulp_len = 0; 654 } 655 ip_pkt_free(&connp->conn_xmit_ipp); 656 657 ipcl_globalhash_remove(connp); 658 659 if (connp->conn_latch != NULL) { 660 
IPLATCH_REFRELE(connp->conn_latch); 661 connp->conn_latch = NULL; 662 } 663 if (connp->conn_latch_in_policy != NULL) { 664 IPPOL_REFRELE(connp->conn_latch_in_policy); 665 connp->conn_latch_in_policy = NULL; 666 } 667 if (connp->conn_latch_in_action != NULL) { 668 IPACT_REFRELE(connp->conn_latch_in_action); 669 connp->conn_latch_in_action = NULL; 670 } 671 if (connp->conn_policy != NULL) { 672 IPPH_REFRELE(connp->conn_policy, ns); 673 connp->conn_policy = NULL; 674 } 675 676 if (connp->conn_ipsec_opt_mp != NULL) { 677 freemsg(connp->conn_ipsec_opt_mp); 678 connp->conn_ipsec_opt_mp = NULL; 679 } 680 681 if (connp->conn_flags & IPCL_TCPCONN) { 682 tcp_t *tcp = connp->conn_tcp; 683 684 tcp_free(tcp); 685 mp = tcp->tcp_timercache; 686 687 tcp->tcp_tcps = NULL; 688 689 if (tcp->tcp_sack_info != NULL) { 690 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 691 kmem_cache_free(tcp_sack_info_cache, 692 tcp->tcp_sack_info); 693 } 694 695 /* 696 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 697 * the mblk. 698 */ 699 if (tcp->tcp_rsrv_mp != NULL) { 700 freeb(tcp->tcp_rsrv_mp); 701 tcp->tcp_rsrv_mp = NULL; 702 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 703 } 704 705 ipcl_conn_cleanup(connp); 706 connp->conn_flags = IPCL_TCPCONN; 707 if (ns != NULL) { 708 ASSERT(tcp->tcp_tcps == NULL); 709 connp->conn_netstack = NULL; 710 connp->conn_ixa->ixa_ipst = NULL; 711 netstack_rele(ns); 712 } 713 714 bzero(tcp, sizeof (tcp_t)); 715 716 tcp->tcp_timercache = mp; 717 tcp->tcp_connp = connp; 718 kmem_cache_free(tcp_conn_cache, connp); 719 return; 720 } 721 722 if (connp->conn_flags & IPCL_SCTPCONN) { 723 ASSERT(ns != NULL); 724 sctp_free(connp); 725 return; 726 } 727 728 ipcl_conn_cleanup(connp); 729 if (ns != NULL) { 730 connp->conn_netstack = NULL; 731 connp->conn_ixa->ixa_ipst = NULL; 732 netstack_rele(ns); 733 } 734 735 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 736 if (connp->conn_flags & IPCL_UDPCONN) { 737 connp->conn_flags = IPCL_UDPCONN; 738 kmem_cache_free(udp_conn_cache, connp); 739 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 740 connp->conn_flags = IPCL_RAWIPCONN; 741 connp->conn_proto = IPPROTO_ICMP; 742 connp->conn_ixa->ixa_protocol = connp->conn_proto; 743 kmem_cache_free(rawip_conn_cache, connp); 744 } else if (connp->conn_flags & IPCL_RTSCONN) { 745 connp->conn_flags = IPCL_RTSCONN; 746 kmem_cache_free(rts_conn_cache, connp); 747 } else { 748 connp->conn_flags = IPCL_IPCCONN; 749 ASSERT(connp->conn_flags & IPCL_IPCCONN); 750 ASSERT(connp->conn_priv == NULL); 751 kmem_cache_free(ip_conn_cache, connp); 752 } 753 } 754 755 /* 756 * Running in cluster mode - deregister listener information 757 */ 758 static void 759 ipcl_conn_unlisten(conn_t *connp) 760 { 761 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 762 ASSERT(connp->conn_lport != 0); 763 764 if (cl_inet_unlisten != NULL) { 765 sa_family_t addr_family; 766 uint8_t *laddrp; 767 768 if (connp->conn_ipversion == IPV6_VERSION) { 769 addr_family = AF_INET6; 770 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 771 } else { 772 addr_family = AF_INET; 773 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 774 } 775 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 776 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 777 } 778 connp->conn_flags &= ~IPCL_CL_LISTENER; 779 } 780 781 /* 782 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 783 * which table the conn belonged to). So for debugging we can see which hash 784 * table this connection was in. 
785 */ 786 #define IPCL_HASH_REMOVE(connp) { \ 787 connf_t *connfp = (connp)->conn_fanout; \ 788 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 789 if (connfp != NULL) { \ 790 mutex_enter(&connfp->connf_lock); \ 791 if ((connp)->conn_next != NULL) \ 792 (connp)->conn_next->conn_prev = \ 793 (connp)->conn_prev; \ 794 if ((connp)->conn_prev != NULL) \ 795 (connp)->conn_prev->conn_next = \ 796 (connp)->conn_next; \ 797 else \ 798 connfp->connf_head = (connp)->conn_next; \ 799 (connp)->conn_fanout = NULL; \ 800 (connp)->conn_next = NULL; \ 801 (connp)->conn_prev = NULL; \ 802 (connp)->conn_flags |= IPCL_REMOVED; \ 803 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 804 ipcl_conn_unlisten((connp)); \ 805 CONN_DEC_REF((connp)); \ 806 mutex_exit(&connfp->connf_lock); \ 807 } \ 808 } 809 810 void 811 ipcl_hash_remove(conn_t *connp) 812 { 813 uint8_t protocol = connp->conn_proto; 814 815 IPCL_HASH_REMOVE(connp); 816 if (protocol == IPPROTO_RSVP) 817 ill_set_inputfn_all(connp->conn_netstack->netstack_ip); 818 } 819 820 /* 821 * The whole purpose of this function is allow removal of 822 * a conn_t from the connected hash for timewait reclaim. 823 * This is essentially a TW reclaim fastpath where timewait 824 * collector checks under fanout lock (so no one else can 825 * get access to the conn_t) that refcnt is 2 i.e. one for 826 * TCP and one for the classifier hash list. If ref count 827 * is indeed 2, we can just remove the conn under lock and 828 * avoid cleaning up the conn under squeue. This gives us 829 * improved performance. 
830 */ 831 void 832 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 833 { 834 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 835 ASSERT(MUTEX_HELD(&connp->conn_lock)); 836 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 837 838 if ((connp)->conn_next != NULL) { 839 (connp)->conn_next->conn_prev = (connp)->conn_prev; 840 } 841 if ((connp)->conn_prev != NULL) { 842 (connp)->conn_prev->conn_next = (connp)->conn_next; 843 } else { 844 connfp->connf_head = (connp)->conn_next; 845 } 846 (connp)->conn_fanout = NULL; 847 (connp)->conn_next = NULL; 848 (connp)->conn_prev = NULL; 849 (connp)->conn_flags |= IPCL_REMOVED; 850 ASSERT((connp)->conn_ref == 2); 851 (connp)->conn_ref--; 852 } 853 854 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 855 ASSERT((connp)->conn_fanout == NULL); \ 856 ASSERT((connp)->conn_next == NULL); \ 857 ASSERT((connp)->conn_prev == NULL); \ 858 if ((connfp)->connf_head != NULL) { \ 859 (connfp)->connf_head->conn_prev = (connp); \ 860 (connp)->conn_next = (connfp)->connf_head; \ 861 } \ 862 (connp)->conn_fanout = (connfp); \ 863 (connfp)->connf_head = (connp); \ 864 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 865 IPCL_CONNECTED; \ 866 CONN_INC_REF(connp); \ 867 } 868 869 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 870 IPCL_HASH_REMOVE((connp)); \ 871 mutex_enter(&(connfp)->connf_lock); \ 872 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 873 mutex_exit(&(connfp)->connf_lock); \ 874 } 875 876 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 877 conn_t *pconnp = NULL, *nconnp; \ 878 IPCL_HASH_REMOVE((connp)); \ 879 mutex_enter(&(connfp)->connf_lock); \ 880 nconnp = (connfp)->connf_head; \ 881 while (nconnp != NULL && \ 882 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 883 pconnp = nconnp; \ 884 nconnp = nconnp->conn_next; \ 885 } \ 886 if (pconnp != NULL) { \ 887 pconnp->conn_next = (connp); \ 888 (connp)->conn_prev = pconnp; \ 889 } else { \ 890 (connfp)->connf_head = (connp); \ 891 } \ 892 if 
(nconnp != NULL) { \ 893 (connp)->conn_next = nconnp; \ 894 nconnp->conn_prev = (connp); \ 895 } \ 896 (connp)->conn_fanout = (connfp); \ 897 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 898 IPCL_BOUND; \ 899 CONN_INC_REF(connp); \ 900 mutex_exit(&(connfp)->connf_lock); \ 901 } 902 903 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 904 conn_t **list, *prev, *next; \ 905 boolean_t isv4mapped = \ 906 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 907 IPCL_HASH_REMOVE((connp)); \ 908 mutex_enter(&(connfp)->connf_lock); \ 909 list = &(connfp)->connf_head; \ 910 prev = NULL; \ 911 while ((next = *list) != NULL) { \ 912 if (isv4mapped && \ 913 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 914 connp->conn_zoneid == next->conn_zoneid) { \ 915 (connp)->conn_next = next; \ 916 if (prev != NULL) \ 917 prev = next->conn_prev; \ 918 next->conn_prev = (connp); \ 919 break; \ 920 } \ 921 list = &next->conn_next; \ 922 prev = next; \ 923 } \ 924 (connp)->conn_prev = prev; \ 925 *list = (connp); \ 926 (connp)->conn_fanout = (connfp); \ 927 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 928 IPCL_BOUND; \ 929 CONN_INC_REF((connp)); \ 930 mutex_exit(&(connfp)->connf_lock); \ 931 } 932 933 void 934 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 935 { 936 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 937 } 938 939 /* 940 * Because the classifier is used to classify inbound packets, the destination 941 * address is meant to be our local tunnel address (tunnel source), and the 942 * source the remote tunnel address (tunnel destination). 943 * 944 * Note that conn_proto can't be used for fanout since the upper protocol 945 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
946 */ 947 conn_t * 948 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 949 { 950 connf_t *connfp; 951 conn_t *connp; 952 953 /* first look for IPv4 tunnel links */ 954 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 955 mutex_enter(&connfp->connf_lock); 956 for (connp = connfp->connf_head; connp != NULL; 957 connp = connp->conn_next) { 958 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 959 break; 960 } 961 if (connp != NULL) 962 goto done; 963 964 mutex_exit(&connfp->connf_lock); 965 966 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 967 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 968 INADDR_ANY)]; 969 mutex_enter(&connfp->connf_lock); 970 for (connp = connfp->connf_head; connp != NULL; 971 connp = connp->conn_next) { 972 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 973 break; 974 } 975 done: 976 if (connp != NULL) 977 CONN_INC_REF(connp); 978 mutex_exit(&connfp->connf_lock); 979 return (connp); 980 } 981 982 conn_t * 983 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 984 { 985 connf_t *connfp; 986 conn_t *connp; 987 988 /* Look for an IPv6 tunnel link */ 989 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 990 mutex_enter(&connfp->connf_lock); 991 for (connp = connfp->connf_head; connp != NULL; 992 connp = connp->conn_next) { 993 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 994 CONN_INC_REF(connp); 995 break; 996 } 997 } 998 mutex_exit(&connfp->connf_lock); 999 return (connp); 1000 } 1001 1002 /* 1003 * This function is used only for inserting SCTP raw socket now. 1004 * This may change later. 1005 * 1006 * Note that only one raw socket can be bound to a port. The param 1007 * lport is in network byte order. 
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict when port, zone and family match and either
		 * side's local address is a wildcard (unspecified or
		 * v4-mapped any), or the two local addresses are equal.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Choose the insert flavor from how much of the address tuple is
	 * specified: no addresses -> wildcard, local only -> bound,
	 * both -> connected.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}

/*
 * Insert an IPv4 tunnel conn into the iptun fanout.  Returns EADDRINUSE
 * if a tunnel is already bound to the same local/remote address pair.
 */
static int
ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	ipaddr_t laddr = connp->conn_laddr_v4;
	ipaddr_t faddr = connp->conn_faddr_v4;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	/* Bucket lock already held; use the _LOCKED insert variant. */
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/*
 * IPv6 counterpart of ipcl_iptun_hash_insert().
 */
static int
ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	in6_addr_t *laddr = &connp->conn_laddr_v6;
	in6_addr_t *faddr = &connp->conn_faddr_v6;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	/* Bucket lock already held; use the _LOCKED insert variant. */
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/*
 * Check for a MAC exemption conflict on a labeled system.  Note that for
 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
 * transport layer.  This check is for binding all other protocols.
 *
 * Returns true if there's a conflict.
1104 */ 1105 static boolean_t 1106 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1107 { 1108 connf_t *connfp; 1109 conn_t *tconn; 1110 1111 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1112 mutex_enter(&connfp->connf_lock); 1113 for (tconn = connfp->connf_head; tconn != NULL; 1114 tconn = tconn->conn_next) { 1115 /* We don't allow v4 fallback for v6 raw socket */ 1116 if (connp->conn_family != tconn->conn_family) 1117 continue; 1118 /* If neither is exempt, then there's no conflict */ 1119 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1120 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1121 continue; 1122 /* We are only concerned about sockets for a different zone */ 1123 if (connp->conn_zoneid == tconn->conn_zoneid) 1124 continue; 1125 /* If both are bound to different specific addrs, ok */ 1126 if (connp->conn_laddr_v4 != INADDR_ANY && 1127 tconn->conn_laddr_v4 != INADDR_ANY && 1128 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1129 continue; 1130 /* These two conflict; fail */ 1131 break; 1132 } 1133 mutex_exit(&connfp->connf_lock); 1134 return (tconn != NULL); 1135 } 1136 1137 static boolean_t 1138 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1139 { 1140 connf_t *connfp; 1141 conn_t *tconn; 1142 1143 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1144 mutex_enter(&connfp->connf_lock); 1145 for (tconn = connfp->connf_head; tconn != NULL; 1146 tconn = tconn->conn_next) { 1147 /* We don't allow v4 fallback for v6 raw socket */ 1148 if (connp->conn_family != tconn->conn_family) 1149 continue; 1150 /* If neither is exempt, then there's no conflict */ 1151 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1152 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1153 continue; 1154 /* We are only concerned about sockets for a different zone */ 1155 if (connp->conn_zoneid == tconn->conn_zoneid) 1156 continue; 1157 /* If both are bound to different addrs, ok */ 1158 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1159 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1160 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1161 &tconn->conn_laddr_v6)) 1162 continue; 1163 /* These two conflict; fail */ 1164 break; 1165 } 1166 mutex_exit(&connfp->connf_lock); 1167 return (tconn != NULL); 1168 } 1169 1170 /* 1171 * (v4, v6) bind hash insertion routines 1172 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1173 */ 1174 1175 int 1176 ipcl_bind_insert(conn_t *connp) 1177 { 1178 if (connp->conn_ipversion == IPV6_VERSION) 1179 return (ipcl_bind_insert_v6(connp)); 1180 else 1181 return (ipcl_bind_insert_v4(connp)); 1182 } 1183 1184 int 1185 ipcl_bind_insert_v4(conn_t *connp) 1186 { 1187 connf_t *connfp; 1188 int ret = 0; 1189 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1190 uint16_t lport = connp->conn_lport; 1191 uint8_t protocol = connp->conn_proto; 1192 1193 if (IPCL_IS_IPTUN(connp)) 1194 return (ipcl_iptun_hash_insert(connp, ipst)); 1195 1196 switch (protocol) { 1197 default: 1198 if (is_system_labeled() && 1199 check_exempt_conflict_v4(connp, ipst)) 1200 return (EADDRINUSE); 1201 /* FALLTHROUGH */ 1202 case IPPROTO_UDP: 1203 if (protocol == IPPROTO_UDP) { 1204 connfp = &ipst->ips_ipcl_udp_fanout[ 1205 IPCL_UDP_HASH(lport, ipst)]; 1206 } else { 1207 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1208 } 1209 1210 if (connp->conn_faddr_v4 != INADDR_ANY) { 1211 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1212 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1213 IPCL_HASH_INSERT_BOUND(connfp, connp); 1214 } else { 1215 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1216 } 1217 if (protocol == IPPROTO_RSVP) 1218 ill_set_inputfn_all(ipst); 1219 break; 1220 1221 case IPPROTO_TCP: 1222 /* Insert it in the Bind Hash */ 1223 ASSERT(connp->conn_zoneid != ALL_ZONES); 1224 connfp = &ipst->ips_ipcl_bind_fanout[ 1225 IPCL_BIND_HASH(lport, ipst)]; 1226 if (connp->conn_laddr_v4 != INADDR_ANY) { 1227 IPCL_HASH_INSERT_BOUND(connfp, connp); 1228 } else { 1229 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1230 } 1231 if (cl_inet_listen != NULL) { 1232 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1233 connp->conn_flags |= IPCL_CL_LISTENER; 1234 (*cl_inet_listen)( 1235 connp->conn_netstack->netstack_stackid, 1236 IPPROTO_TCP, AF_INET, 1237 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1238 } 1239 break; 1240 1241 case IPPROTO_SCTP: 1242 ret = ipcl_sctp_hash_insert(connp, lport); 1243 break; 1244 } 1245 1246 return (ret); 1247 } 1248 1249 int 1250 ipcl_bind_insert_v6(conn_t *connp) 1251 { 1252 connf_t *connfp; 1253 int ret = 0; 1254 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1255 uint16_t lport = connp->conn_lport; 1256 uint8_t protocol = connp->conn_proto; 1257 1258 if (IPCL_IS_IPTUN(connp)) { 1259 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1260 } 1261 1262 switch (protocol) { 1263 default: 1264 if (is_system_labeled() && 1265 check_exempt_conflict_v6(connp, ipst)) 1266 return (EADDRINUSE); 1267 /* FALLTHROUGH */ 1268 case IPPROTO_UDP: 1269 if (protocol == IPPROTO_UDP) { 1270 connfp = &ipst->ips_ipcl_udp_fanout[ 1271 IPCL_UDP_HASH(lport, ipst)]; 1272 } else { 1273 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1274 } 1275 1276 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1277 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1278 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1279 IPCL_HASH_INSERT_BOUND(connfp, connp); 1280 } else { 1281 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1282 } 1283 break; 1284 1285 case IPPROTO_TCP: 1286 /* Insert it in the Bind Hash */ 1287 ASSERT(connp->conn_zoneid != ALL_ZONES); 1288 connfp = &ipst->ips_ipcl_bind_fanout[ 1289 IPCL_BIND_HASH(lport, ipst)]; 1290 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1291 IPCL_HASH_INSERT_BOUND(connfp, connp); 1292 } else { 1293 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1294 } 1295 if (cl_inet_listen != NULL) { 1296 sa_family_t addr_family; 1297 uint8_t *laddrp; 1298 1299 if (connp->conn_ipversion == 
IPV6_VERSION) { 1300 addr_family = AF_INET6; 1301 laddrp = 1302 (uint8_t *)&connp->conn_bound_addr_v6; 1303 } else { 1304 addr_family = AF_INET; 1305 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1306 } 1307 connp->conn_flags |= IPCL_CL_LISTENER; 1308 (*cl_inet_listen)( 1309 connp->conn_netstack->netstack_stackid, 1310 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1311 } 1312 break; 1313 1314 case IPPROTO_SCTP: 1315 ret = ipcl_sctp_hash_insert(connp, lport); 1316 break; 1317 } 1318 1319 return (ret); 1320 } 1321 1322 /* 1323 * ipcl_conn_hash insertion routines. 1324 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1325 */ 1326 1327 int 1328 ipcl_conn_insert(conn_t *connp) 1329 { 1330 if (connp->conn_ipversion == IPV6_VERSION) 1331 return (ipcl_conn_insert_v6(connp)); 1332 else 1333 return (ipcl_conn_insert_v4(connp)); 1334 } 1335 1336 int 1337 ipcl_conn_insert_v4(conn_t *connp) 1338 { 1339 connf_t *connfp; 1340 conn_t *tconnp; 1341 int ret = 0; 1342 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1343 uint16_t lport = connp->conn_lport; 1344 uint8_t protocol = connp->conn_proto; 1345 1346 if (IPCL_IS_IPTUN(connp)) 1347 return (ipcl_iptun_hash_insert(connp, ipst)); 1348 1349 switch (protocol) { 1350 case IPPROTO_TCP: 1351 /* 1352 * For TCP, we check whether the connection tuple already 1353 * exists before allowing the connection to proceed. We 1354 * also allow indexing on the zoneid. This is to allow 1355 * multiple shared stack zones to have the same tcp 1356 * connection tuple. In practice this only happens for 1357 * INADDR_LOOPBACK as it's the only local address which 1358 * doesn't have to be unique. 
1359 */ 1360 connfp = &ipst->ips_ipcl_conn_fanout[ 1361 IPCL_CONN_HASH(connp->conn_faddr_v4, 1362 connp->conn_ports, ipst)]; 1363 mutex_enter(&connfp->connf_lock); 1364 for (tconnp = connfp->connf_head; tconnp != NULL; 1365 tconnp = tconnp->conn_next) { 1366 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1367 connp->conn_faddr_v4, connp->conn_laddr_v4, 1368 connp->conn_ports) && 1369 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1370 /* Already have a conn. bail out */ 1371 mutex_exit(&connfp->connf_lock); 1372 return (EADDRINUSE); 1373 } 1374 } 1375 if (connp->conn_fanout != NULL) { 1376 /* 1377 * Probably a XTI/TLI application trying to do a 1378 * rebind. Let it happen. 1379 */ 1380 mutex_exit(&connfp->connf_lock); 1381 IPCL_HASH_REMOVE(connp); 1382 mutex_enter(&connfp->connf_lock); 1383 } 1384 1385 ASSERT(connp->conn_recv != NULL); 1386 ASSERT(connp->conn_recvicmp != NULL); 1387 1388 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1389 mutex_exit(&connfp->connf_lock); 1390 break; 1391 1392 case IPPROTO_SCTP: 1393 /* 1394 * The raw socket may have already been bound, remove it 1395 * from the hash first. 1396 */ 1397 IPCL_HASH_REMOVE(connp); 1398 ret = ipcl_sctp_hash_insert(connp, lport); 1399 break; 1400 1401 default: 1402 /* 1403 * Check for conflicts among MAC exempt bindings. For 1404 * transports with port numbers, this is done by the upper 1405 * level per-transport binding logic. For all others, it's 1406 * done here. 
1407 */ 1408 if (is_system_labeled() && 1409 check_exempt_conflict_v4(connp, ipst)) 1410 return (EADDRINUSE); 1411 /* FALLTHROUGH */ 1412 1413 case IPPROTO_UDP: 1414 if (protocol == IPPROTO_UDP) { 1415 connfp = &ipst->ips_ipcl_udp_fanout[ 1416 IPCL_UDP_HASH(lport, ipst)]; 1417 } else { 1418 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1419 } 1420 1421 if (connp->conn_faddr_v4 != INADDR_ANY) { 1422 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1423 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1424 IPCL_HASH_INSERT_BOUND(connfp, connp); 1425 } else { 1426 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1427 } 1428 break; 1429 } 1430 1431 return (ret); 1432 } 1433 1434 int 1435 ipcl_conn_insert_v6(conn_t *connp) 1436 { 1437 connf_t *connfp; 1438 conn_t *tconnp; 1439 int ret = 0; 1440 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1441 uint16_t lport = connp->conn_lport; 1442 uint8_t protocol = connp->conn_proto; 1443 uint_t ifindex = connp->conn_bound_if; 1444 1445 if (IPCL_IS_IPTUN(connp)) 1446 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1447 1448 switch (protocol) { 1449 case IPPROTO_TCP: 1450 1451 /* 1452 * For tcp, we check whether the connection tuple already 1453 * exists before allowing the connection to proceed. We 1454 * also allow indexing on the zoneid. This is to allow 1455 * multiple shared stack zones to have the same tcp 1456 * connection tuple. In practice this only happens for 1457 * ipv6_loopback as it's the only local address which 1458 * doesn't have to be unique. 1459 */ 1460 connfp = &ipst->ips_ipcl_conn_fanout[ 1461 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1462 ipst)]; 1463 mutex_enter(&connfp->connf_lock); 1464 for (tconnp = connfp->connf_head; tconnp != NULL; 1465 tconnp = tconnp->conn_next) { 1466 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1467 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1468 connp->conn_faddr_v6, connp->conn_laddr_v6, 1469 connp->conn_ports) && 1470 (tconnp->conn_bound_if == 0 || 1471 tconnp->conn_bound_if == ifindex) && 1472 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1473 /* Already have a conn. bail out */ 1474 mutex_exit(&connfp->connf_lock); 1475 return (EADDRINUSE); 1476 } 1477 } 1478 if (connp->conn_fanout != NULL) { 1479 /* 1480 * Probably a XTI/TLI application trying to do a 1481 * rebind. Let it happen. 1482 */ 1483 mutex_exit(&connfp->connf_lock); 1484 IPCL_HASH_REMOVE(connp); 1485 mutex_enter(&connfp->connf_lock); 1486 } 1487 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1488 mutex_exit(&connfp->connf_lock); 1489 break; 1490 1491 case IPPROTO_SCTP: 1492 IPCL_HASH_REMOVE(connp); 1493 ret = ipcl_sctp_hash_insert(connp, lport); 1494 break; 1495 1496 default: 1497 if (is_system_labeled() && 1498 check_exempt_conflict_v6(connp, ipst)) 1499 return (EADDRINUSE); 1500 /* FALLTHROUGH */ 1501 case IPPROTO_UDP: 1502 if (protocol == IPPROTO_UDP) { 1503 connfp = &ipst->ips_ipcl_udp_fanout[ 1504 IPCL_UDP_HASH(lport, ipst)]; 1505 } else { 1506 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1507 } 1508 1509 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1510 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1511 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1512 IPCL_HASH_INSERT_BOUND(connfp, connp); 1513 } else { 1514 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1515 } 1516 break; 1517 } 1518 1519 return (ret); 1520 } 1521 1522 /* 1523 * v4 packet classifying function. looks up the fanout table to 1524 * find the conn, the packet belongs to. returns the conn with 1525 * the reference held, null otherwise. 1526 * 1527 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1528 * Lookup" comment block are applied. Labels are also checked as described 1529 * above. 
If the packet is from the inside (looped back), and is from the same 1530 * zone, then label checks are omitted. 1531 */ 1532 conn_t * 1533 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1534 ip_recv_attr_t *ira, ip_stack_t *ipst) 1535 { 1536 ipha_t *ipha; 1537 connf_t *connfp, *bind_connfp; 1538 uint16_t lport; 1539 uint16_t fport; 1540 uint32_t ports; 1541 conn_t *connp; 1542 uint16_t *up; 1543 zoneid_t zoneid = ira->ira_zoneid; 1544 1545 ipha = (ipha_t *)mp->b_rptr; 1546 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1547 1548 switch (protocol) { 1549 case IPPROTO_TCP: 1550 ports = *(uint32_t *)up; 1551 connfp = 1552 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1553 ports, ipst)]; 1554 mutex_enter(&connfp->connf_lock); 1555 for (connp = connfp->connf_head; connp != NULL; 1556 connp = connp->conn_next) { 1557 if (IPCL_CONN_MATCH(connp, protocol, 1558 ipha->ipha_src, ipha->ipha_dst, ports) && 1559 (connp->conn_zoneid == zoneid || 1560 connp->conn_allzones || 1561 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1562 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1563 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1564 break; 1565 } 1566 1567 if (connp != NULL) { 1568 /* 1569 * We have a fully-bound TCP connection. 1570 * 1571 * For labeled systems, there's no need to check the 1572 * label here. It's known to be good as we checked 1573 * before allowing the connection to become bound. 
1574 */ 1575 CONN_INC_REF(connp); 1576 mutex_exit(&connfp->connf_lock); 1577 return (connp); 1578 } 1579 1580 mutex_exit(&connfp->connf_lock); 1581 lport = up[1]; 1582 bind_connfp = 1583 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1584 mutex_enter(&bind_connfp->connf_lock); 1585 for (connp = bind_connfp->connf_head; connp != NULL; 1586 connp = connp->conn_next) { 1587 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1588 lport) && 1589 (connp->conn_zoneid == zoneid || 1590 connp->conn_allzones || 1591 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1592 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1593 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1594 break; 1595 } 1596 1597 /* 1598 * If the matching connection is SLP on a private address, then 1599 * the label on the packet must match the local zone's label. 1600 * Otherwise, it must be in the label range defined by tnrh. 1601 * This is ensured by tsol_receive_local. 1602 * 1603 * Note that we don't check tsol_receive_local for 1604 * the connected case. 
1605 */ 1606 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1607 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1608 ira, connp)) { 1609 DTRACE_PROBE3(tx__ip__log__info__classify__tcp, 1610 char *, "connp(1) could not receive mp(2)", 1611 conn_t *, connp, mblk_t *, mp); 1612 connp = NULL; 1613 } 1614 1615 if (connp != NULL) { 1616 /* Have a listener at least */ 1617 CONN_INC_REF(connp); 1618 mutex_exit(&bind_connfp->connf_lock); 1619 return (connp); 1620 } 1621 1622 mutex_exit(&bind_connfp->connf_lock); 1623 break; 1624 1625 case IPPROTO_UDP: 1626 lport = up[1]; 1627 fport = up[0]; 1628 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1629 mutex_enter(&connfp->connf_lock); 1630 for (connp = connfp->connf_head; connp != NULL; 1631 connp = connp->conn_next) { 1632 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1633 fport, ipha->ipha_src) && 1634 (connp->conn_zoneid == zoneid || 1635 connp->conn_allzones || 1636 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1637 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) 1638 break; 1639 } 1640 1641 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1642 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1643 ira, connp)) { 1644 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1645 char *, "connp(1) could not receive mp(2)", 1646 conn_t *, connp, mblk_t *, mp); 1647 connp = NULL; 1648 } 1649 1650 if (connp != NULL) { 1651 CONN_INC_REF(connp); 1652 mutex_exit(&connfp->connf_lock); 1653 return (connp); 1654 } 1655 1656 /* 1657 * We shouldn't come here for multicast/broadcast packets 1658 */ 1659 mutex_exit(&connfp->connf_lock); 1660 1661 break; 1662 1663 case IPPROTO_ENCAP: 1664 case IPPROTO_IPV6: 1665 return (ipcl_iptun_classify_v4(&ipha->ipha_src, 1666 &ipha->ipha_dst, ipst)); 1667 } 1668 1669 return (NULL); 1670 } 1671 1672 conn_t * 1673 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1674 ip_recv_attr_t *ira, ip_stack_t *ipst) 1675 { 1676 ip6_t *ip6h; 1677 
connf_t *connfp, *bind_connfp; 1678 uint16_t lport; 1679 uint16_t fport; 1680 tcpha_t *tcpha; 1681 uint32_t ports; 1682 conn_t *connp; 1683 uint16_t *up; 1684 zoneid_t zoneid = ira->ira_zoneid; 1685 1686 ip6h = (ip6_t *)mp->b_rptr; 1687 1688 switch (protocol) { 1689 case IPPROTO_TCP: 1690 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; 1691 up = &tcpha->tha_lport; 1692 ports = *(uint32_t *)up; 1693 1694 connfp = 1695 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1696 ports, ipst)]; 1697 mutex_enter(&connfp->connf_lock); 1698 for (connp = connfp->connf_head; connp != NULL; 1699 connp = connp->conn_next) { 1700 if (IPCL_CONN_MATCH_V6(connp, protocol, 1701 ip6h->ip6_src, ip6h->ip6_dst, ports) && 1702 (connp->conn_zoneid == zoneid || 1703 connp->conn_allzones || 1704 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1705 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1706 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1707 break; 1708 } 1709 1710 if (connp != NULL) { 1711 /* 1712 * We have a fully-bound TCP connection. 1713 * 1714 * For labeled systems, there's no need to check the 1715 * label here. It's known to be good as we checked 1716 * before allowing the connection to become bound. 
1717 */ 1718 CONN_INC_REF(connp); 1719 mutex_exit(&connfp->connf_lock); 1720 return (connp); 1721 } 1722 1723 mutex_exit(&connfp->connf_lock); 1724 1725 lport = up[1]; 1726 bind_connfp = 1727 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1728 mutex_enter(&bind_connfp->connf_lock); 1729 for (connp = bind_connfp->connf_head; connp != NULL; 1730 connp = connp->conn_next) { 1731 if (IPCL_BIND_MATCH_V6(connp, protocol, 1732 ip6h->ip6_dst, lport) && 1733 (connp->conn_zoneid == zoneid || 1734 connp->conn_allzones || 1735 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1736 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1737 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1738 break; 1739 } 1740 1741 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1742 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1743 ira, connp)) { 1744 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1745 char *, "connp(1) could not receive mp(2)", 1746 conn_t *, connp, mblk_t *, mp); 1747 connp = NULL; 1748 } 1749 1750 if (connp != NULL) { 1751 /* Have a listner at least */ 1752 CONN_INC_REF(connp); 1753 mutex_exit(&bind_connfp->connf_lock); 1754 return (connp); 1755 } 1756 1757 mutex_exit(&bind_connfp->connf_lock); 1758 break; 1759 1760 case IPPROTO_UDP: 1761 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1762 lport = up[1]; 1763 fport = up[0]; 1764 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1765 mutex_enter(&connfp->connf_lock); 1766 for (connp = connfp->connf_head; connp != NULL; 1767 connp = connp->conn_next) { 1768 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1769 fport, ip6h->ip6_src) && 1770 (connp->conn_zoneid == zoneid || 1771 connp->conn_allzones || 1772 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1773 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1774 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1775 break; 1776 } 1777 1778 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1779 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 
1780 ira, connp)) { 1781 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1782 char *, "connp(1) could not receive mp(2)", 1783 conn_t *, connp, mblk_t *, mp); 1784 connp = NULL; 1785 } 1786 1787 if (connp != NULL) { 1788 CONN_INC_REF(connp); 1789 mutex_exit(&connfp->connf_lock); 1790 return (connp); 1791 } 1792 1793 /* 1794 * We shouldn't come here for multicast/broadcast packets 1795 */ 1796 mutex_exit(&connfp->connf_lock); 1797 break; 1798 case IPPROTO_ENCAP: 1799 case IPPROTO_IPV6: 1800 return (ipcl_iptun_classify_v6(&ip6h->ip6_src, 1801 &ip6h->ip6_dst, ipst)); 1802 } 1803 1804 return (NULL); 1805 } 1806 1807 /* 1808 * wrapper around ipcl_classify_(v4,v6) routines. 1809 */ 1810 conn_t * 1811 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 1812 { 1813 if (ira->ira_flags & IRAF_IS_IPV4) { 1814 return (ipcl_classify_v4(mp, ira->ira_protocol, 1815 ira->ira_ip_hdr_length, ira, ipst)); 1816 } else { 1817 return (ipcl_classify_v6(mp, ira->ira_protocol, 1818 ira->ira_ip_hdr_length, ira, ipst)); 1819 } 1820 } 1821 1822 /* 1823 * Only used to classify SCTP RAW sockets 1824 */ 1825 conn_t * 1826 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, 1827 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) 1828 { 1829 connf_t *connfp; 1830 conn_t *connp; 1831 in_port_t lport; 1832 int ipversion; 1833 const void *dst; 1834 zoneid_t zoneid = ira->ira_zoneid; 1835 1836 lport = ((uint16_t *)&ports)[1]; 1837 if (ira->ira_flags & IRAF_IS_IPV4) { 1838 dst = (const void *)&ipha->ipha_dst; 1839 ipversion = IPV4_VERSION; 1840 } else { 1841 dst = (const void *)&ip6h->ip6_dst; 1842 ipversion = IPV6_VERSION; 1843 } 1844 1845 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1846 mutex_enter(&connfp->connf_lock); 1847 for (connp = connfp->connf_head; connp != NULL; 1848 connp = connp->conn_next) { 1849 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1850 if (ipversion != connp->conn_ipversion) 1851 continue; 1852 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1853 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1854 if (ipversion == IPV4_VERSION) { 1855 if (!IPCL_CONN_MATCH(connp, protocol, 1856 ipha->ipha_src, ipha->ipha_dst, ports)) 1857 continue; 1858 } else { 1859 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1860 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1861 continue; 1862 } 1863 } else { 1864 if (ipversion == IPV4_VERSION) { 1865 if (!IPCL_BIND_MATCH(connp, protocol, 1866 ipha->ipha_dst, lport)) 1867 continue; 1868 } else { 1869 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1870 ip6h->ip6_dst, lport)) 1871 continue; 1872 } 1873 } 1874 1875 if (connp->conn_zoneid == zoneid || 1876 connp->conn_allzones || 1877 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1878 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1879 (ira->ira_flags & IRAF_TX_SHARED_ADDR))) 1880 break; 1881 } 1882 1883 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1884 !tsol_receive_local(mp, dst, ipversion, ira, connp)) { 1885 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1886 char *, "connp(1) could not receive mp(2)", 1887 conn_t *, connp, mblk_t *, mp); 1888 connp = NULL; 1889 } 1890 1891 if (connp != NULL) 1892 goto found; 1893 mutex_exit(&connfp->connf_lock); 1894 1895 /* Try to look for a wildcard SCTP RAW socket match. */ 1896 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1897 mutex_enter(&connfp->connf_lock); 1898 for (connp = connfp->connf_head; connp != NULL; 1899 connp = connp->conn_next) { 1900 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1901 if (ipversion != connp->conn_ipversion) 1902 continue; 1903 if (!IPCL_ZONE_MATCH(connp, zoneid)) 1904 continue; 1905 1906 if (ipversion == IPV4_VERSION) { 1907 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) 1908 break; 1909 } else { 1910 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { 1911 break; 1912 } 1913 } 1914 } 1915 1916 if (connp != NULL) 1917 goto found; 1918 1919 mutex_exit(&connfp->connf_lock); 1920 return (NULL); 1921 1922 found: 1923 ASSERT(connp != NULL); 1924 CONN_INC_REF(connp); 1925 mutex_exit(&connfp->connf_lock); 1926 return (connp); 1927 } 1928 1929 /* ARGSUSED */ 1930 static int 1931 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 1932 { 1933 itc_t *itc = (itc_t *)buf; 1934 conn_t *connp = &itc->itc_conn; 1935 tcp_t *tcp = (tcp_t *)&itc[1]; 1936 1937 bzero(connp, sizeof (conn_t)); 1938 bzero(tcp, sizeof (tcp_t)); 1939 1940 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1941 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1942 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 1943 tcp->tcp_timercache = tcp_timermp_alloc(kmflags); 1944 if (tcp->tcp_timercache == NULL) 1945 return (ENOMEM); 1946 connp->conn_tcp = tcp; 1947 connp->conn_flags = IPCL_TCPCONN; 1948 connp->conn_proto = IPPROTO_TCP; 1949 tcp->tcp_connp = connp; 1950 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1951 1952 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1953 if (connp->conn_ixa == NULL) { 1954 tcp_timermp_free(tcp); 1955 return (ENOMEM); 1956 } 1957 connp->conn_ixa->ixa_refcnt = 1; 1958 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1959 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1960 return (0); 1961 } 1962 1963 /* ARGSUSED */ 1964 static void 1965 tcp_conn_destructor(void *buf, void *cdrarg) 1966 { 1967 itc_t *itc = (itc_t *)buf; 1968 conn_t *connp = &itc->itc_conn; 1969 tcp_t *tcp = (tcp_t *)&itc[1]; 1970 1971 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1972 
ASSERT(tcp->tcp_connp == connp); 1973 ASSERT(connp->conn_tcp == tcp); 1974 tcp_timermp_free(tcp); 1975 mutex_destroy(&connp->conn_lock); 1976 cv_destroy(&connp->conn_cv); 1977 cv_destroy(&connp->conn_sq_cv); 1978 rw_destroy(&connp->conn_ilg_lock); 1979 1980 /* Can be NULL if constructor failed */ 1981 if (connp->conn_ixa != NULL) { 1982 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 1983 ASSERT(connp->conn_ixa->ixa_ire == NULL); 1984 ASSERT(connp->conn_ixa->ixa_nce == NULL); 1985 ixa_refrele(connp->conn_ixa); 1986 } 1987 } 1988 1989 /* ARGSUSED */ 1990 static int 1991 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 1992 { 1993 itc_t *itc = (itc_t *)buf; 1994 conn_t *connp = &itc->itc_conn; 1995 1996 bzero(connp, sizeof (conn_t)); 1997 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1998 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1999 connp->conn_flags = IPCL_IPCCONN; 2000 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2001 2002 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2003 if (connp->conn_ixa == NULL) 2004 return (ENOMEM); 2005 connp->conn_ixa->ixa_refcnt = 1; 2006 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2007 return (0); 2008 } 2009 2010 /* ARGSUSED */ 2011 static void 2012 ip_conn_destructor(void *buf, void *cdrarg) 2013 { 2014 itc_t *itc = (itc_t *)buf; 2015 conn_t *connp = &itc->itc_conn; 2016 2017 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2018 ASSERT(connp->conn_priv == NULL); 2019 mutex_destroy(&connp->conn_lock); 2020 cv_destroy(&connp->conn_cv); 2021 rw_destroy(&connp->conn_ilg_lock); 2022 2023 /* Can be NULL if constructor failed */ 2024 if (connp->conn_ixa != NULL) { 2025 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2026 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2027 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2028 ixa_refrele(connp->conn_ixa); 2029 } 2030 } 2031 2032 /* ARGSUSED */ 2033 static int 2034 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2035 { 2036 itc_t *itc = 
(itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	/* Start from a clean slate; the cache may hand back recycled memory. */
	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	/* Cross-link the conn_t and its protocol-private udp_t. */
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/*
	 * Transmit attributes are allocated separately; on failure we
	 * leave conn_ixa NULL (the destructor checks for that case).
	 */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for UDP conn_t's; undoes the constructor above.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for raw IP (ICMP) conn_t's. The itc_t buffer
 * holds a conn_t immediately followed by its icmp_t, mirroring the UDP
 * constructor above.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/* Can be NULL on allocation failure; destructor tolerates that. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for raw IP conn_t's; undoes rawip_conn_constructor.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for routing-socket conn_t's. Note that, unlike
 * the UDP/rawip constructors, neither conn_proto nor ixa_protocol is
 * assigned here -- presumably because routing sockets are not tied to a
 * single IP protocol (TODO confirm against rts usage).
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/* Can be NULL on allocation failure; destructor tolerates that. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for routing-socket conn_t's; undoes
 * rts_conn_constructor.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.  Everything
	 * from conn_start_clr to the end of the structure is wiped; the
	 * fields before it (locks, cv, kmem-cache linkage, etc.) survive
	 * for reuse by the cache.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
2247 * 2248 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2249 * call the insert and delete functions below at creation and deletion 2250 * time respectively. The conn never moves or changes its position in this 2251 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2252 * won't increase due to walkers, once the conn deletion has started. Note 2253 * that we can't remove the conn from the global list and then wait for 2254 * the refcnt to drop to zero, since walkers would then see a truncated 2255 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2256 * conns until ip_open is ready to make them globally visible. 2257 * The global round robin multi-list locks are held only to get the 2258 * next member/insertion/deletion and contention should be negligible 2259 * if the multi-list is much greater than the number of cpus. 2260 */ 2261 void 2262 ipcl_globalhash_insert(conn_t *connp) 2263 { 2264 int index; 2265 struct connf_s *connfp; 2266 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2267 2268 /* 2269 * No need for atomic here. Approximate even distribution 2270 * in the global lists is sufficient. 2271 */ 2272 ipst->ips_conn_g_index++; 2273 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2274 2275 connp->conn_g_prev = NULL; 2276 /* 2277 * Mark as INCIPIENT, so that walkers will ignore this 2278 * for now, till ip_open is ready to make it visible globally. 
2279 */ 2280 connp->conn_state_flags |= CONN_INCIPIENT; 2281 2282 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2283 /* Insert at the head of the list */ 2284 mutex_enter(&connfp->connf_lock); 2285 connp->conn_g_next = connfp->connf_head; 2286 if (connp->conn_g_next != NULL) 2287 connp->conn_g_next->conn_g_prev = connp; 2288 connfp->connf_head = connp; 2289 2290 /* The fanout bucket this conn points to */ 2291 connp->conn_g_fanout = connfp; 2292 2293 mutex_exit(&connfp->connf_lock); 2294 } 2295 2296 void 2297 ipcl_globalhash_remove(conn_t *connp) 2298 { 2299 struct connf_s *connfp; 2300 2301 /* 2302 * We were never inserted in the global multi list. 2303 * IPCL_NONE variety is never inserted in the global multilist 2304 * since it is presumed to not need any cleanup and is transient. 2305 */ 2306 if (connp->conn_g_fanout == NULL) 2307 return; 2308 2309 connfp = connp->conn_g_fanout; 2310 mutex_enter(&connfp->connf_lock); 2311 if (connp->conn_g_prev != NULL) 2312 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2313 else 2314 connfp->connf_head = connp->conn_g_next; 2315 if (connp->conn_g_next != NULL) 2316 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2317 mutex_exit(&connfp->connf_lock); 2318 2319 /* Better to stumble on a null pointer than to corrupt memory */ 2320 connp->conn_g_next = NULL; 2321 connp->conn_g_prev = NULL; 2322 connp->conn_g_fanout = NULL; 2323 } 2324 2325 /* 2326 * Walk the list of all conn_t's in the system, calling the function provided 2327 * With the specified argument for each. 2328 * Applies to both IPv4 and IPv6. 2329 * 2330 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2331 * conn_oper_pending_ill). To guard against stale pointers 2332 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2333 * unplumbed or removed. New conn_t's that are created while we are walking 2334 * may be missed by this walk, because they are not necessarily inserted 2335 * at the tail of the list. 
They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being torn down or not yet visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold the conn before dropping the bucket lock so
			 * it (and hence its conn_g_next linkage) stays valid
			 * while the callback runs without the lock.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/* The previous conn's hold can be released now. */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the hold on the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* A conn is never its own peer. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* A conn is never its own peer. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Caller sets the minimum acceptable TCP state. */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t		*tcp;
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* conn_bound_if == 0 means "not bound to an interface". */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * tcp_listener == NULL filters out conns attached to a
		 * listener -- presumably eager/accepted conns; confirm
		 * against the tcp_t definition.
		 */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			/* Returned held; caller must CONN_DEC_REF. */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* conn_bound_if == 0 means "not bound to an interface". */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			/* Returned held; caller must CONN_DEC_REF. */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start the walk from the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold the conn for the caller before releasing the bucket. */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the hold taken on the previous conn by the last call. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 *
 * Records the current kernel stack in the conn's circular trace buffer
 * (CONN_TRACE_MAX slots).  Returns 1 -- presumably so the call can be
 * embedded in the refhold macro's expression; confirm against the
 * CONN_INC_REF definition.
 */
int
conn_trace_ref(conn_t *connp)
{
	int		last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	/* Wrap around: the buffer is circular. */
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/*
 * Refrele counterpart of conn_trace_ref; records the releasing stack in
 * the same circular buffer.
 */
int
conn_untrace_ref(conn_t *connp)
{
	int		last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	/* Wrap around: the buffer is circular. */
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif