1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * IP PACKET CLASSIFIER 27 * 28 * The IP packet classifier provides mapping between IP packets and persistent 29 * connection state for connection-oriented protocols. It also provides 30 * interface for managing connection states. 31 * 32 * The connection state is kept in conn_t data structure and contains, among 33 * other things: 34 * 35 * o local/remote address and ports 36 * o Transport protocol 37 * o squeue for the connection (for TCP only) 38 * o reference counter 39 * o Connection state 40 * o hash table linkage 41 * o interface/ire information 42 * o credentials 43 * o ipsec policy 44 * o send and receive functions. 45 * o mutex lock. 46 * 47 * Connections use a reference counting scheme. They are freed when the 48 * reference counter drops to zero. 
A reference is incremented when connection 49 * is placed in a list or table, when incoming packet for the connection arrives 50 * and when connection is processed via squeue (squeue processing may be 51 * asynchronous and the reference protects the connection from being destroyed 52 * before its processing is finished). 53 * 54 * conn_recv is used to pass up packets to the ULP. 55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 56 * a listener, and changes to tcp_input_listener as the listener has picked a 57 * good squeue. For other cases it is set to tcp_input_data. 58 * 59 * conn_recvicmp is used to pass up ICMP errors to the ULP. 60 * 61 * Classifier uses several hash tables: 62 * 63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 64 * ipcl_bind_fanout: contains all connections in BOUND state 65 * ipcl_proto_fanout: IPv4 protocol fanout 66 * ipcl_proto_fanout_v6: IPv6 protocol fanout 67 * ipcl_udp_fanout: contains all UDP connections 68 * ipcl_iptun_fanout: contains all IP tunnel connections 69 * ipcl_globalhash_fanout: contains all connections 70 * 71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 72 * which need to view all existing connections. 73 * 74 * All tables are protected by per-bucket locks. When both per-bucket lock and 75 * connection lock need to be held, the per-bucket lock should be acquired 76 * first, followed by the connection lock. 77 * 78 * All functions doing search in one of these tables increment a reference 79 * counter on the connection found (if any). This reference should be dropped 80 * when the caller has finished processing the connection. 81 * 82 * 83 * INTERFACES: 84 * =========== 85 * 86 * Connection Lookup: 87 * ------------------ 88 * 89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 91 * 92 * Finds connection for an incoming IPv4 or IPv6 packet. 
 *	Returns NULL if it can't find any associated connection.  If the
 *	connection is found, its reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
 *		 the packet.
 *
 *	ira->ira_zoneid: The zone in which the returned connection must be; the
 *		zoneid corresponding to the ire_zoneid on the IRE located for
 *		the packet's destination address.
 *
 *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 *		IRAF_TX_SHARED_ADDR flags
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment.  This is used only in a
 * labeled environment.  The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone.  Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.
 * For fully bound TCP connections, we can assume that the packet label was
 * checked during connection establishment, and doesn't need to be checked on
 * each packet.  For others, though, we need to check for strict equality or,
 * for multilevel ports, membership in the range or set.  This part currently
 * does a tnrh lookup on each packet, but could be optimized to use cached
 * results if that were necessary.  (SCTP doesn't come through here, but if it
 * did, we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The addresses
 *	and ports are read from the IP and TCP headers respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter, the interface index, is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp);
 * int ipcl_conn_insert_v4(connp);
 * int ipcl_conn_insert_v6(connp);
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp);
 * int ipcl_bind_insert_v4(connp);
 * int ipcl_bind_insert_v6(connp);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
 *		IPCL_TCPCONN	indicates a TCP connection
 *		IPCL_SCTPCONN	indicates an SCTP connection
 *		IPCL_UDPCONN	indicates a UDP conn_t.
 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
 *		IPCL_RTSCONN	indicates an RTS conn_t.
 *		IPCL_IPCCONN	indicates all other connections.
 *
 * void ipcl_conn_destroy(connp)
 *
 *	Destroys the connection state, removes it from the global
 *	connection hash table and frees its memory.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#define	_SUN_TPI_VERSION 2
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/isa_defs.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/ip_impl.h>
#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
#include <inet/iptun/iptun_impl.h>

#include <sys/cpuvar.h>

#include <inet/ipclassifier.h>
#include <inet/tcp.h>
#include <inet/ipsec_impl.h>

#include <sys/tsol/tnet.h>
#include <sys/sockio.h>

/*
 * Old value for compatibility.  Settable in /etc/system.
 * ipcl_init() consults it only when ipcl_conn_hash_size is zero.
 */
uint_t tcp_conn_hash_size = 0;

/*
 * New value.  Zero means choose automatically.  Settable in /etc/system.
 *
 * When both ipcl_conn_hash_size and tcp_conn_hash_size are zero, ipcl_init()
 * derives the connection fanout size from the amount of free memory (in
 * bytes) divided by ipcl_conn_hash_memfactor, and caps the result at
 * ipcl_conn_hash_maxsize.
 */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size; copied into each IP stack by ipcl_init() */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2.
*/ 288 uint_t ipcl_raw_fanout_size = 256; 289 290 /* 291 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 292 * expect that most large deployments would have hundreds of tunnels, and 293 * thousands in the extreme case. 294 */ 295 uint_t ipcl_iptun_fanout_size = 6143; 296 297 /* 298 * Power of 2^N Primes useful for hashing for N of 0-28, 299 * these primes are the nearest prime <= 2^N - 2^(N-2). 300 */ 301 302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 303 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 304 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 305 50331599, 100663291, 201326557, 0} 306 307 /* 308 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 309 * are aligned on cache lines. 310 */ 311 typedef union itc_s { 312 conn_t itc_conn; 313 char itcu_filler[CACHE_ALIGN(conn_s)]; 314 } itc_t; 315 316 struct kmem_cache *tcp_conn_cache; 317 struct kmem_cache *ip_conn_cache; 318 extern struct kmem_cache *sctp_conn_cache; 319 struct kmem_cache *udp_conn_cache; 320 struct kmem_cache *rawip_conn_cache; 321 struct kmem_cache *rts_conn_cache; 322 323 extern void tcp_timermp_free(tcp_t *); 324 extern mblk_t *tcp_timermp_alloc(int); 325 326 static int ip_conn_constructor(void *, void *, int); 327 static void ip_conn_destructor(void *, void *); 328 329 static int tcp_conn_constructor(void *, void *, int); 330 static void tcp_conn_destructor(void *, void *); 331 332 static int udp_conn_constructor(void *, void *, int); 333 static void udp_conn_destructor(void *, void *); 334 335 static int rawip_conn_constructor(void *, void *, int); 336 static void rawip_conn_destructor(void *, void *); 337 338 static int rts_conn_constructor(void *, void *, int); 339 static void rts_conn_destructor(void *, void *); 340 341 /* 342 * Global (for all stack instances) init routine 343 */ 344 void 345 ipcl_g_init(void) 346 { 347 ip_conn_cache = kmem_cache_create("ip_conn_cache", 348 sizeof (conn_t), 
CACHE_ALIGN_SIZE, 349 ip_conn_constructor, ip_conn_destructor, 350 NULL, NULL, NULL, 0); 351 352 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 353 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 354 tcp_conn_constructor, tcp_conn_destructor, 355 tcp_conn_reclaim, NULL, NULL, 0); 356 357 udp_conn_cache = kmem_cache_create("udp_conn_cache", 358 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 359 udp_conn_constructor, udp_conn_destructor, 360 NULL, NULL, NULL, 0); 361 362 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 363 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 364 rawip_conn_constructor, rawip_conn_destructor, 365 NULL, NULL, NULL, 0); 366 367 rts_conn_cache = kmem_cache_create("rts_conn_cache", 368 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 369 rts_conn_constructor, rts_conn_destructor, 370 NULL, NULL, NULL, 0); 371 } 372 373 /* 374 * ipclassifier intialization routine, sets up hash tables. 375 */ 376 void 377 ipcl_init(ip_stack_t *ipst) 378 { 379 int i; 380 int sizes[] = P2Ps(); 381 382 /* 383 * Calculate size of conn fanout table from /etc/system settings 384 */ 385 if (ipcl_conn_hash_size != 0) { 386 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 387 } else if (tcp_conn_hash_size != 0) { 388 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 389 } else { 390 extern pgcnt_t freemem; 391 392 ipst->ips_ipcl_conn_fanout_size = 393 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 394 395 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 396 ipst->ips_ipcl_conn_fanout_size = 397 ipcl_conn_hash_maxsize; 398 } 399 } 400 401 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 402 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 403 break; 404 } 405 } 406 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 407 /* Out of range, use the 2^16 value */ 408 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 409 } 410 411 /* Take values from /etc/system */ 412 ipst->ips_ipcl_bind_fanout_size = 
ipcl_bind_fanout_size; 413 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 414 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 415 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 416 417 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 418 419 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 420 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 421 422 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 423 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 424 MUTEX_DEFAULT, NULL); 425 } 426 427 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 428 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 429 430 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 431 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 432 MUTEX_DEFAULT, NULL); 433 } 434 435 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 436 sizeof (connf_t), KM_SLEEP); 437 for (i = 0; i < IPPROTO_MAX; i++) { 438 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 439 MUTEX_DEFAULT, NULL); 440 } 441 442 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 443 sizeof (connf_t), KM_SLEEP); 444 for (i = 0; i < IPPROTO_MAX; i++) { 445 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 446 MUTEX_DEFAULT, NULL); 447 } 448 449 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 450 mutex_init(&ipst->ips_rts_clients->connf_lock, 451 NULL, MUTEX_DEFAULT, NULL); 452 453 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 454 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 455 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 456 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 457 MUTEX_DEFAULT, NULL); 458 } 459 460 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 461 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 462 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 463 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 464 MUTEX_DEFAULT, NULL); 465 } 466 467 
ipst->ips_ipcl_raw_fanout = kmem_zalloc( 468 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 469 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 470 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 471 MUTEX_DEFAULT, NULL); 472 } 473 474 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 475 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 476 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 477 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 478 NULL, MUTEX_DEFAULT, NULL); 479 } 480 } 481 482 void 483 ipcl_g_destroy(void) 484 { 485 kmem_cache_destroy(ip_conn_cache); 486 kmem_cache_destroy(tcp_conn_cache); 487 kmem_cache_destroy(udp_conn_cache); 488 kmem_cache_destroy(rawip_conn_cache); 489 kmem_cache_destroy(rts_conn_cache); 490 } 491 492 /* 493 * All user-level and kernel use of the stack must be gone 494 * by now. 495 */ 496 void 497 ipcl_destroy(ip_stack_t *ipst) 498 { 499 int i; 500 501 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 502 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 503 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 504 } 505 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 506 sizeof (connf_t)); 507 ipst->ips_ipcl_conn_fanout = NULL; 508 509 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 510 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 511 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 512 } 513 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 514 sizeof (connf_t)); 515 ipst->ips_ipcl_bind_fanout = NULL; 516 517 for (i = 0; i < IPPROTO_MAX; i++) { 518 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 519 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 520 } 521 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 522 IPPROTO_MAX * sizeof (connf_t)); 523 ipst->ips_ipcl_proto_fanout_v4 = NULL; 524 525 for (i = 0; i < IPPROTO_MAX; i++) { 526 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head 
== NULL); 527 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 528 } 529 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 530 IPPROTO_MAX * sizeof (connf_t)); 531 ipst->ips_ipcl_proto_fanout_v6 = NULL; 532 533 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 534 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 535 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 536 } 537 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 538 sizeof (connf_t)); 539 ipst->ips_ipcl_udp_fanout = NULL; 540 541 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 542 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 543 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 544 } 545 kmem_free(ipst->ips_ipcl_iptun_fanout, 546 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 547 ipst->ips_ipcl_iptun_fanout = NULL; 548 549 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 550 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 551 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 552 } 553 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 554 sizeof (connf_t)); 555 ipst->ips_ipcl_raw_fanout = NULL; 556 557 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 558 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 559 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 560 } 561 kmem_free(ipst->ips_ipcl_globalhash_fanout, 562 sizeof (connf_t) * CONN_G_HASH_SIZE); 563 ipst->ips_ipcl_globalhash_fanout = NULL; 564 565 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 566 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 567 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 568 ipst->ips_rts_clients = NULL; 569 } 570 571 /* 572 * conn creation routine. initialize the conn, sets the reference 573 * and inserts it in the global hash table. 
574 */ 575 conn_t * 576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 577 { 578 conn_t *connp; 579 struct kmem_cache *conn_cache; 580 581 switch (type) { 582 case IPCL_SCTPCONN: 583 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 584 return (NULL); 585 sctp_conn_init(connp); 586 netstack_hold(ns); 587 connp->conn_netstack = ns; 588 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 589 connp->conn_ixa->ixa_conn_id = (long)connp; 590 ipcl_globalhash_insert(connp); 591 return (connp); 592 593 case IPCL_TCPCONN: 594 conn_cache = tcp_conn_cache; 595 break; 596 597 case IPCL_UDPCONN: 598 conn_cache = udp_conn_cache; 599 break; 600 601 case IPCL_RAWIPCONN: 602 conn_cache = rawip_conn_cache; 603 break; 604 605 case IPCL_RTSCONN: 606 conn_cache = rts_conn_cache; 607 break; 608 609 case IPCL_IPCCONN: 610 conn_cache = ip_conn_cache; 611 break; 612 613 default: 614 connp = NULL; 615 ASSERT(0); 616 } 617 618 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 619 return (NULL); 620 621 connp->conn_ref = 1; 622 netstack_hold(ns); 623 connp->conn_netstack = ns; 624 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 625 connp->conn_ixa->ixa_conn_id = (long)connp; 626 ipcl_globalhash_insert(connp); 627 return (connp); 628 } 629 630 void 631 ipcl_conn_destroy(conn_t *connp) 632 { 633 mblk_t *mp; 634 netstack_t *ns = connp->conn_netstack; 635 636 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 637 ASSERT(connp->conn_ref == 0); 638 639 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 640 641 if (connp->conn_cred != NULL) { 642 crfree(connp->conn_cred); 643 connp->conn_cred = NULL; 644 /* ixa_cred done in ipcl_conn_cleanup below */ 645 } 646 647 if (connp->conn_ht_iphc != NULL) { 648 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 649 connp->conn_ht_iphc = NULL; 650 connp->conn_ht_iphc_allocated = 0; 651 connp->conn_ht_iphc_len = 0; 652 connp->conn_ht_ulp = NULL; 653 connp->conn_ht_ulp_len = 0; 654 } 655 ip_pkt_free(&connp->conn_xmit_ipp); 656 657 
ipcl_globalhash_remove(connp); 658 659 if (connp->conn_latch != NULL) { 660 IPLATCH_REFRELE(connp->conn_latch); 661 connp->conn_latch = NULL; 662 } 663 if (connp->conn_latch_in_policy != NULL) { 664 IPPOL_REFRELE(connp->conn_latch_in_policy); 665 connp->conn_latch_in_policy = NULL; 666 } 667 if (connp->conn_latch_in_action != NULL) { 668 IPACT_REFRELE(connp->conn_latch_in_action); 669 connp->conn_latch_in_action = NULL; 670 } 671 if (connp->conn_policy != NULL) { 672 IPPH_REFRELE(connp->conn_policy, ns); 673 connp->conn_policy = NULL; 674 } 675 676 if (connp->conn_ipsec_opt_mp != NULL) { 677 freemsg(connp->conn_ipsec_opt_mp); 678 connp->conn_ipsec_opt_mp = NULL; 679 } 680 681 if (connp->conn_flags & IPCL_TCPCONN) { 682 tcp_t *tcp = connp->conn_tcp; 683 684 tcp_free(tcp); 685 mp = tcp->tcp_timercache; 686 687 tcp->tcp_tcps = NULL; 688 689 /* 690 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 691 * the mblk. 692 */ 693 if (tcp->tcp_rsrv_mp != NULL) { 694 freeb(tcp->tcp_rsrv_mp); 695 tcp->tcp_rsrv_mp = NULL; 696 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 697 } 698 699 ipcl_conn_cleanup(connp); 700 connp->conn_flags = IPCL_TCPCONN; 701 if (ns != NULL) { 702 ASSERT(tcp->tcp_tcps == NULL); 703 connp->conn_netstack = NULL; 704 connp->conn_ixa->ixa_ipst = NULL; 705 netstack_rele(ns); 706 } 707 708 bzero(tcp, sizeof (tcp_t)); 709 710 tcp->tcp_timercache = mp; 711 tcp->tcp_connp = connp; 712 kmem_cache_free(tcp_conn_cache, connp); 713 return; 714 } 715 716 if (connp->conn_flags & IPCL_SCTPCONN) { 717 ASSERT(ns != NULL); 718 sctp_free(connp); 719 return; 720 } 721 722 ipcl_conn_cleanup(connp); 723 if (ns != NULL) { 724 connp->conn_netstack = NULL; 725 connp->conn_ixa->ixa_ipst = NULL; 726 netstack_rele(ns); 727 } 728 729 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 730 if (connp->conn_flags & IPCL_UDPCONN) { 731 connp->conn_flags = IPCL_UDPCONN; 732 kmem_cache_free(udp_conn_cache, connp); 733 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 734 connp->conn_flags = IPCL_RAWIPCONN; 735 connp->conn_proto = IPPROTO_ICMP; 736 connp->conn_ixa->ixa_protocol = connp->conn_proto; 737 kmem_cache_free(rawip_conn_cache, connp); 738 } else if (connp->conn_flags & IPCL_RTSCONN) { 739 connp->conn_flags = IPCL_RTSCONN; 740 kmem_cache_free(rts_conn_cache, connp); 741 } else { 742 connp->conn_flags = IPCL_IPCCONN; 743 ASSERT(connp->conn_flags & IPCL_IPCCONN); 744 ASSERT(connp->conn_priv == NULL); 745 kmem_cache_free(ip_conn_cache, connp); 746 } 747 } 748 749 /* 750 * Running in cluster mode - deregister listener information 751 */ 752 static void 753 ipcl_conn_unlisten(conn_t *connp) 754 { 755 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 756 ASSERT(connp->conn_lport != 0); 757 758 if (cl_inet_unlisten != NULL) { 759 sa_family_t addr_family; 760 uint8_t *laddrp; 761 762 if (connp->conn_ipversion == IPV6_VERSION) { 763 addr_family = AF_INET6; 764 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 765 } else { 766 addr_family = AF_INET; 767 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 768 } 769 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 770 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 771 } 772 connp->conn_flags &= ~IPCL_CL_LISTENER; 773 } 774 775 /* 776 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 777 * which table the conn belonged to). So for debugging we can see which hash 778 * table this connection was in. 
 */
/*
 * IPCL_HASH_REMOVE(connp): unlink 'connp' from the fanout bucket recorded in
 * its conn_fanout field.  It does nothing when conn_fanout is NULL.
 *
 * The caller must not hold conn_lock.  Under the bucket lock the macro
 * unlinks the conn from the doubly linked list, clears its linkage fields,
 * sets IPCL_REMOVED, deregisters a cluster listener if IPCL_CL_LISTENER is
 * set, and drops one reference with CONN_DEC_REF().
 *
 * The macro expands to a brace-enclosed block and evaluates 'connp' many
 * times, so the argument must be free of side effects.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/*
 * Function form of IPCL_HASH_REMOVE().  The conn's protocol is sampled
 * before the removal; if it is IPPROTO_RSVP, ill_set_inputfn_all() is called
 * on the conn's IP stack once the conn has been unlinked.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	uint8_t		protocol = connp->conn_proto;

	IPCL_HASH_REMOVE(connp);
	if (protocol == IPPROTO_RSVP)
		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
}

/*
 * The whole purpose of this function is to allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
824 */ 825 void 826 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 827 { 828 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 829 ASSERT(MUTEX_HELD(&connp->conn_lock)); 830 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 831 832 if ((connp)->conn_next != NULL) { 833 (connp)->conn_next->conn_prev = (connp)->conn_prev; 834 } 835 if ((connp)->conn_prev != NULL) { 836 (connp)->conn_prev->conn_next = (connp)->conn_next; 837 } else { 838 connfp->connf_head = (connp)->conn_next; 839 } 840 (connp)->conn_fanout = NULL; 841 (connp)->conn_next = NULL; 842 (connp)->conn_prev = NULL; 843 (connp)->conn_flags |= IPCL_REMOVED; 844 ASSERT((connp)->conn_ref == 2); 845 (connp)->conn_ref--; 846 } 847 848 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 849 ASSERT((connp)->conn_fanout == NULL); \ 850 ASSERT((connp)->conn_next == NULL); \ 851 ASSERT((connp)->conn_prev == NULL); \ 852 if ((connfp)->connf_head != NULL) { \ 853 (connfp)->connf_head->conn_prev = (connp); \ 854 (connp)->conn_next = (connfp)->connf_head; \ 855 } \ 856 (connp)->conn_fanout = (connfp); \ 857 (connfp)->connf_head = (connp); \ 858 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 859 IPCL_CONNECTED; \ 860 CONN_INC_REF(connp); \ 861 } 862 863 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 864 IPCL_HASH_REMOVE((connp)); \ 865 mutex_enter(&(connfp)->connf_lock); \ 866 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 867 mutex_exit(&(connfp)->connf_lock); \ 868 } 869 870 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 871 conn_t *pconnp = NULL, *nconnp; \ 872 IPCL_HASH_REMOVE((connp)); \ 873 mutex_enter(&(connfp)->connf_lock); \ 874 nconnp = (connfp)->connf_head; \ 875 while (nconnp != NULL && \ 876 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 877 pconnp = nconnp; \ 878 nconnp = nconnp->conn_next; \ 879 } \ 880 if (pconnp != NULL) { \ 881 pconnp->conn_next = (connp); \ 882 (connp)->conn_prev = pconnp; \ 883 } else { \ 884 (connfp)->connf_head = (connp); \ 885 } \ 886 if 
(nconnp != NULL) { \ 887 (connp)->conn_next = nconnp; \ 888 nconnp->conn_prev = (connp); \ 889 } \ 890 (connp)->conn_fanout = (connfp); \ 891 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 892 IPCL_BOUND; \ 893 CONN_INC_REF(connp); \ 894 mutex_exit(&(connfp)->connf_lock); \ 895 } 896 897 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 898 conn_t **list, *prev, *next; \ 899 boolean_t isv4mapped = \ 900 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 901 IPCL_HASH_REMOVE((connp)); \ 902 mutex_enter(&(connfp)->connf_lock); \ 903 list = &(connfp)->connf_head; \ 904 prev = NULL; \ 905 while ((next = *list) != NULL) { \ 906 if (isv4mapped && \ 907 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 908 connp->conn_zoneid == next->conn_zoneid) { \ 909 (connp)->conn_next = next; \ 910 if (prev != NULL) \ 911 prev = next->conn_prev; \ 912 next->conn_prev = (connp); \ 913 break; \ 914 } \ 915 list = &next->conn_next; \ 916 prev = next; \ 917 } \ 918 (connp)->conn_prev = prev; \ 919 *list = (connp); \ 920 (connp)->conn_fanout = (connfp); \ 921 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 922 IPCL_BOUND; \ 923 CONN_INC_REF((connp)); \ 924 mutex_exit(&(connfp)->connf_lock); \ 925 } 926 927 void 928 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 929 { 930 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 931 } 932 933 /* 934 * Because the classifier is used to classify inbound packets, the destination 935 * address is meant to be our local tunnel address (tunnel source), and the 936 * source the remote tunnel address (tunnel destination). 937 * 938 * Note that conn_proto can't be used for fanout since the upper protocol 939 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
940 */ 941 conn_t * 942 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 943 { 944 connf_t *connfp; 945 conn_t *connp; 946 947 /* first look for IPv4 tunnel links */ 948 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 949 mutex_enter(&connfp->connf_lock); 950 for (connp = connfp->connf_head; connp != NULL; 951 connp = connp->conn_next) { 952 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 953 break; 954 } 955 if (connp != NULL) 956 goto done; 957 958 mutex_exit(&connfp->connf_lock); 959 960 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 961 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 962 INADDR_ANY)]; 963 mutex_enter(&connfp->connf_lock); 964 for (connp = connfp->connf_head; connp != NULL; 965 connp = connp->conn_next) { 966 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 967 break; 968 } 969 done: 970 if (connp != NULL) 971 CONN_INC_REF(connp); 972 mutex_exit(&connfp->connf_lock); 973 return (connp); 974 } 975 976 conn_t * 977 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 978 { 979 connf_t *connfp; 980 conn_t *connp; 981 982 /* Look for an IPv6 tunnel link */ 983 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 984 mutex_enter(&connfp->connf_lock); 985 for (connp = connfp->connf_head; connp != NULL; 986 connp = connp->conn_next) { 987 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 988 CONN_INC_REF(connp); 989 break; 990 } 991 } 992 mutex_exit(&connfp->connf_lock); 993 return (connp); 994 } 995 996 /* 997 * This function is used only for inserting SCTP raw socket now. 998 * This may change later. 999 * 1000 * Note that only one raw socket can be bound to a port. The param 1001 * lport is in network byte order. 
1002 */ 1003 static int 1004 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1005 { 1006 connf_t *connfp; 1007 conn_t *oconnp; 1008 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1009 1010 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1011 1012 /* Check for existing raw socket already bound to the port. */ 1013 mutex_enter(&connfp->connf_lock); 1014 for (oconnp = connfp->connf_head; oconnp != NULL; 1015 oconnp = oconnp->conn_next) { 1016 if (oconnp->conn_lport == lport && 1017 oconnp->conn_zoneid == connp->conn_zoneid && 1018 oconnp->conn_family == connp->conn_family && 1019 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1020 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || 1021 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || 1022 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || 1023 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, 1024 &connp->conn_laddr_v6))) { 1025 break; 1026 } 1027 } 1028 mutex_exit(&connfp->connf_lock); 1029 if (oconnp != NULL) 1030 return (EADDRNOTAVAIL); 1031 1032 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || 1033 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1034 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1035 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { 1036 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1037 } else { 1038 IPCL_HASH_INSERT_BOUND(connfp, connp); 1039 } 1040 } else { 1041 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1042 } 1043 return (0); 1044 } 1045 1046 static int 1047 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) 1048 { 1049 connf_t *connfp; 1050 conn_t *tconnp; 1051 ipaddr_t laddr = connp->conn_laddr_v4; 1052 ipaddr_t faddr = connp->conn_faddr_v4; 1053 1054 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; 1055 mutex_enter(&connfp->connf_lock); 1056 for (tconnp = connfp->connf_head; tconnp != NULL; 1057 tconnp = tconnp->conn_next) { 1058 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { 1059 /* A tunnel is already bound to 
these addresses. */ 1060 mutex_exit(&connfp->connf_lock); 1061 return (EADDRINUSE); 1062 } 1063 } 1064 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1065 mutex_exit(&connfp->connf_lock); 1066 return (0); 1067 } 1068 1069 static int 1070 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) 1071 { 1072 connf_t *connfp; 1073 conn_t *tconnp; 1074 in6_addr_t *laddr = &connp->conn_laddr_v6; 1075 in6_addr_t *faddr = &connp->conn_faddr_v6; 1076 1077 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; 1078 mutex_enter(&connfp->connf_lock); 1079 for (tconnp = connfp->connf_head; tconnp != NULL; 1080 tconnp = tconnp->conn_next) { 1081 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { 1082 /* A tunnel is already bound to these addresses. */ 1083 mutex_exit(&connfp->connf_lock); 1084 return (EADDRINUSE); 1085 } 1086 } 1087 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1088 mutex_exit(&connfp->connf_lock); 1089 return (0); 1090 } 1091 1092 /* 1093 * Check for a MAC exemption conflict on a labeled system. Note that for 1094 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1095 * transport layer. This check is for binding all other protocols. 1096 * 1097 * Returns true if there's a conflict. 
1098 */ 1099 static boolean_t 1100 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1101 { 1102 connf_t *connfp; 1103 conn_t *tconn; 1104 1105 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1106 mutex_enter(&connfp->connf_lock); 1107 for (tconn = connfp->connf_head; tconn != NULL; 1108 tconn = tconn->conn_next) { 1109 /* We don't allow v4 fallback for v6 raw socket */ 1110 if (connp->conn_family != tconn->conn_family) 1111 continue; 1112 /* If neither is exempt, then there's no conflict */ 1113 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1114 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1115 continue; 1116 /* We are only concerned about sockets for a different zone */ 1117 if (connp->conn_zoneid == tconn->conn_zoneid) 1118 continue; 1119 /* If both are bound to different specific addrs, ok */ 1120 if (connp->conn_laddr_v4 != INADDR_ANY && 1121 tconn->conn_laddr_v4 != INADDR_ANY && 1122 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1123 continue; 1124 /* These two conflict; fail */ 1125 break; 1126 } 1127 mutex_exit(&connfp->connf_lock); 1128 return (tconn != NULL); 1129 } 1130 1131 static boolean_t 1132 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1133 { 1134 connf_t *connfp; 1135 conn_t *tconn; 1136 1137 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1138 mutex_enter(&connfp->connf_lock); 1139 for (tconn = connfp->connf_head; tconn != NULL; 1140 tconn = tconn->conn_next) { 1141 /* We don't allow v4 fallback for v6 raw socket */ 1142 if (connp->conn_family != tconn->conn_family) 1143 continue; 1144 /* If neither is exempt, then there's no conflict */ 1145 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1146 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1147 continue; 1148 /* We are only concerned about sockets for a different zone */ 1149 if (connp->conn_zoneid == tconn->conn_zoneid) 1150 continue; 1151 /* If both are bound to different addrs, ok */ 1152 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1153 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1154 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1155 &tconn->conn_laddr_v6)) 1156 continue; 1157 /* These two conflict; fail */ 1158 break; 1159 } 1160 mutex_exit(&connfp->connf_lock); 1161 return (tconn != NULL); 1162 } 1163 1164 /* 1165 * (v4, v6) bind hash insertion routines 1166 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1167 */ 1168 1169 int 1170 ipcl_bind_insert(conn_t *connp) 1171 { 1172 if (connp->conn_ipversion == IPV6_VERSION) 1173 return (ipcl_bind_insert_v6(connp)); 1174 else 1175 return (ipcl_bind_insert_v4(connp)); 1176 } 1177 1178 int 1179 ipcl_bind_insert_v4(conn_t *connp) 1180 { 1181 connf_t *connfp; 1182 int ret = 0; 1183 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1184 uint16_t lport = connp->conn_lport; 1185 uint8_t protocol = connp->conn_proto; 1186 1187 if (IPCL_IS_IPTUN(connp)) 1188 return (ipcl_iptun_hash_insert(connp, ipst)); 1189 1190 switch (protocol) { 1191 default: 1192 if (is_system_labeled() && 1193 check_exempt_conflict_v4(connp, ipst)) 1194 return (EADDRINUSE); 1195 /* FALLTHROUGH */ 1196 case IPPROTO_UDP: 1197 if (protocol == IPPROTO_UDP) { 1198 connfp = &ipst->ips_ipcl_udp_fanout[ 1199 IPCL_UDP_HASH(lport, ipst)]; 1200 } else { 1201 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1202 } 1203 1204 if (connp->conn_faddr_v4 != INADDR_ANY) { 1205 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1206 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1207 IPCL_HASH_INSERT_BOUND(connfp, connp); 1208 } else { 1209 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1210 } 1211 if (protocol == IPPROTO_RSVP) 1212 ill_set_inputfn_all(ipst); 1213 break; 1214 1215 case IPPROTO_TCP: 1216 /* Insert it in the Bind Hash */ 1217 ASSERT(connp->conn_zoneid != ALL_ZONES); 1218 connfp = &ipst->ips_ipcl_bind_fanout[ 1219 IPCL_BIND_HASH(lport, ipst)]; 1220 if (connp->conn_laddr_v4 != INADDR_ANY) { 1221 IPCL_HASH_INSERT_BOUND(connfp, connp); 1222 } else { 1223 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1224 } 1225 if (cl_inet_listen != NULL) { 1226 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1227 connp->conn_flags |= IPCL_CL_LISTENER; 1228 (*cl_inet_listen)( 1229 connp->conn_netstack->netstack_stackid, 1230 IPPROTO_TCP, AF_INET, 1231 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1232 } 1233 break; 1234 1235 case IPPROTO_SCTP: 1236 ret = ipcl_sctp_hash_insert(connp, lport); 1237 break; 1238 } 1239 1240 return (ret); 1241 } 1242 1243 int 1244 ipcl_bind_insert_v6(conn_t *connp) 1245 { 1246 connf_t *connfp; 1247 int ret = 0; 1248 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1249 uint16_t lport = connp->conn_lport; 1250 uint8_t protocol = connp->conn_proto; 1251 1252 if (IPCL_IS_IPTUN(connp)) { 1253 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1254 } 1255 1256 switch (protocol) { 1257 default: 1258 if (is_system_labeled() && 1259 check_exempt_conflict_v6(connp, ipst)) 1260 return (EADDRINUSE); 1261 /* FALLTHROUGH */ 1262 case IPPROTO_UDP: 1263 if (protocol == IPPROTO_UDP) { 1264 connfp = &ipst->ips_ipcl_udp_fanout[ 1265 IPCL_UDP_HASH(lport, ipst)]; 1266 } else { 1267 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1268 } 1269 1270 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1271 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1272 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1273 IPCL_HASH_INSERT_BOUND(connfp, connp); 1274 } else { 1275 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1276 } 1277 break; 1278 1279 case IPPROTO_TCP: 1280 /* Insert it in the Bind Hash */ 1281 ASSERT(connp->conn_zoneid != ALL_ZONES); 1282 connfp = &ipst->ips_ipcl_bind_fanout[ 1283 IPCL_BIND_HASH(lport, ipst)]; 1284 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1285 IPCL_HASH_INSERT_BOUND(connfp, connp); 1286 } else { 1287 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1288 } 1289 if (cl_inet_listen != NULL) { 1290 sa_family_t addr_family; 1291 uint8_t *laddrp; 1292 1293 if (connp->conn_ipversion == 
IPV6_VERSION) { 1294 addr_family = AF_INET6; 1295 laddrp = 1296 (uint8_t *)&connp->conn_bound_addr_v6; 1297 } else { 1298 addr_family = AF_INET; 1299 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1300 } 1301 connp->conn_flags |= IPCL_CL_LISTENER; 1302 (*cl_inet_listen)( 1303 connp->conn_netstack->netstack_stackid, 1304 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1305 } 1306 break; 1307 1308 case IPPROTO_SCTP: 1309 ret = ipcl_sctp_hash_insert(connp, lport); 1310 break; 1311 } 1312 1313 return (ret); 1314 } 1315 1316 /* 1317 * ipcl_conn_hash insertion routines. 1318 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1319 */ 1320 1321 int 1322 ipcl_conn_insert(conn_t *connp) 1323 { 1324 if (connp->conn_ipversion == IPV6_VERSION) 1325 return (ipcl_conn_insert_v6(connp)); 1326 else 1327 return (ipcl_conn_insert_v4(connp)); 1328 } 1329 1330 int 1331 ipcl_conn_insert_v4(conn_t *connp) 1332 { 1333 connf_t *connfp; 1334 conn_t *tconnp; 1335 int ret = 0; 1336 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1337 uint16_t lport = connp->conn_lport; 1338 uint8_t protocol = connp->conn_proto; 1339 1340 if (IPCL_IS_IPTUN(connp)) 1341 return (ipcl_iptun_hash_insert(connp, ipst)); 1342 1343 switch (protocol) { 1344 case IPPROTO_TCP: 1345 /* 1346 * For TCP, we check whether the connection tuple already 1347 * exists before allowing the connection to proceed. We 1348 * also allow indexing on the zoneid. This is to allow 1349 * multiple shared stack zones to have the same tcp 1350 * connection tuple. In practice this only happens for 1351 * INADDR_LOOPBACK as it's the only local address which 1352 * doesn't have to be unique. 
1353 */ 1354 connfp = &ipst->ips_ipcl_conn_fanout[ 1355 IPCL_CONN_HASH(connp->conn_faddr_v4, 1356 connp->conn_ports, ipst)]; 1357 mutex_enter(&connfp->connf_lock); 1358 for (tconnp = connfp->connf_head; tconnp != NULL; 1359 tconnp = tconnp->conn_next) { 1360 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1361 connp->conn_faddr_v4, connp->conn_laddr_v4, 1362 connp->conn_ports) && 1363 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1364 /* Already have a conn. bail out */ 1365 mutex_exit(&connfp->connf_lock); 1366 return (EADDRINUSE); 1367 } 1368 } 1369 if (connp->conn_fanout != NULL) { 1370 /* 1371 * Probably a XTI/TLI application trying to do a 1372 * rebind. Let it happen. 1373 */ 1374 mutex_exit(&connfp->connf_lock); 1375 IPCL_HASH_REMOVE(connp); 1376 mutex_enter(&connfp->connf_lock); 1377 } 1378 1379 ASSERT(connp->conn_recv != NULL); 1380 ASSERT(connp->conn_recvicmp != NULL); 1381 1382 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1383 mutex_exit(&connfp->connf_lock); 1384 break; 1385 1386 case IPPROTO_SCTP: 1387 /* 1388 * The raw socket may have already been bound, remove it 1389 * from the hash first. 1390 */ 1391 IPCL_HASH_REMOVE(connp); 1392 ret = ipcl_sctp_hash_insert(connp, lport); 1393 break; 1394 1395 default: 1396 /* 1397 * Check for conflicts among MAC exempt bindings. For 1398 * transports with port numbers, this is done by the upper 1399 * level per-transport binding logic. For all others, it's 1400 * done here. 
1401 */ 1402 if (is_system_labeled() && 1403 check_exempt_conflict_v4(connp, ipst)) 1404 return (EADDRINUSE); 1405 /* FALLTHROUGH */ 1406 1407 case IPPROTO_UDP: 1408 if (protocol == IPPROTO_UDP) { 1409 connfp = &ipst->ips_ipcl_udp_fanout[ 1410 IPCL_UDP_HASH(lport, ipst)]; 1411 } else { 1412 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1413 } 1414 1415 if (connp->conn_faddr_v4 != INADDR_ANY) { 1416 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1417 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1418 IPCL_HASH_INSERT_BOUND(connfp, connp); 1419 } else { 1420 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1421 } 1422 break; 1423 } 1424 1425 return (ret); 1426 } 1427 1428 int 1429 ipcl_conn_insert_v6(conn_t *connp) 1430 { 1431 connf_t *connfp; 1432 conn_t *tconnp; 1433 int ret = 0; 1434 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1435 uint16_t lport = connp->conn_lport; 1436 uint8_t protocol = connp->conn_proto; 1437 uint_t ifindex = connp->conn_bound_if; 1438 1439 if (IPCL_IS_IPTUN(connp)) 1440 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1441 1442 switch (protocol) { 1443 case IPPROTO_TCP: 1444 1445 /* 1446 * For tcp, we check whether the connection tuple already 1447 * exists before allowing the connection to proceed. We 1448 * also allow indexing on the zoneid. This is to allow 1449 * multiple shared stack zones to have the same tcp 1450 * connection tuple. In practice this only happens for 1451 * ipv6_loopback as it's the only local address which 1452 * doesn't have to be unique. 1453 */ 1454 connfp = &ipst->ips_ipcl_conn_fanout[ 1455 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1456 ipst)]; 1457 mutex_enter(&connfp->connf_lock); 1458 for (tconnp = connfp->connf_head; tconnp != NULL; 1459 tconnp = tconnp->conn_next) { 1460 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1461 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1462 connp->conn_faddr_v6, connp->conn_laddr_v6, 1463 connp->conn_ports) && 1464 (tconnp->conn_bound_if == 0 || 1465 tconnp->conn_bound_if == ifindex) && 1466 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1467 /* Already have a conn. bail out */ 1468 mutex_exit(&connfp->connf_lock); 1469 return (EADDRINUSE); 1470 } 1471 } 1472 if (connp->conn_fanout != NULL) { 1473 /* 1474 * Probably a XTI/TLI application trying to do a 1475 * rebind. Let it happen. 1476 */ 1477 mutex_exit(&connfp->connf_lock); 1478 IPCL_HASH_REMOVE(connp); 1479 mutex_enter(&connfp->connf_lock); 1480 } 1481 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1482 mutex_exit(&connfp->connf_lock); 1483 break; 1484 1485 case IPPROTO_SCTP: 1486 IPCL_HASH_REMOVE(connp); 1487 ret = ipcl_sctp_hash_insert(connp, lport); 1488 break; 1489 1490 default: 1491 if (is_system_labeled() && 1492 check_exempt_conflict_v6(connp, ipst)) 1493 return (EADDRINUSE); 1494 /* FALLTHROUGH */ 1495 case IPPROTO_UDP: 1496 if (protocol == IPPROTO_UDP) { 1497 connfp = &ipst->ips_ipcl_udp_fanout[ 1498 IPCL_UDP_HASH(lport, ipst)]; 1499 } else { 1500 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1501 } 1502 1503 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1504 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1505 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1506 IPCL_HASH_INSERT_BOUND(connfp, connp); 1507 } else { 1508 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1509 } 1510 break; 1511 } 1512 1513 return (ret); 1514 } 1515 1516 /* 1517 * v4 packet classifying function. looks up the fanout table to 1518 * find the conn, the packet belongs to. returns the conn with 1519 * the reference held, null otherwise. 1520 * 1521 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1522 * Lookup" comment block are applied. Labels are also checked as described 1523 * above. 
If the packet is from the inside (looped back), and is from the same 1524 * zone, then label checks are omitted. 1525 */ 1526 conn_t * 1527 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1528 ip_recv_attr_t *ira, ip_stack_t *ipst) 1529 { 1530 ipha_t *ipha; 1531 connf_t *connfp, *bind_connfp; 1532 uint16_t lport; 1533 uint16_t fport; 1534 uint32_t ports; 1535 conn_t *connp; 1536 uint16_t *up; 1537 zoneid_t zoneid = ira->ira_zoneid; 1538 1539 ipha = (ipha_t *)mp->b_rptr; 1540 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1541 1542 switch (protocol) { 1543 case IPPROTO_TCP: 1544 ports = *(uint32_t *)up; 1545 connfp = 1546 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1547 ports, ipst)]; 1548 mutex_enter(&connfp->connf_lock); 1549 for (connp = connfp->connf_head; connp != NULL; 1550 connp = connp->conn_next) { 1551 if (IPCL_CONN_MATCH(connp, protocol, 1552 ipha->ipha_src, ipha->ipha_dst, ports) && 1553 (connp->conn_zoneid == zoneid || 1554 connp->conn_allzones || 1555 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1556 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1557 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1558 break; 1559 } 1560 1561 if (connp != NULL) { 1562 /* 1563 * We have a fully-bound TCP connection. 1564 * 1565 * For labeled systems, there's no need to check the 1566 * label here. It's known to be good as we checked 1567 * before allowing the connection to become bound. 
1568 */ 1569 CONN_INC_REF(connp); 1570 mutex_exit(&connfp->connf_lock); 1571 return (connp); 1572 } 1573 1574 mutex_exit(&connfp->connf_lock); 1575 lport = up[1]; 1576 bind_connfp = 1577 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1578 mutex_enter(&bind_connfp->connf_lock); 1579 for (connp = bind_connfp->connf_head; connp != NULL; 1580 connp = connp->conn_next) { 1581 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1582 lport) && 1583 (connp->conn_zoneid == zoneid || 1584 connp->conn_allzones || 1585 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1586 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1587 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1588 break; 1589 } 1590 1591 /* 1592 * If the matching connection is SLP on a private address, then 1593 * the label on the packet must match the local zone's label. 1594 * Otherwise, it must be in the label range defined by tnrh. 1595 * This is ensured by tsol_receive_local. 1596 * 1597 * Note that we don't check tsol_receive_local for 1598 * the connected case. 
1599 */ 1600 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1601 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1602 ira, connp)) { 1603 DTRACE_PROBE3(tx__ip__log__info__classify__tcp, 1604 char *, "connp(1) could not receive mp(2)", 1605 conn_t *, connp, mblk_t *, mp); 1606 connp = NULL; 1607 } 1608 1609 if (connp != NULL) { 1610 /* Have a listener at least */ 1611 CONN_INC_REF(connp); 1612 mutex_exit(&bind_connfp->connf_lock); 1613 return (connp); 1614 } 1615 1616 mutex_exit(&bind_connfp->connf_lock); 1617 break; 1618 1619 case IPPROTO_UDP: 1620 lport = up[1]; 1621 fport = up[0]; 1622 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1623 mutex_enter(&connfp->connf_lock); 1624 for (connp = connfp->connf_head; connp != NULL; 1625 connp = connp->conn_next) { 1626 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1627 fport, ipha->ipha_src) && 1628 (connp->conn_zoneid == zoneid || 1629 connp->conn_allzones || 1630 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1631 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) 1632 break; 1633 } 1634 1635 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1636 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1637 ira, connp)) { 1638 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1639 char *, "connp(1) could not receive mp(2)", 1640 conn_t *, connp, mblk_t *, mp); 1641 connp = NULL; 1642 } 1643 1644 if (connp != NULL) { 1645 CONN_INC_REF(connp); 1646 mutex_exit(&connfp->connf_lock); 1647 return (connp); 1648 } 1649 1650 /* 1651 * We shouldn't come here for multicast/broadcast packets 1652 */ 1653 mutex_exit(&connfp->connf_lock); 1654 1655 break; 1656 1657 case IPPROTO_ENCAP: 1658 case IPPROTO_IPV6: 1659 return (ipcl_iptun_classify_v4(&ipha->ipha_src, 1660 &ipha->ipha_dst, ipst)); 1661 } 1662 1663 return (NULL); 1664 } 1665 1666 conn_t * 1667 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1668 ip_recv_attr_t *ira, ip_stack_t *ipst) 1669 { 1670 ip6_t *ip6h; 1671 
connf_t *connfp, *bind_connfp; 1672 uint16_t lport; 1673 uint16_t fport; 1674 tcpha_t *tcpha; 1675 uint32_t ports; 1676 conn_t *connp; 1677 uint16_t *up; 1678 zoneid_t zoneid = ira->ira_zoneid; 1679 1680 ip6h = (ip6_t *)mp->b_rptr; 1681 1682 switch (protocol) { 1683 case IPPROTO_TCP: 1684 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; 1685 up = &tcpha->tha_lport; 1686 ports = *(uint32_t *)up; 1687 1688 connfp = 1689 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1690 ports, ipst)]; 1691 mutex_enter(&connfp->connf_lock); 1692 for (connp = connfp->connf_head; connp != NULL; 1693 connp = connp->conn_next) { 1694 if (IPCL_CONN_MATCH_V6(connp, protocol, 1695 ip6h->ip6_src, ip6h->ip6_dst, ports) && 1696 (connp->conn_zoneid == zoneid || 1697 connp->conn_allzones || 1698 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1699 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1700 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1701 break; 1702 } 1703 1704 if (connp != NULL) { 1705 /* 1706 * We have a fully-bound TCP connection. 1707 * 1708 * For labeled systems, there's no need to check the 1709 * label here. It's known to be good as we checked 1710 * before allowing the connection to become bound. 
1711 */ 1712 CONN_INC_REF(connp); 1713 mutex_exit(&connfp->connf_lock); 1714 return (connp); 1715 } 1716 1717 mutex_exit(&connfp->connf_lock); 1718 1719 lport = up[1]; 1720 bind_connfp = 1721 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1722 mutex_enter(&bind_connfp->connf_lock); 1723 for (connp = bind_connfp->connf_head; connp != NULL; 1724 connp = connp->conn_next) { 1725 if (IPCL_BIND_MATCH_V6(connp, protocol, 1726 ip6h->ip6_dst, lport) && 1727 (connp->conn_zoneid == zoneid || 1728 connp->conn_allzones || 1729 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1730 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1731 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1732 break; 1733 } 1734 1735 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1736 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1737 ira, connp)) { 1738 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1739 char *, "connp(1) could not receive mp(2)", 1740 conn_t *, connp, mblk_t *, mp); 1741 connp = NULL; 1742 } 1743 1744 if (connp != NULL) { 1745 /* Have a listner at least */ 1746 CONN_INC_REF(connp); 1747 mutex_exit(&bind_connfp->connf_lock); 1748 return (connp); 1749 } 1750 1751 mutex_exit(&bind_connfp->connf_lock); 1752 break; 1753 1754 case IPPROTO_UDP: 1755 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1756 lport = up[1]; 1757 fport = up[0]; 1758 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1759 mutex_enter(&connfp->connf_lock); 1760 for (connp = connfp->connf_head; connp != NULL; 1761 connp = connp->conn_next) { 1762 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1763 fport, ip6h->ip6_src) && 1764 (connp->conn_zoneid == zoneid || 1765 connp->conn_allzones || 1766 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1767 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1768 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1769 break; 1770 } 1771 1772 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1773 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 
1774 ira, connp)) { 1775 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1776 char *, "connp(1) could not receive mp(2)", 1777 conn_t *, connp, mblk_t *, mp); 1778 connp = NULL; 1779 } 1780 1781 if (connp != NULL) { 1782 CONN_INC_REF(connp); 1783 mutex_exit(&connfp->connf_lock); 1784 return (connp); 1785 } 1786 1787 /* 1788 * We shouldn't come here for multicast/broadcast packets 1789 */ 1790 mutex_exit(&connfp->connf_lock); 1791 break; 1792 case IPPROTO_ENCAP: 1793 case IPPROTO_IPV6: 1794 return (ipcl_iptun_classify_v6(&ip6h->ip6_src, 1795 &ip6h->ip6_dst, ipst)); 1796 } 1797 1798 return (NULL); 1799 } 1800 1801 /* 1802 * wrapper around ipcl_classify_(v4,v6) routines. 1803 */ 1804 conn_t * 1805 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 1806 { 1807 if (ira->ira_flags & IRAF_IS_IPV4) { 1808 return (ipcl_classify_v4(mp, ira->ira_protocol, 1809 ira->ira_ip_hdr_length, ira, ipst)); 1810 } else { 1811 return (ipcl_classify_v6(mp, ira->ira_protocol, 1812 ira->ira_ip_hdr_length, ira, ipst)); 1813 } 1814 } 1815 1816 /* 1817 * Only used to classify SCTP RAW sockets 1818 */ 1819 conn_t * 1820 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, 1821 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) 1822 { 1823 connf_t *connfp; 1824 conn_t *connp; 1825 in_port_t lport; 1826 int ipversion; 1827 const void *dst; 1828 zoneid_t zoneid = ira->ira_zoneid; 1829 1830 lport = ((uint16_t *)&ports)[1]; 1831 if (ira->ira_flags & IRAF_IS_IPV4) { 1832 dst = (const void *)&ipha->ipha_dst; 1833 ipversion = IPV4_VERSION; 1834 } else { 1835 dst = (const void *)&ip6h->ip6_dst; 1836 ipversion = IPV6_VERSION; 1837 } 1838 1839 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1840 mutex_enter(&connfp->connf_lock); 1841 for (connp = connfp->connf_head; connp != NULL; 1842 connp = connp->conn_next) { 1843 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1844 if (ipversion != connp->conn_ipversion) 1845 continue; 1846 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1847 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1848 if (ipversion == IPV4_VERSION) { 1849 if (!IPCL_CONN_MATCH(connp, protocol, 1850 ipha->ipha_src, ipha->ipha_dst, ports)) 1851 continue; 1852 } else { 1853 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1854 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1855 continue; 1856 } 1857 } else { 1858 if (ipversion == IPV4_VERSION) { 1859 if (!IPCL_BIND_MATCH(connp, protocol, 1860 ipha->ipha_dst, lport)) 1861 continue; 1862 } else { 1863 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1864 ip6h->ip6_dst, lport)) 1865 continue; 1866 } 1867 } 1868 1869 if (connp->conn_zoneid == zoneid || 1870 connp->conn_allzones || 1871 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1872 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1873 (ira->ira_flags & IRAF_TX_SHARED_ADDR))) 1874 break; 1875 } 1876 1877 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1878 !tsol_receive_local(mp, dst, ipversion, ira, connp)) { 1879 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1880 char *, "connp(1) could not receive mp(2)", 1881 conn_t *, connp, mblk_t *, mp); 1882 connp = NULL; 1883 } 1884 1885 if (connp != NULL) 1886 goto found; 1887 mutex_exit(&connfp->connf_lock); 1888 1889 /* Try to look for a wildcard SCTP RAW socket match. */ 1890 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1891 mutex_enter(&connfp->connf_lock); 1892 for (connp = connfp->connf_head; connp != NULL; 1893 connp = connp->conn_next) { 1894 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1895 if (ipversion != connp->conn_ipversion) 1896 continue; 1897 if (!IPCL_ZONE_MATCH(connp, zoneid)) 1898 continue; 1899 1900 if (ipversion == IPV4_VERSION) { 1901 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) 1902 break; 1903 } else { 1904 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { 1905 break; 1906 } 1907 } 1908 } 1909 1910 if (connp != NULL) 1911 goto found; 1912 1913 mutex_exit(&connfp->connf_lock); 1914 return (NULL); 1915 1916 found: 1917 ASSERT(connp != NULL); 1918 CONN_INC_REF(connp); 1919 mutex_exit(&connfp->connf_lock); 1920 return (connp); 1921 } 1922 1923 /* ARGSUSED */ 1924 static int 1925 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 1926 { 1927 itc_t *itc = (itc_t *)buf; 1928 conn_t *connp = &itc->itc_conn; 1929 tcp_t *tcp = (tcp_t *)&itc[1]; 1930 1931 bzero(connp, sizeof (conn_t)); 1932 bzero(tcp, sizeof (tcp_t)); 1933 1934 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1935 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1936 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 1937 tcp->tcp_timercache = tcp_timermp_alloc(kmflags); 1938 if (tcp->tcp_timercache == NULL) 1939 return (ENOMEM); 1940 connp->conn_tcp = tcp; 1941 connp->conn_flags = IPCL_TCPCONN; 1942 connp->conn_proto = IPPROTO_TCP; 1943 tcp->tcp_connp = connp; 1944 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1945 1946 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1947 if (connp->conn_ixa == NULL) { 1948 tcp_timermp_free(tcp); 1949 return (ENOMEM); 1950 } 1951 connp->conn_ixa->ixa_refcnt = 1; 1952 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1953 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1954 return (0); 1955 } 1956 1957 /* ARGSUSED */ 1958 static void 1959 tcp_conn_destructor(void *buf, void *cdrarg) 1960 { 1961 itc_t *itc = (itc_t *)buf; 1962 conn_t *connp = &itc->itc_conn; 1963 tcp_t *tcp = (tcp_t *)&itc[1]; 1964 1965 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1966 
ASSERT(tcp->tcp_connp == connp); 1967 ASSERT(connp->conn_tcp == tcp); 1968 tcp_timermp_free(tcp); 1969 mutex_destroy(&connp->conn_lock); 1970 cv_destroy(&connp->conn_cv); 1971 cv_destroy(&connp->conn_sq_cv); 1972 rw_destroy(&connp->conn_ilg_lock); 1973 1974 /* Can be NULL if constructor failed */ 1975 if (connp->conn_ixa != NULL) { 1976 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 1977 ASSERT(connp->conn_ixa->ixa_ire == NULL); 1978 ASSERT(connp->conn_ixa->ixa_nce == NULL); 1979 ixa_refrele(connp->conn_ixa); 1980 } 1981 } 1982 1983 /* ARGSUSED */ 1984 static int 1985 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 1986 { 1987 itc_t *itc = (itc_t *)buf; 1988 conn_t *connp = &itc->itc_conn; 1989 1990 bzero(connp, sizeof (conn_t)); 1991 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1992 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1993 connp->conn_flags = IPCL_IPCCONN; 1994 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1995 1996 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1997 if (connp->conn_ixa == NULL) 1998 return (ENOMEM); 1999 connp->conn_ixa->ixa_refcnt = 1; 2000 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2001 return (0); 2002 } 2003 2004 /* ARGSUSED */ 2005 static void 2006 ip_conn_destructor(void *buf, void *cdrarg) 2007 { 2008 itc_t *itc = (itc_t *)buf; 2009 conn_t *connp = &itc->itc_conn; 2010 2011 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2012 ASSERT(connp->conn_priv == NULL); 2013 mutex_destroy(&connp->conn_lock); 2014 cv_destroy(&connp->conn_cv); 2015 rw_destroy(&connp->conn_ilg_lock); 2016 2017 /* Can be NULL if constructor failed */ 2018 if (connp->conn_ixa != NULL) { 2019 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2020 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2021 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2022 ixa_refrele(connp->conn_ixa); 2023 } 2024 } 2025 2026 /* ARGSUSED */ 2027 static int 2028 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2029 { 2030 itc_t *itc = 
(itc_t *)buf; 2031 conn_t *connp = &itc->itc_conn; 2032 udp_t *udp = (udp_t *)&itc[1]; 2033 2034 bzero(connp, sizeof (conn_t)); 2035 bzero(udp, sizeof (udp_t)); 2036 2037 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2038 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2039 connp->conn_udp = udp; 2040 connp->conn_flags = IPCL_UDPCONN; 2041 connp->conn_proto = IPPROTO_UDP; 2042 udp->udp_connp = connp; 2043 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2044 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2045 if (connp->conn_ixa == NULL) 2046 return (ENOMEM); 2047 connp->conn_ixa->ixa_refcnt = 1; 2048 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2049 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2050 return (0); 2051 } 2052 2053 /* ARGSUSED */ 2054 static void 2055 udp_conn_destructor(void *buf, void *cdrarg) 2056 { 2057 itc_t *itc = (itc_t *)buf; 2058 conn_t *connp = &itc->itc_conn; 2059 udp_t *udp = (udp_t *)&itc[1]; 2060 2061 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2062 ASSERT(udp->udp_connp == connp); 2063 ASSERT(connp->conn_udp == udp); 2064 mutex_destroy(&connp->conn_lock); 2065 cv_destroy(&connp->conn_cv); 2066 rw_destroy(&connp->conn_ilg_lock); 2067 2068 /* Can be NULL if constructor failed */ 2069 if (connp->conn_ixa != NULL) { 2070 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2071 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2072 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2073 ixa_refrele(connp->conn_ixa); 2074 } 2075 } 2076 2077 /* ARGSUSED */ 2078 static int 2079 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2080 { 2081 itc_t *itc = (itc_t *)buf; 2082 conn_t *connp = &itc->itc_conn; 2083 icmp_t *icmp = (icmp_t *)&itc[1]; 2084 2085 bzero(connp, sizeof (conn_t)); 2086 bzero(icmp, sizeof (icmp_t)); 2087 2088 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2089 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2090 connp->conn_icmp = icmp; 2091 connp->conn_flags = IPCL_RAWIPCONN; 
2092 connp->conn_proto = IPPROTO_ICMP; 2093 icmp->icmp_connp = connp; 2094 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2095 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2096 if (connp->conn_ixa == NULL) 2097 return (ENOMEM); 2098 connp->conn_ixa->ixa_refcnt = 1; 2099 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2100 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2101 return (0); 2102 } 2103 2104 /* ARGSUSED */ 2105 static void 2106 rawip_conn_destructor(void *buf, void *cdrarg) 2107 { 2108 itc_t *itc = (itc_t *)buf; 2109 conn_t *connp = &itc->itc_conn; 2110 icmp_t *icmp = (icmp_t *)&itc[1]; 2111 2112 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2113 ASSERT(icmp->icmp_connp == connp); 2114 ASSERT(connp->conn_icmp == icmp); 2115 mutex_destroy(&connp->conn_lock); 2116 cv_destroy(&connp->conn_cv); 2117 rw_destroy(&connp->conn_ilg_lock); 2118 2119 /* Can be NULL if constructor failed */ 2120 if (connp->conn_ixa != NULL) { 2121 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2122 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2123 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2124 ixa_refrele(connp->conn_ixa); 2125 } 2126 } 2127 2128 /* ARGSUSED */ 2129 static int 2130 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2131 { 2132 itc_t *itc = (itc_t *)buf; 2133 conn_t *connp = &itc->itc_conn; 2134 rts_t *rts = (rts_t *)&itc[1]; 2135 2136 bzero(connp, sizeof (conn_t)); 2137 bzero(rts, sizeof (rts_t)); 2138 2139 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2140 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2141 connp->conn_rts = rts; 2142 connp->conn_flags = IPCL_RTSCONN; 2143 rts->rts_connp = connp; 2144 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2145 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2146 if (connp->conn_ixa == NULL) 2147 return (ENOMEM); 2148 connp->conn_ixa->ixa_refcnt = 1; 2149 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2150 return (0); 2151 } 2152 
2153 /* ARGSUSED */ 2154 static void 2155 rts_conn_destructor(void *buf, void *cdrarg) 2156 { 2157 itc_t *itc = (itc_t *)buf; 2158 conn_t *connp = &itc->itc_conn; 2159 rts_t *rts = (rts_t *)&itc[1]; 2160 2161 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2162 ASSERT(rts->rts_connp == connp); 2163 ASSERT(connp->conn_rts == rts); 2164 mutex_destroy(&connp->conn_lock); 2165 cv_destroy(&connp->conn_cv); 2166 rw_destroy(&connp->conn_ilg_lock); 2167 2168 /* Can be NULL if constructor failed */ 2169 if (connp->conn_ixa != NULL) { 2170 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2171 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2172 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2173 ixa_refrele(connp->conn_ixa); 2174 } 2175 } 2176 2177 /* 2178 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2179 * in the conn_t. 2180 * 2181 * Below we list all the pointers in the conn_t as a documentation aid. 2182 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2183 * If you add any pointers to the conn_t please add an ASSERT here 2184 * and #ifdef it out if it can't be actually asserted to be NULL. 2185 * In any case, we bzero most of the conn_t at the end of the function. 
2186 */ 2187 void 2188 ipcl_conn_cleanup(conn_t *connp) 2189 { 2190 ip_xmit_attr_t *ixa; 2191 2192 ASSERT(connp->conn_latch == NULL); 2193 ASSERT(connp->conn_latch_in_policy == NULL); 2194 ASSERT(connp->conn_latch_in_action == NULL); 2195 #ifdef notdef 2196 ASSERT(connp->conn_rq == NULL); 2197 ASSERT(connp->conn_wq == NULL); 2198 #endif 2199 ASSERT(connp->conn_cred == NULL); 2200 ASSERT(connp->conn_g_fanout == NULL); 2201 ASSERT(connp->conn_g_next == NULL); 2202 ASSERT(connp->conn_g_prev == NULL); 2203 ASSERT(connp->conn_policy == NULL); 2204 ASSERT(connp->conn_fanout == NULL); 2205 ASSERT(connp->conn_next == NULL); 2206 ASSERT(connp->conn_prev == NULL); 2207 ASSERT(connp->conn_oper_pending_ill == NULL); 2208 ASSERT(connp->conn_ilg == NULL); 2209 ASSERT(connp->conn_drain_next == NULL); 2210 ASSERT(connp->conn_drain_prev == NULL); 2211 #ifdef notdef 2212 /* conn_idl is not cleared when removed from idl list */ 2213 ASSERT(connp->conn_idl == NULL); 2214 #endif 2215 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2216 #ifdef notdef 2217 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ 2218 ASSERT(connp->conn_netstack == NULL); 2219 #endif 2220 2221 ASSERT(connp->conn_helper_info == NULL); 2222 ASSERT(connp->conn_ixa != NULL); 2223 ixa = connp->conn_ixa; 2224 ASSERT(ixa->ixa_refcnt == 1); 2225 /* Need to preserve ixa_protocol */ 2226 ixa_cleanup(ixa); 2227 ixa->ixa_flags = 0; 2228 2229 /* Clear out the conn_t fields that are not preserved */ 2230 bzero(&connp->conn_start_clr, 2231 sizeof (conn_t) - 2232 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2233 } 2234 2235 /* 2236 * All conns are inserted in a global multi-list for the benefit of 2237 * walkers. The walk is guaranteed to walk all open conns at the time 2238 * of the start of the walk exactly once. This property is needed to 2239 * achieve some cleanups during unplumb of interfaces. This is achieved 2240 * as follows. 
2241 * 2242 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2243 * call the insert and delete functions below at creation and deletion 2244 * time respectively. The conn never moves or changes its position in this 2245 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2246 * won't increase due to walkers, once the conn deletion has started. Note 2247 * that we can't remove the conn from the global list and then wait for 2248 * the refcnt to drop to zero, since walkers would then see a truncated 2249 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2250 * conns until ip_open is ready to make them globally visible. 2251 * The global round robin multi-list locks are held only to get the 2252 * next member/insertion/deletion and contention should be negligible 2253 * if the multi-list is much greater than the number of cpus. 2254 */ 2255 void 2256 ipcl_globalhash_insert(conn_t *connp) 2257 { 2258 int index; 2259 struct connf_s *connfp; 2260 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2261 2262 /* 2263 * No need for atomic here. Approximate even distribution 2264 * in the global lists is sufficient. 2265 */ 2266 ipst->ips_conn_g_index++; 2267 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2268 2269 connp->conn_g_prev = NULL; 2270 /* 2271 * Mark as INCIPIENT, so that walkers will ignore this 2272 * for now, till ip_open is ready to make it visible globally. 
2273 */ 2274 connp->conn_state_flags |= CONN_INCIPIENT; 2275 2276 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2277 /* Insert at the head of the list */ 2278 mutex_enter(&connfp->connf_lock); 2279 connp->conn_g_next = connfp->connf_head; 2280 if (connp->conn_g_next != NULL) 2281 connp->conn_g_next->conn_g_prev = connp; 2282 connfp->connf_head = connp; 2283 2284 /* The fanout bucket this conn points to */ 2285 connp->conn_g_fanout = connfp; 2286 2287 mutex_exit(&connfp->connf_lock); 2288 } 2289 2290 void 2291 ipcl_globalhash_remove(conn_t *connp) 2292 { 2293 struct connf_s *connfp; 2294 2295 /* 2296 * We were never inserted in the global multi list. 2297 * IPCL_NONE variety is never inserted in the global multilist 2298 * since it is presumed to not need any cleanup and is transient. 2299 */ 2300 if (connp->conn_g_fanout == NULL) 2301 return; 2302 2303 connfp = connp->conn_g_fanout; 2304 mutex_enter(&connfp->connf_lock); 2305 if (connp->conn_g_prev != NULL) 2306 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2307 else 2308 connfp->connf_head = connp->conn_g_next; 2309 if (connp->conn_g_next != NULL) 2310 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2311 mutex_exit(&connfp->connf_lock); 2312 2313 /* Better to stumble on a null pointer than to corrupt memory */ 2314 connp->conn_g_next = NULL; 2315 connp->conn_g_prev = NULL; 2316 connp->conn_g_fanout = NULL; 2317 } 2318 2319 /* 2320 * Walk the list of all conn_t's in the system, calling the function provided 2321 * With the specified argument for each. 2322 * Applies to both IPv4 and IPv6. 2323 * 2324 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2325 * conn_oper_pending_ill). To guard against stale pointers 2326 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2327 * unplumbed or removed. New conn_t's that are created while we are walking 2328 * may be missed by this walk, because they are not necessarily inserted 2329 * at the tail of the list. 
They are new conn_t's and thus don't have any 2330 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2331 * is created to the struct that is going away. 2332 */ 2333 void 2334 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2335 { 2336 int i; 2337 conn_t *connp; 2338 conn_t *prev_connp; 2339 2340 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2341 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2342 prev_connp = NULL; 2343 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2344 while (connp != NULL) { 2345 mutex_enter(&connp->conn_lock); 2346 if (connp->conn_state_flags & 2347 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2348 mutex_exit(&connp->conn_lock); 2349 connp = connp->conn_g_next; 2350 continue; 2351 } 2352 CONN_INC_REF_LOCKED(connp); 2353 mutex_exit(&connp->conn_lock); 2354 mutex_exit( 2355 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2356 (*func)(connp, arg); 2357 if (prev_connp != NULL) 2358 CONN_DEC_REF(prev_connp); 2359 mutex_enter( 2360 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2361 prev_connp = connp; 2362 connp = connp->conn_g_next; 2363 } 2364 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2365 if (prev_connp != NULL) 2366 CONN_DEC_REF(prev_connp); 2367 } 2368 } 2369 2370 /* 2371 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2372 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2373 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2374 * (peer tcp in ESTABLISHED state). 2375 */ 2376 conn_t * 2377 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, 2378 ip_stack_t *ipst) 2379 { 2380 uint32_t ports; 2381 uint16_t *pports = (uint16_t *)&ports; 2382 connf_t *connfp; 2383 conn_t *tconnp; 2384 boolean_t zone_chk; 2385 2386 /* 2387 * If either the source of destination address is loopback, then 2388 * both endpoints must be in the same Zone. 
Otherwise, both of 2389 * the addresses are system-wide unique (tcp is in ESTABLISHED 2390 * state) and the endpoints may reside in different Zones. 2391 */ 2392 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2393 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2394 2395 pports[0] = tcpha->tha_fport; 2396 pports[1] = tcpha->tha_lport; 2397 2398 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2399 ports, ipst)]; 2400 2401 mutex_enter(&connfp->connf_lock); 2402 for (tconnp = connfp->connf_head; tconnp != NULL; 2403 tconnp = tconnp->conn_next) { 2404 2405 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2406 ipha->ipha_dst, ipha->ipha_src, ports) && 2407 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2408 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2409 2410 ASSERT(tconnp != connp); 2411 CONN_INC_REF(tconnp); 2412 mutex_exit(&connfp->connf_lock); 2413 return (tconnp); 2414 } 2415 } 2416 mutex_exit(&connfp->connf_lock); 2417 return (NULL); 2418 } 2419 2420 /* 2421 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2422 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2423 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2424 * (peer tcp in ESTABLISHED state). 2425 */ 2426 conn_t * 2427 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, 2428 ip_stack_t *ipst) 2429 { 2430 uint32_t ports; 2431 uint16_t *pports = (uint16_t *)&ports; 2432 connf_t *connfp; 2433 conn_t *tconnp; 2434 boolean_t zone_chk; 2435 2436 /* 2437 * If either the source of destination address is loopback, then 2438 * both endpoints must be in the same Zone. Otherwise, both of 2439 * the addresses are system-wide unique (tcp is in ESTABLISHED 2440 * state) and the endpoints may reside in different Zones. 
We 2441 * don't do Zone check for link local address(es) because the 2442 * current Zone implementation treats each link local address as 2443 * being unique per system node, i.e. they belong to global Zone. 2444 */ 2445 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2446 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2447 2448 pports[0] = tcpha->tha_fport; 2449 pports[1] = tcpha->tha_lport; 2450 2451 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2452 ports, ipst)]; 2453 2454 mutex_enter(&connfp->connf_lock); 2455 for (tconnp = connfp->connf_head; tconnp != NULL; 2456 tconnp = tconnp->conn_next) { 2457 2458 /* We skip conn_bound_if check here as this is loopback tcp */ 2459 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2460 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2461 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2462 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2463 2464 ASSERT(tconnp != connp); 2465 CONN_INC_REF(tconnp); 2466 mutex_exit(&connfp->connf_lock); 2467 return (tconnp); 2468 } 2469 } 2470 mutex_exit(&connfp->connf_lock); 2471 return (NULL); 2472 } 2473 2474 /* 2475 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2476 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2477 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2478 */ 2479 conn_t * 2480 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, 2481 ip_stack_t *ipst) 2482 { 2483 uint32_t ports; 2484 uint16_t *pports; 2485 connf_t *connfp; 2486 conn_t *tconnp; 2487 2488 pports = (uint16_t *)&ports; 2489 pports[0] = tcpha->tha_fport; 2490 pports[1] = tcpha->tha_lport; 2491 2492 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2493 ports, ipst)]; 2494 2495 mutex_enter(&connfp->connf_lock); 2496 for (tconnp = connfp->connf_head; tconnp != NULL; 2497 tconnp = tconnp->conn_next) { 2498 2499 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2500 ipha->ipha_dst, ipha->ipha_src, ports) && 2501 tconnp->conn_tcp->tcp_state >= min_state) { 2502 2503 CONN_INC_REF(tconnp); 2504 mutex_exit(&connfp->connf_lock); 2505 return (tconnp); 2506 } 2507 } 2508 mutex_exit(&connfp->connf_lock); 2509 return (NULL); 2510 } 2511 2512 /* 2513 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2514 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2515 * Only checks for connected entries i.e. no INADDR_ANY checks. 2516 * Match on ifindex in addition to addresses. 
2517 */ 2518 conn_t * 2519 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2520 uint_t ifindex, ip_stack_t *ipst) 2521 { 2522 tcp_t *tcp; 2523 uint32_t ports; 2524 uint16_t *pports; 2525 connf_t *connfp; 2526 conn_t *tconnp; 2527 2528 pports = (uint16_t *)&ports; 2529 pports[0] = tcpha->tha_fport; 2530 pports[1] = tcpha->tha_lport; 2531 2532 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2533 ports, ipst)]; 2534 2535 mutex_enter(&connfp->connf_lock); 2536 for (tconnp = connfp->connf_head; tconnp != NULL; 2537 tconnp = tconnp->conn_next) { 2538 2539 tcp = tconnp->conn_tcp; 2540 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2541 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2542 tcp->tcp_state >= min_state && 2543 (tconnp->conn_bound_if == 0 || 2544 tconnp->conn_bound_if == ifindex)) { 2545 2546 CONN_INC_REF(tconnp); 2547 mutex_exit(&connfp->connf_lock); 2548 return (tconnp); 2549 } 2550 } 2551 mutex_exit(&connfp->connf_lock); 2552 return (NULL); 2553 } 2554 2555 /* 2556 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2557 * a listener when changing state. 2558 */ 2559 conn_t * 2560 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2561 ip_stack_t *ipst) 2562 { 2563 connf_t *bind_connfp; 2564 conn_t *connp; 2565 tcp_t *tcp; 2566 2567 /* 2568 * Avoid false matches for packets sent to an IP destination of 2569 * all zeros. 
2570 */ 2571 if (laddr == 0) 2572 return (NULL); 2573 2574 ASSERT(zoneid != ALL_ZONES); 2575 2576 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2577 mutex_enter(&bind_connfp->connf_lock); 2578 for (connp = bind_connfp->connf_head; connp != NULL; 2579 connp = connp->conn_next) { 2580 tcp = connp->conn_tcp; 2581 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2582 IPCL_ZONE_MATCH(connp, zoneid) && 2583 (tcp->tcp_listener == NULL)) { 2584 CONN_INC_REF(connp); 2585 mutex_exit(&bind_connfp->connf_lock); 2586 return (connp); 2587 } 2588 } 2589 mutex_exit(&bind_connfp->connf_lock); 2590 return (NULL); 2591 } 2592 2593 /* 2594 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2595 * a listener when changing state. 2596 */ 2597 conn_t * 2598 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2599 zoneid_t zoneid, ip_stack_t *ipst) 2600 { 2601 connf_t *bind_connfp; 2602 conn_t *connp = NULL; 2603 tcp_t *tcp; 2604 2605 /* 2606 * Avoid false matches for packets sent to an IP destination of 2607 * all zeros. 2608 */ 2609 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2610 return (NULL); 2611 2612 ASSERT(zoneid != ALL_ZONES); 2613 2614 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2615 mutex_enter(&bind_connfp->connf_lock); 2616 for (connp = bind_connfp->connf_head; connp != NULL; 2617 connp = connp->conn_next) { 2618 tcp = connp->conn_tcp; 2619 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2620 IPCL_ZONE_MATCH(connp, zoneid) && 2621 (connp->conn_bound_if == 0 || 2622 connp->conn_bound_if == ifindex) && 2623 tcp->tcp_listener == NULL) { 2624 CONN_INC_REF(connp); 2625 mutex_exit(&bind_connfp->connf_lock); 2626 return (connp); 2627 } 2628 } 2629 mutex_exit(&bind_connfp->connf_lock); 2630 return (NULL); 2631 } 2632 2633 /* 2634 * ipcl_get_next_conn 2635 * get the next entry in the conn global list 2636 * and put a reference on the next_conn. 
2637 * decrement the reference on the current conn. 2638 * 2639 * This is an iterator based walker function that also provides for 2640 * some selection by the caller. It walks through the conn_hash bucket 2641 * searching for the next valid connp in the list, and selects connections 2642 * that are neither closed nor condemned. It also REFHOLDS the conn 2643 * thus ensuring that the conn exists when the caller uses the conn. 2644 */ 2645 conn_t * 2646 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2647 { 2648 conn_t *next_connp; 2649 2650 if (connfp == NULL) 2651 return (NULL); 2652 2653 mutex_enter(&connfp->connf_lock); 2654 2655 next_connp = (connp == NULL) ? 2656 connfp->connf_head : connp->conn_g_next; 2657 2658 while (next_connp != NULL) { 2659 mutex_enter(&next_connp->conn_lock); 2660 if (!(next_connp->conn_flags & conn_flags) || 2661 (next_connp->conn_state_flags & 2662 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2663 /* 2664 * This conn has been condemned or 2665 * is closing, or the flags don't match 2666 */ 2667 mutex_exit(&next_connp->conn_lock); 2668 next_connp = next_connp->conn_g_next; 2669 continue; 2670 } 2671 CONN_INC_REF_LOCKED(next_connp); 2672 mutex_exit(&next_connp->conn_lock); 2673 break; 2674 } 2675 2676 mutex_exit(&connfp->connf_lock); 2677 2678 if (connp != NULL) 2679 CONN_DEC_REF(connp); 2680 2681 return (next_connp); 2682 } 2683 2684 #ifdef CONN_DEBUG 2685 /* 2686 * Trace of the last NBUF refhold/refrele 2687 */ 2688 int 2689 conn_trace_ref(conn_t *connp) 2690 { 2691 int last; 2692 conn_trace_t *ctb; 2693 2694 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2695 last = connp->conn_trace_last; 2696 last++; 2697 if (last == CONN_TRACE_MAX) 2698 last = 0; 2699 2700 ctb = &connp->conn_trace_buf[last]; 2701 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2702 connp->conn_trace_last = last; 2703 return (1); 2704 } 2705 2706 int 2707 conn_untrace_ref(conn_t *connp) 2708 { 2709 int last; 2710 conn_trace_t *ctb; 2711 2712 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2713 last = connp->conn_trace_last; 2714 last++; 2715 if (last == CONN_TRACE_MAX) 2716 last = 0; 2717 2718 ctb = &connp->conn_trace_buf[last]; 2719 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2720 connp->conn_trace_last = last; 2721 return (1); 2722 } 2723 #endif 2724