1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 24 * Copyright 2022 Joyent, Inc. 25 */ 26 27 /* 28 * IP PACKET CLASSIFIER 29 * 30 * The IP packet classifier provides mapping between IP packets and persistent 31 * connection state for connection-oriented protocols. It also provides 32 * interface for managing connection states. 33 * 34 * The connection state is kept in conn_t data structure and contains, among 35 * other things: 36 * 37 * o local/remote address and ports 38 * o Transport protocol 39 * o squeue for the connection (for TCP only) 40 * o reference counter 41 * o Connection state 42 * o hash table linkage 43 * o interface/ire information 44 * o credentials 45 * o ipsec policy 46 * o send and receive functions. 47 * o mutex lock. 48 * 49 * Connections use a reference counting scheme. They are freed when the 50 * reference counter drops to zero. 
A reference is incremented when connection 51 * is placed in a list or table, when incoming packet for the connection arrives 52 * and when connection is processed via squeue (squeue processing may be 53 * asynchronous and the reference protects the connection from being destroyed 54 * before its processing is finished). 55 * 56 * conn_recv is used to pass up packets to the ULP. 57 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 58 * a listener, and changes to tcp_input_listener as the listener has picked a 59 * good squeue. For other cases it is set to tcp_input_data. 60 * 61 * conn_recvicmp is used to pass up ICMP errors to the ULP. 62 * 63 * Classifier uses several hash tables: 64 * 65 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 66 * ipcl_bind_fanout: contains all connections in BOUND state 67 * ipcl_proto_fanout: IPv4 protocol fanout 68 * ipcl_proto_fanout_v6: IPv6 protocol fanout 69 * ipcl_udp_fanout: contains all UDP connections 70 * ipcl_iptun_fanout: contains all IP tunnel connections 71 * ipcl_globalhash_fanout: contains all connections 72 * 73 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 74 * which need to view all existing connections. 75 * 76 * All tables are protected by per-bucket locks. When both per-bucket lock and 77 * connection lock need to be held, the per-bucket lock should be acquired 78 * first, followed by the connection lock. 79 * 80 * All functions doing search in one of these tables increment a reference 81 * counter on the connection found (if any). This reference should be dropped 82 * when the caller has finished processing the connection. 83 * 84 * 85 * INTERFACES: 86 * =========== 87 * 88 * Connection Lookup: 89 * ------------------ 90 * 91 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 92 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 93 * 94 * Finds connection for an incoming IPv4 or IPv6 packet. 
Returns NULL if 95 * it can't find any associated connection. If the connection is found, its 96 * reference counter is incremented. 97 * 98 * mp: mblock, containing packet header. The full header should fit 99 * into a single mblock. It should also contain at least full IP 100 * and TCP or UDP header. 101 * 102 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 103 * 104 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 105 * the packet. 106 * 107 * ira->ira_zoneid: The zone in which the returned connection must be; the 108 * zoneid corresponding to the ire_zoneid on the IRE located for 109 * the packet's destination address. 110 * 111 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and 112 * IRAF_TX_SHARED_ADDR flags 113 * 114 * For TCP connections, the lookup order is as follows: 115 * 5-tuple {src, dst, protocol, local port, remote port} 116 * lookup in ipcl_conn_fanout table. 117 * 3-tuple {dst, remote port, protocol} lookup in 118 * ipcl_bind_fanout table. 119 * 120 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 121 * remote port} lookup is done on ipcl_udp_fanout. Note that, 122 * these interfaces do not handle cases where a packets belongs 123 * to multiple UDP clients, which is handled in IP itself. 124 * 125 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 126 * determine which actual zone gets the segment. This is used only in a 127 * labeled environment. The matching rules are: 128 * 129 * - If it's not a multilevel port, then the label on the packet selects 130 * the zone. Unlabeled packets are delivered to the global zone. 131 * 132 * - If it's a multilevel port, then only the zone registered to receive 133 * packets on that port matches. 134 * 135 * Also, in a labeled environment, packet labels need to be checked. 
For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
166 * 167 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol, 168 * zoneid, ip_stack); 169 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex, 170 * zoneid, ip_stack); 171 * 172 * Lookup routine to find a listener with the tuple {lport, laddr, 173 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 174 * parameter interface index is also compared. 175 * 176 * void ipcl_walk(func, arg, ip_stack) 177 * 178 * Apply 'func' to every connection available. The 'func' is called as 179 * (*func)(connp, arg). The walk is non-atomic so connections may be 180 * created and destroyed during the walk. The CONN_CONDEMNED and 181 * CONN_INCIPIENT flags ensure that connections which are newly created 182 * or being destroyed are not selected by the walker. 183 * 184 * Table Updates 185 * ------------- 186 * 187 * int ipcl_conn_insert(connp); 188 * int ipcl_conn_insert_v4(connp); 189 * int ipcl_conn_insert_v6(connp); 190 * 191 * Insert 'connp' in the ipcl_conn_fanout. 192 * Arguments : 193 * connp conn_t to be inserted 194 * 195 * Return value : 196 * 0 if connp was inserted 197 * EADDRINUSE if the connection with the same tuple 198 * already exists. 199 * 200 * int ipcl_bind_insert(connp); 201 * int ipcl_bind_insert_v4(connp); 202 * int ipcl_bind_insert_v6(connp); 203 * 204 * Insert 'connp' in ipcl_bind_fanout. 205 * Arguments : 206 * connp conn_t to be inserted 207 * 208 * 209 * void ipcl_hash_remove(connp); 210 * 211 * Removes the 'connp' from the connection fanout table. 212 * 213 * Connection Creation/Destruction 214 * ------------------------------- 215 * 216 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 217 * 218 * Creates a new conn based on the type flag, inserts it into 219 * globalhash table. 220 * 221 * type: This flag determines the type of conn_t which needs to be 222 * created i.e., which kmem_cache it comes from. 
223 * IPCL_TCPCONN indicates a TCP connection 224 * IPCL_SCTPCONN indicates a SCTP connection 225 * IPCL_UDPCONN indicates a UDP conn_t. 226 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 227 * IPCL_RTSCONN indicates a RTS conn_t. 228 * IPCL_IPCCONN indicates all other connections. 229 * 230 * void ipcl_conn_destroy(connp) 231 * 232 * Destroys the connection state, removes it from the global 233 * connection hash table and frees its memory. 234 */ 235 236 #include <sys/types.h> 237 #include <sys/stream.h> 238 #include <sys/stropts.h> 239 #include <sys/sysmacros.h> 240 #include <sys/strsubr.h> 241 #include <sys/strsun.h> 242 #define _SUN_TPI_VERSION 2 243 #include <sys/ddi.h> 244 #include <sys/cmn_err.h> 245 #include <sys/debug.h> 246 247 #include <sys/systm.h> 248 #include <sys/param.h> 249 #include <sys/kmem.h> 250 #include <sys/isa_defs.h> 251 #include <inet/common.h> 252 #include <netinet/ip6.h> 253 #include <netinet/icmp6.h> 254 255 #include <inet/ip.h> 256 #include <inet/ip_if.h> 257 #include <inet/ip_ire.h> 258 #include <inet/ip6.h> 259 #include <inet/ip_ndp.h> 260 #include <inet/ip_impl.h> 261 #include <inet/udp_impl.h> 262 #include <inet/sctp_ip.h> 263 #include <inet/sctp/sctp_impl.h> 264 #include <inet/rawip_impl.h> 265 #include <inet/rts_impl.h> 266 #include <inet/iptun/iptun_impl.h> 267 268 #include <sys/cpuvar.h> 269 270 #include <inet/ipclassifier.h> 271 #include <inet/tcp.h> 272 #include <inet/ipsec_impl.h> 273 274 #include <sys/tsol/tnet.h> 275 #include <sys/sockio.h> 276 277 /* Old value for compatibility. Setable in /etc/system */ 278 uint_t tcp_conn_hash_size = 0; 279 280 /* New value. Zero means choose automatically. Setable in /etc/system */ 281 uint_t ipcl_conn_hash_size = 0; 282 uint_t ipcl_conn_hash_memfactor = 8192; 283 uint_t ipcl_conn_hash_maxsize = 82500; 284 285 /* bind/udp fanout table size */ 286 uint_t ipcl_bind_fanout_size = 512; 287 uint_t ipcl_udp_fanout_size = 16384; 288 289 /* Raw socket fanout size. Must be a power of 2. 
*/ 290 uint_t ipcl_raw_fanout_size = 256; 291 292 /* 293 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 294 * expect that most large deployments would have hundreds of tunnels, and 295 * thousands in the extreme case. 296 */ 297 uint_t ipcl_iptun_fanout_size = 6143; 298 299 /* 300 * Power of 2^N Primes useful for hashing for N of 0-28, 301 * these primes are the nearest prime <= 2^N - 2^(N-2). 302 */ 303 304 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 305 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 306 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 307 50331599, 100663291, 201326557, 0} 308 309 /* 310 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 311 * are aligned on cache lines. 312 */ 313 typedef union itc_s { 314 conn_t itc_conn; 315 char itcu_filler[CACHE_ALIGN(conn_s)]; 316 } itc_t; 317 318 struct kmem_cache *tcp_conn_cache; 319 struct kmem_cache *ip_conn_cache; 320 extern struct kmem_cache *sctp_conn_cache; 321 struct kmem_cache *udp_conn_cache; 322 struct kmem_cache *rawip_conn_cache; 323 struct kmem_cache *rts_conn_cache; 324 325 extern void tcp_timermp_free(tcp_t *); 326 extern mblk_t *tcp_timermp_alloc(int); 327 328 static int ip_conn_constructor(void *, void *, int); 329 static void ip_conn_destructor(void *, void *); 330 331 static int tcp_conn_constructor(void *, void *, int); 332 static void tcp_conn_destructor(void *, void *); 333 334 static int udp_conn_constructor(void *, void *, int); 335 static void udp_conn_destructor(void *, void *); 336 337 static int rawip_conn_constructor(void *, void *, int); 338 static void rawip_conn_destructor(void *, void *); 339 340 static int rts_conn_constructor(void *, void *, int); 341 static void rts_conn_destructor(void *, void *); 342 343 /* 344 * Global (for all stack instances) init routine 345 */ 346 void 347 ipcl_g_init(void) 348 { 349 ip_conn_cache = kmem_cache_create("ip_conn_cache", 350 sizeof (conn_t), 
CACHE_ALIGN_SIZE, 351 ip_conn_constructor, ip_conn_destructor, 352 NULL, NULL, NULL, 0); 353 354 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 355 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 356 tcp_conn_constructor, tcp_conn_destructor, 357 tcp_conn_reclaim, NULL, NULL, 0); 358 359 udp_conn_cache = kmem_cache_create("udp_conn_cache", 360 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 361 udp_conn_constructor, udp_conn_destructor, 362 NULL, NULL, NULL, 0); 363 364 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 365 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 366 rawip_conn_constructor, rawip_conn_destructor, 367 NULL, NULL, NULL, 0); 368 369 rts_conn_cache = kmem_cache_create("rts_conn_cache", 370 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 371 rts_conn_constructor, rts_conn_destructor, 372 NULL, NULL, NULL, 0); 373 } 374 375 /* 376 * ipclassifier intialization routine, sets up hash tables. 377 */ 378 void 379 ipcl_init(ip_stack_t *ipst) 380 { 381 int i; 382 int sizes[] = P2Ps(); 383 384 /* 385 * Calculate size of conn fanout table from /etc/system settings 386 */ 387 if (ipcl_conn_hash_size != 0) { 388 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 389 } else if (tcp_conn_hash_size != 0) { 390 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 391 } else { 392 extern pgcnt_t freemem; 393 394 ipst->ips_ipcl_conn_fanout_size = 395 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 396 397 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 398 ipst->ips_ipcl_conn_fanout_size = 399 ipcl_conn_hash_maxsize; 400 } 401 } 402 403 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 404 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 405 break; 406 } 407 } 408 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 409 /* Out of range, use the 2^16 value */ 410 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 411 } 412 413 /* Take values from /etc/system */ 414 ipst->ips_ipcl_bind_fanout_size = 
ipcl_bind_fanout_size; 415 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 416 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 417 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 418 419 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 420 421 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 422 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 423 424 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 425 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 426 MUTEX_DEFAULT, NULL); 427 } 428 429 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 430 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 431 432 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 433 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 434 MUTEX_DEFAULT, NULL); 435 } 436 437 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 438 sizeof (connf_t), KM_SLEEP); 439 for (i = 0; i < IPPROTO_MAX; i++) { 440 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 441 MUTEX_DEFAULT, NULL); 442 } 443 444 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 445 sizeof (connf_t), KM_SLEEP); 446 for (i = 0; i < IPPROTO_MAX; i++) { 447 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 448 MUTEX_DEFAULT, NULL); 449 } 450 451 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 452 mutex_init(&ipst->ips_rts_clients->connf_lock, 453 NULL, MUTEX_DEFAULT, NULL); 454 455 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 456 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 457 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 458 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 459 MUTEX_DEFAULT, NULL); 460 } 461 462 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 463 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 464 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 465 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 466 MUTEX_DEFAULT, NULL); 467 } 468 469 
ipst->ips_ipcl_raw_fanout = kmem_zalloc( 470 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 471 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 472 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 473 MUTEX_DEFAULT, NULL); 474 } 475 476 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 477 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 478 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 479 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 480 NULL, MUTEX_DEFAULT, NULL); 481 } 482 } 483 484 void 485 ipcl_g_destroy(void) 486 { 487 kmem_cache_destroy(ip_conn_cache); 488 kmem_cache_destroy(tcp_conn_cache); 489 kmem_cache_destroy(udp_conn_cache); 490 kmem_cache_destroy(rawip_conn_cache); 491 kmem_cache_destroy(rts_conn_cache); 492 } 493 494 /* 495 * All user-level and kernel use of the stack must be gone 496 * by now. 497 */ 498 void 499 ipcl_destroy(ip_stack_t *ipst) 500 { 501 int i; 502 503 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 504 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 505 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 506 } 507 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 508 sizeof (connf_t)); 509 ipst->ips_ipcl_conn_fanout = NULL; 510 511 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 512 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 513 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 514 } 515 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 516 sizeof (connf_t)); 517 ipst->ips_ipcl_bind_fanout = NULL; 518 519 for (i = 0; i < IPPROTO_MAX; i++) { 520 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 521 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 522 } 523 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 524 IPPROTO_MAX * sizeof (connf_t)); 525 ipst->ips_ipcl_proto_fanout_v4 = NULL; 526 527 for (i = 0; i < IPPROTO_MAX; i++) { 528 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head 
== NULL); 529 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 530 } 531 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 532 IPPROTO_MAX * sizeof (connf_t)); 533 ipst->ips_ipcl_proto_fanout_v6 = NULL; 534 535 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 536 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 537 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 538 } 539 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 540 sizeof (connf_t)); 541 ipst->ips_ipcl_udp_fanout = NULL; 542 543 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 544 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 545 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 546 } 547 kmem_free(ipst->ips_ipcl_iptun_fanout, 548 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 549 ipst->ips_ipcl_iptun_fanout = NULL; 550 551 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 552 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 553 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 554 } 555 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 556 sizeof (connf_t)); 557 ipst->ips_ipcl_raw_fanout = NULL; 558 559 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 560 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 561 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 562 } 563 kmem_free(ipst->ips_ipcl_globalhash_fanout, 564 sizeof (connf_t) * CONN_G_HASH_SIZE); 565 ipst->ips_ipcl_globalhash_fanout = NULL; 566 567 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 568 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 569 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 570 ipst->ips_rts_clients = NULL; 571 } 572 573 /* 574 * conn creation routine. initialize the conn, sets the reference 575 * and inserts it in the global hash table. 
576 */ 577 conn_t * 578 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 579 { 580 conn_t *connp; 581 struct kmem_cache *conn_cache; 582 583 switch (type) { 584 case IPCL_SCTPCONN: 585 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 586 return (NULL); 587 sctp_conn_init(connp); 588 netstack_hold(ns); 589 connp->conn_netstack = ns; 590 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 591 connp->conn_ixa->ixa_conn_id = (long)connp; 592 ipcl_globalhash_insert(connp); 593 return (connp); 594 595 case IPCL_TCPCONN: 596 conn_cache = tcp_conn_cache; 597 break; 598 599 case IPCL_UDPCONN: 600 conn_cache = udp_conn_cache; 601 break; 602 603 case IPCL_RAWIPCONN: 604 conn_cache = rawip_conn_cache; 605 break; 606 607 case IPCL_RTSCONN: 608 conn_cache = rts_conn_cache; 609 break; 610 611 case IPCL_IPCCONN: 612 conn_cache = ip_conn_cache; 613 break; 614 615 default: 616 conn_cache = NULL; 617 connp = NULL; 618 ASSERT(0); 619 } 620 621 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 622 return (NULL); 623 624 connp->conn_ref = 1; 625 netstack_hold(ns); 626 connp->conn_netstack = ns; 627 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 628 connp->conn_ixa->ixa_conn_id = (long)connp; 629 ipcl_globalhash_insert(connp); 630 return (connp); 631 } 632 633 void 634 ipcl_conn_destroy(conn_t *connp) 635 { 636 mblk_t *mp; 637 netstack_t *ns = connp->conn_netstack; 638 639 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 640 ASSERT(connp->conn_ref == 0); 641 ASSERT(connp->conn_ioctlref == 0); 642 643 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 644 645 if (connp->conn_cred != NULL) { 646 crfree(connp->conn_cred); 647 connp->conn_cred = NULL; 648 /* ixa_cred done in ipcl_conn_cleanup below */ 649 } 650 651 if (connp->conn_ht_iphc != NULL) { 652 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 653 connp->conn_ht_iphc = NULL; 654 connp->conn_ht_iphc_allocated = 0; 655 connp->conn_ht_iphc_len = 0; 656 connp->conn_ht_ulp = NULL; 657 connp->conn_ht_ulp_len = 0; 
658 } 659 ip_pkt_free(&connp->conn_xmit_ipp); 660 661 ipcl_globalhash_remove(connp); 662 663 if (connp->conn_latch != NULL) { 664 IPLATCH_REFRELE(connp->conn_latch); 665 connp->conn_latch = NULL; 666 } 667 if (connp->conn_latch_in_policy != NULL) { 668 IPPOL_REFRELE(connp->conn_latch_in_policy); 669 connp->conn_latch_in_policy = NULL; 670 } 671 if (connp->conn_latch_in_action != NULL) { 672 IPACT_REFRELE(connp->conn_latch_in_action); 673 connp->conn_latch_in_action = NULL; 674 } 675 if (connp->conn_policy != NULL) { 676 IPPH_REFRELE(connp->conn_policy, ns); 677 connp->conn_policy = NULL; 678 } 679 680 if (connp->conn_ipsec_opt_mp != NULL) { 681 freemsg(connp->conn_ipsec_opt_mp); 682 connp->conn_ipsec_opt_mp = NULL; 683 } 684 685 if (connp->conn_flags & IPCL_TCPCONN) { 686 tcp_t *tcp = connp->conn_tcp; 687 688 tcp_free(tcp); 689 mp = tcp->tcp_timercache; 690 691 tcp->tcp_tcps = NULL; 692 693 /* 694 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 695 * the mblk. 696 */ 697 if (tcp->tcp_rsrv_mp != NULL) { 698 freeb(tcp->tcp_rsrv_mp); 699 tcp->tcp_rsrv_mp = NULL; 700 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 701 } 702 703 ipcl_conn_cleanup(connp); 704 connp->conn_flags = IPCL_TCPCONN; 705 if (ns != NULL) { 706 ASSERT(tcp->tcp_tcps == NULL); 707 connp->conn_netstack = NULL; 708 connp->conn_ixa->ixa_ipst = NULL; 709 netstack_rele(ns); 710 } 711 712 bzero(tcp, sizeof (tcp_t)); 713 714 tcp->tcp_timercache = mp; 715 tcp->tcp_connp = connp; 716 kmem_cache_free(tcp_conn_cache, connp); 717 return; 718 } 719 720 if (connp->conn_flags & IPCL_SCTPCONN) { 721 ASSERT(ns != NULL); 722 sctp_free(connp); 723 return; 724 } 725 726 ipcl_conn_cleanup(connp); 727 if (ns != NULL) { 728 connp->conn_netstack = NULL; 729 connp->conn_ixa->ixa_ipst = NULL; 730 netstack_rele(ns); 731 } 732 733 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 734 if (connp->conn_flags & IPCL_UDPCONN) { 735 connp->conn_flags = IPCL_UDPCONN; 736 kmem_cache_free(udp_conn_cache, connp); 737 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 738 connp->conn_flags = IPCL_RAWIPCONN; 739 connp->conn_proto = IPPROTO_ICMP; 740 connp->conn_ixa->ixa_protocol = connp->conn_proto; 741 kmem_cache_free(rawip_conn_cache, connp); 742 } else if (connp->conn_flags & IPCL_RTSCONN) { 743 connp->conn_flags = IPCL_RTSCONN; 744 kmem_cache_free(rts_conn_cache, connp); 745 } else { 746 connp->conn_flags = IPCL_IPCCONN; 747 ASSERT(connp->conn_flags & IPCL_IPCCONN); 748 ASSERT(connp->conn_priv == NULL); 749 kmem_cache_free(ip_conn_cache, connp); 750 } 751 } 752 753 /* 754 * Running in cluster mode - deregister listener information 755 */ 756 static void 757 ipcl_conn_unlisten(conn_t *connp) 758 { 759 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 760 ASSERT(connp->conn_lport != 0); 761 762 if (cl_inet_unlisten != NULL) { 763 sa_family_t addr_family; 764 uint8_t *laddrp; 765 766 if (connp->conn_ipversion == IPV6_VERSION) { 767 addr_family = AF_INET6; 768 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 769 } else { 770 addr_family = AF_INET; 771 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 772 } 773 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 774 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 775 } 776 connp->conn_flags &= ~IPCL_CL_LISTENER; 777 } 778 779 /* 780 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 781 * which table the conn belonged to). So for debugging we can see which hash 782 * table this connection was in. 
783 */ 784 #define IPCL_HASH_REMOVE(connp) { \ 785 connf_t *connfp = (connp)->conn_fanout; \ 786 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 787 if (connfp != NULL) { \ 788 mutex_enter(&connfp->connf_lock); \ 789 if ((connp)->conn_next != NULL) \ 790 (connp)->conn_next->conn_prev = \ 791 (connp)->conn_prev; \ 792 if ((connp)->conn_prev != NULL) \ 793 (connp)->conn_prev->conn_next = \ 794 (connp)->conn_next; \ 795 else \ 796 connfp->connf_head = (connp)->conn_next; \ 797 (connp)->conn_fanout = NULL; \ 798 (connp)->conn_next = NULL; \ 799 (connp)->conn_prev = NULL; \ 800 (connp)->conn_flags |= IPCL_REMOVED; \ 801 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 802 ipcl_conn_unlisten((connp)); \ 803 CONN_DEC_REF((connp)); \ 804 mutex_exit(&connfp->connf_lock); \ 805 } \ 806 } 807 808 void 809 ipcl_hash_remove(conn_t *connp) 810 { 811 uint8_t protocol = connp->conn_proto; 812 813 IPCL_HASH_REMOVE(connp); 814 if (protocol == IPPROTO_RSVP) 815 ill_set_inputfn_all(connp->conn_netstack->netstack_ip); 816 } 817 818 /* 819 * The whole purpose of this function is allow removal of 820 * a conn_t from the connected hash for timewait reclaim. 821 * This is essentially a TW reclaim fastpath where timewait 822 * collector checks under fanout lock (so no one else can 823 * get access to the conn_t) that refcnt is 2 i.e. one for 824 * TCP and one for the classifier hash list. If ref count 825 * is indeed 2, we can just remove the conn under lock and 826 * avoid cleaning up the conn under squeue. This gives us 827 * improved performance. 
828 */ 829 void 830 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 831 { 832 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 833 ASSERT(MUTEX_HELD(&connp->conn_lock)); 834 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 835 836 if ((connp)->conn_next != NULL) { 837 (connp)->conn_next->conn_prev = (connp)->conn_prev; 838 } 839 if ((connp)->conn_prev != NULL) { 840 (connp)->conn_prev->conn_next = (connp)->conn_next; 841 } else { 842 connfp->connf_head = (connp)->conn_next; 843 } 844 (connp)->conn_fanout = NULL; 845 (connp)->conn_next = NULL; 846 (connp)->conn_prev = NULL; 847 (connp)->conn_flags |= IPCL_REMOVED; 848 ASSERT((connp)->conn_ref == 2); 849 (connp)->conn_ref--; 850 } 851 852 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 853 ASSERT((connp)->conn_fanout == NULL); \ 854 ASSERT((connp)->conn_next == NULL); \ 855 ASSERT((connp)->conn_prev == NULL); \ 856 if ((connfp)->connf_head != NULL) { \ 857 (connfp)->connf_head->conn_prev = (connp); \ 858 (connp)->conn_next = (connfp)->connf_head; \ 859 } \ 860 (connp)->conn_fanout = (connfp); \ 861 (connfp)->connf_head = (connp); \ 862 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 863 IPCL_CONNECTED; \ 864 CONN_INC_REF(connp); \ 865 } 866 867 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 868 IPCL_HASH_REMOVE((connp)); \ 869 mutex_enter(&(connfp)->connf_lock); \ 870 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 871 mutex_exit(&(connfp)->connf_lock); \ 872 } 873 874 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 875 conn_t *pconnp = NULL, *nconnp; \ 876 IPCL_HASH_REMOVE((connp)); \ 877 mutex_enter(&(connfp)->connf_lock); \ 878 nconnp = (connfp)->connf_head; \ 879 while (nconnp != NULL && \ 880 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 881 pconnp = nconnp; \ 882 nconnp = nconnp->conn_next; \ 883 } \ 884 if (pconnp != NULL) { \ 885 pconnp->conn_next = (connp); \ 886 (connp)->conn_prev = pconnp; \ 887 } else { \ 888 (connfp)->connf_head = (connp); \ 889 } \ 890 if 
(nconnp != NULL) { \ 891 (connp)->conn_next = nconnp; \ 892 nconnp->conn_prev = (connp); \ 893 } \ 894 (connp)->conn_fanout = (connfp); \ 895 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 896 IPCL_BOUND; \ 897 CONN_INC_REF(connp); \ 898 mutex_exit(&(connfp)->connf_lock); \ 899 } 900 901 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 902 conn_t **list, *prev, *next; \ 903 boolean_t isv4mapped = \ 904 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 905 IPCL_HASH_REMOVE((connp)); \ 906 mutex_enter(&(connfp)->connf_lock); \ 907 list = &(connfp)->connf_head; \ 908 prev = NULL; \ 909 while ((next = *list) != NULL) { \ 910 if (isv4mapped && \ 911 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 912 connp->conn_zoneid == next->conn_zoneid) { \ 913 (connp)->conn_next = next; \ 914 if (prev != NULL) \ 915 prev = next->conn_prev; \ 916 next->conn_prev = (connp); \ 917 break; \ 918 } \ 919 list = &next->conn_next; \ 920 prev = next; \ 921 } \ 922 (connp)->conn_prev = prev; \ 923 *list = (connp); \ 924 (connp)->conn_fanout = (connfp); \ 925 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 926 IPCL_BOUND; \ 927 CONN_INC_REF((connp)); \ 928 mutex_exit(&(connfp)->connf_lock); \ 929 } 930 931 void 932 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 933 { 934 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 935 } 936 937 /* 938 * Because the classifier is used to classify inbound packets, the destination 939 * address is meant to be our local tunnel address (tunnel source), and the 940 * source the remote tunnel address (tunnel destination). 941 * 942 * Note that conn_proto can't be used for fanout since the upper protocol 943 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
944 */ 945 conn_t * 946 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 947 { 948 connf_t *connfp; 949 conn_t *connp; 950 951 /* first look for IPv4 tunnel links */ 952 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 953 mutex_enter(&connfp->connf_lock); 954 for (connp = connfp->connf_head; connp != NULL; 955 connp = connp->conn_next) { 956 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 957 break; 958 } 959 if (connp != NULL) 960 goto done; 961 962 mutex_exit(&connfp->connf_lock); 963 964 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 965 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 966 INADDR_ANY)]; 967 mutex_enter(&connfp->connf_lock); 968 for (connp = connfp->connf_head; connp != NULL; 969 connp = connp->conn_next) { 970 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 971 break; 972 } 973 done: 974 if (connp != NULL) 975 CONN_INC_REF(connp); 976 mutex_exit(&connfp->connf_lock); 977 return (connp); 978 } 979 980 conn_t * 981 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 982 { 983 connf_t *connfp; 984 conn_t *connp; 985 986 /* Look for an IPv6 tunnel link */ 987 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 988 mutex_enter(&connfp->connf_lock); 989 for (connp = connfp->connf_head; connp != NULL; 990 connp = connp->conn_next) { 991 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 992 CONN_INC_REF(connp); 993 break; 994 } 995 } 996 mutex_exit(&connfp->connf_lock); 997 return (connp); 998 } 999 1000 /* 1001 * This function is used only for inserting SCTP raw socket now. 1002 * This may change later. 1003 * 1004 * Note that only one raw socket can be bound to a port. The param 1005 * lport is in network byte order. 
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict if same port/zone/family and either side is
		 * bound to a wildcard address or both are bound to the
		 * same specific address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Pick the insertion style matching how fully the conn's addresses
	 * are specified: connected > bound > wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}

/*
 * Insert an IPv4 tunnel conn into the iptun fanout.  Fails with EADDRINUSE
 * if a tunnel is already bound to the same (local, remote) address pair.
 */
static int
ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	ipaddr_t laddr = connp->conn_laddr_v4;
	ipaddr_t faddr = connp->conn_faddr_v4;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	/* _LOCKED variant: the bucket lock is already held here. */
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/*
 * IPv6 counterpart of ipcl_iptun_hash_insert.
 */
static int
ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	in6_addr_t *laddr = &connp->conn_laddr_v6;
	in6_addr_t *faddr = &connp->conn_faddr_v6;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	/* _LOCKED variant: the bucket lock is already held here. */
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/*
 * Check for a MAC exemption conflict on a labeled system. Note that for
 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
 * transport layer. This check is for binding all other protocols.
 *
 * Returns true if there's a conflict.
 */
static boolean_t
check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t *tconn;

	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
	mutex_enter(&connfp->connf_lock);
	for (tconn = connfp->connf_head; tconn != NULL;
	    tconn = tconn->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket */
		if (connp->conn_family != tconn->conn_family)
			continue;
		/* If neither is exempt, then there's no conflict */
		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
			continue;
		/* We are only concerned about sockets for a different zone */
		if (connp->conn_zoneid == tconn->conn_zoneid)
			continue;
		/* If both are bound to different specific addrs, ok */
		if (connp->conn_laddr_v4 != INADDR_ANY &&
		    tconn->conn_laddr_v4 != INADDR_ANY &&
		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
			continue;
		/* These two conflict; fail */
		break;
	}
	mutex_exit(&connfp->connf_lock);
	/* Non-NULL tconn means the loop broke out on a conflict. */
	return (tconn != NULL);
}

/*
 * IPv6 counterpart of check_exempt_conflict_v4; same rules, but wildcard
 * means the IPv6 unspecified address.
 */
static boolean_t
check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t *tconn;

	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
	mutex_enter(&connfp->connf_lock);
	for (tconn = connfp->connf_head; tconn != NULL;
	    tconn = tconn->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket */
		if (connp->conn_family != tconn->conn_family)
			continue;
		/* If neither is exempt, then there's no conflict */
		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
			continue;
		/* We are only concerned about sockets for a different zone */
		if (connp->conn_zoneid == tconn->conn_zoneid)
			continue;
		/* If both are bound to different addrs, ok */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
		    &tconn->conn_laddr_v6))
			continue;
		/* These two conflict; fail */
		break;
	}
	mutex_exit(&connfp->connf_lock);
	return (tconn != NULL);
}

/*
 * (v4, v6) bind hash insertion routines
 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
 */

int
ipcl_bind_insert(conn_t *connp)
{
	if (connp->conn_ipversion == IPV6_VERSION)
		return (ipcl_bind_insert_v6(connp));
	else
		return (ipcl_bind_insert_v4(connp));
}

/*
 * Insert a locally-bound IPv4 conn into the appropriate fanout (udp, bind,
 * raw/proto, or iptun) based on its protocol.  Returns 0 or an errno
 * (EADDRINUSE on labeled-system MAC conflicts or duplicate tunnels).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the clustering subsystem, if registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}

/*
 * IPv6 counterpart of ipcl_bind_insert_v4.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * Notify the clustering subsystem, if registered; an AF_INET6
		 * socket may still be bound to an IPv4 address.
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}

/*
 * ipcl_conn_hash insertion routines.
 * The caller has already set conn_proto and the addresses/ports in the conn_t.
 */

int
ipcl_conn_insert(conn_t *connp)
{
	if (connp->conn_ipversion == IPV6_VERSION)
		return (ipcl_conn_insert_v6(connp));
	else
		return (ipcl_conn_insert_v4(connp));
}

/*
 * Insert a fully-specified (connected) IPv4 conn into the appropriate
 * fanout.  For TCP the 5-tuple (plus zone) must be unique; returns
 * EADDRINUSE if a matching conn already exists.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 * (Bucket lock must be dropped before the remove,
			 * which takes the old bucket's lock.)
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

/*
 * IPv6 counterpart of ipcl_conn_insert_v4; additionally honours a
 * conn bound to a specific interface (conn_bound_if) in the duplicate
 * tuple check.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;
	uint_t		ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied. Labels are also checked as described
 * above.
 * If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/* Ports sit at the same offset for both TCP and UDP headers. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No fully-bound match; fall back to the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}

/*
 * v6 packet classifying function; same structure and rules as
 * ipcl_classify_v4 above.  Returns the conn with a reference held,
 * NULL otherwise.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t		*tcpha;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	if (ira->ira_flags & IRAF_IS_IPV4) {
		return (ipcl_classify_v4(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	} else {
		return (ipcl_classify_v6(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	}
}

/*
 * Only used to classify SCTP RAW sockets.
 *
 * First tries an exact (connected or bound) match in the raw fanout bucket
 * for lport; failing that, falls back to the wildcard (port 0) bucket.
 * Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Foreign address specified: full 5-tuple match. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Otherwise only local address and port must match. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Whichever bucket we matched in, its lock is still held here. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * kmem cache constructor for a TCP conn: the buffer is an itc_t holding a
 * conn_t immediately followed by a tcp_t.  Initializes locks/cvs, allocates
 * the timer mblk and transmit attributes, and cross-links conn and tcp.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor counterpart of tcp_conn_constructor.
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for a plain IP conn (no per-transport state).
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor counterpart of ip_conn_constructor.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for a UDP conn: conn_t followed by udp_t.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor counterpart of udp_conn_constructor.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for a raw IP conn: conn_t followed by icmp_t.
 * conn_proto defaults to IPPROTO_ICMP.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor counterpart of rawip_conn_constructor.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for a routing-socket conn: conn_t followed by rts_t.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

2157 /* ARGSUSED */ 2158 static void 2159 rts_conn_destructor(void *buf, void *cdrarg) 2160 { 2161 itc_t *itc = (itc_t *)buf; 2162 conn_t *connp = &itc->itc_conn; 2163 rts_t *rts = (rts_t *)&itc[1]; 2164 2165 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2166 ASSERT(rts->rts_connp == connp); 2167 ASSERT(connp->conn_rts == rts); 2168 mutex_destroy(&connp->conn_lock); 2169 cv_destroy(&connp->conn_cv); 2170 rw_destroy(&connp->conn_ilg_lock); 2171 2172 /* Can be NULL if constructor failed */ 2173 if (connp->conn_ixa != NULL) { 2174 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2175 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2176 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2177 ixa_refrele(connp->conn_ixa); 2178 } 2179 } 2180 2181 /* 2182 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2183 * in the conn_t. 2184 * 2185 * Below we list all the pointers in the conn_t as a documentation aid. 2186 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2187 * If you add any pointers to the conn_t please add an ASSERT here 2188 * and #ifdef it out if it can't be actually asserted to be NULL. 2189 * In any case, we bzero most of the conn_t at the end of the function. 
2190 */ 2191 void 2192 ipcl_conn_cleanup(conn_t *connp) 2193 { 2194 ip_xmit_attr_t *ixa; 2195 2196 ASSERT(connp->conn_latch == NULL); 2197 ASSERT(connp->conn_latch_in_policy == NULL); 2198 ASSERT(connp->conn_latch_in_action == NULL); 2199 #ifdef notdef 2200 ASSERT(connp->conn_rq == NULL); 2201 ASSERT(connp->conn_wq == NULL); 2202 #endif 2203 ASSERT(connp->conn_cred == NULL); 2204 ASSERT(connp->conn_g_fanout == NULL); 2205 ASSERT(connp->conn_g_next == NULL); 2206 ASSERT(connp->conn_g_prev == NULL); 2207 ASSERT(connp->conn_policy == NULL); 2208 ASSERT(connp->conn_fanout == NULL); 2209 ASSERT(connp->conn_next == NULL); 2210 ASSERT(connp->conn_prev == NULL); 2211 ASSERT(connp->conn_oper_pending_ill == NULL); 2212 ASSERT(connp->conn_ilg == NULL); 2213 ASSERT(connp->conn_drain_next == NULL); 2214 ASSERT(connp->conn_drain_prev == NULL); 2215 #ifdef notdef 2216 /* conn_idl is not cleared when removed from idl list */ 2217 ASSERT(connp->conn_idl == NULL); 2218 #endif 2219 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2220 #ifdef notdef 2221 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ 2222 ASSERT(connp->conn_netstack == NULL); 2223 #endif 2224 2225 ASSERT(connp->conn_helper_info == NULL); 2226 ASSERT(connp->conn_ixa != NULL); 2227 ixa = connp->conn_ixa; 2228 ASSERT(ixa->ixa_refcnt == 1); 2229 /* Need to preserve ixa_protocol */ 2230 ixa_cleanup(ixa); 2231 ixa->ixa_flags = 0; 2232 2233 /* Clear out the conn_t fields that are not preserved */ 2234 bzero(&connp->conn_start_clr, 2235 sizeof (conn_t) - 2236 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2237 } 2238 2239 /* 2240 * All conns are inserted in a global multi-list for the benefit of 2241 * walkers. The walk is guaranteed to walk all open conns at the time 2242 * of the start of the walk exactly once. This property is needed to 2243 * achieve some cleanups during unplumb of interfaces. This is achieved 2244 * as follows. 
2245 * 2246 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2247 * call the insert and delete functions below at creation and deletion 2248 * time respectively. The conn never moves or changes its position in this 2249 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2250 * won't increase due to walkers, once the conn deletion has started. Note 2251 * that we can't remove the conn from the global list and then wait for 2252 * the refcnt to drop to zero, since walkers would then see a truncated 2253 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2254 * conns until ip_open is ready to make them globally visible. 2255 * The global round robin multi-list locks are held only to get the 2256 * next member/insertion/deletion and contention should be negligible 2257 * if the multi-list is much greater than the number of cpus. 2258 */ 2259 void 2260 ipcl_globalhash_insert(conn_t *connp) 2261 { 2262 int index; 2263 struct connf_s *connfp; 2264 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2265 2266 /* 2267 * No need for atomic here. Approximate even distribution 2268 * in the global lists is sufficient. 2269 */ 2270 ipst->ips_conn_g_index++; 2271 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2272 2273 connp->conn_g_prev = NULL; 2274 /* 2275 * Mark as INCIPIENT, so that walkers will ignore this 2276 * for now, till ip_open is ready to make it visible globally. 
2277 */ 2278 connp->conn_state_flags |= CONN_INCIPIENT; 2279 2280 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2281 /* Insert at the head of the list */ 2282 mutex_enter(&connfp->connf_lock); 2283 connp->conn_g_next = connfp->connf_head; 2284 if (connp->conn_g_next != NULL) 2285 connp->conn_g_next->conn_g_prev = connp; 2286 connfp->connf_head = connp; 2287 2288 /* The fanout bucket this conn points to */ 2289 connp->conn_g_fanout = connfp; 2290 2291 mutex_exit(&connfp->connf_lock); 2292 } 2293 2294 void 2295 ipcl_globalhash_remove(conn_t *connp) 2296 { 2297 struct connf_s *connfp; 2298 2299 /* 2300 * We were never inserted in the global multi list. 2301 * IPCL_NONE variety is never inserted in the global multilist 2302 * since it is presumed to not need any cleanup and is transient. 2303 */ 2304 if (connp->conn_g_fanout == NULL) 2305 return; 2306 2307 connfp = connp->conn_g_fanout; 2308 mutex_enter(&connfp->connf_lock); 2309 if (connp->conn_g_prev != NULL) 2310 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2311 else 2312 connfp->connf_head = connp->conn_g_next; 2313 if (connp->conn_g_next != NULL) 2314 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2315 mutex_exit(&connfp->connf_lock); 2316 2317 /* Better to stumble on a null pointer than to corrupt memory */ 2318 connp->conn_g_next = NULL; 2319 connp->conn_g_prev = NULL; 2320 connp->conn_g_fanout = NULL; 2321 } 2322 2323 /* 2324 * Walk the list of all conn_t's in the system, calling the function provided 2325 * With the specified argument for each. 2326 * Applies to both IPv4 and IPv6. 2327 * 2328 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2329 * conn_oper_pending_ill). To guard against stale pointers 2330 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2331 * unplumbed or removed. New conn_t's that are created while we are walking 2332 * may be missed by this walk, because they are not necessarily inserted 2333 * at the tail of the list. 
They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int i;
	conn_t *connp;
	conn_t *prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns not yet visible or being destroyed */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			/*
			 * Drop the bucket lock while calling func; the
			 * reference taken above keeps connp (and hence its
			 * conn_g_next linkage) valid across the drop.
			 * The ref on the previous conn is released only
			 * after reacquiring the lock, hand-over-hand.
			 */
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reversed: our fport is the peer's lport and vice versa */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed: our fport is the peer's lport and vice versa */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t *tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* conn_bound_if == 0 means not bound to any interface */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 * Returns with a conn reference held (caller must CONN_DEC_REF), or NULL.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* tcp_listener == NULL selects the listener itself */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 * Returns with a conn reference held (caller must CONN_DEC_REF), or NULL.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp = NULL;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* conn_bound_if == 0 means not bound to any interface */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t *next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL connp means start from the head of the bucket */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Release the iterator's hold on the previous conn, if any */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the circular trace-buffer index, wrapping at the end */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/*
 * Record a refrele event; the stack capture is identical to
 * conn_trace_ref — both kinds of events share the same circular buffer.
 */
int
conn_untrace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif

/*
 * Fill in *sie with socket information (flags, inode and device, taken
 * from the vnode underlying the socket) for connp. Returns sie on
 * success, or NULL if the conn is closing or no vnode can be obtained.
 * Callers must already hold a reference on connp.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;
	sock_upcalls_t *upcalls;
	sock_upper_handle_t upper_handle;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	/*
	 * Continue to hold conn_lock because we don't want to race with an
	 * in-progress close, which will have set-to-NULL (and destroyed
	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
	 *
	 * There is still a race with an in-progress OPEN, however, where
	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
	 * codepaths) WITHOUT conn_lock being held. We address that race
	 * HERE, however, given that both are going from NULL to non-NULL,
	 * if we lose the race, we don't get any data for the in-progress-OPEN
	 * socket.
	 */

	upcalls = connp->conn_upcalls;
	upper_handle = connp->conn_upper_handle;
	/* Check BOTH for non-NULL before attempting an upcall. */
	if (upper_handle != NULL && upcalls != NULL) {
		/* su_get_vnode() returns one with VN_HOLD() already done. */
		vn = upcalls->su_get_vnode(upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		/* STREAMS socket: get the vnode from the stream head */
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	mutex_exit(&connp->conn_lock);

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}