1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * IP PACKET CLASSIFIER 27 * 28 * The IP packet classifier provides mapping between IP packets and persistent 29 * connection state for connection-oriented protocols. It also provides 30 * interface for managing connection states. 31 * 32 * The connection state is kept in conn_t data structure and contains, among 33 * other things: 34 * 35 * o local/remote address and ports 36 * o Transport protocol 37 * o squeue for the connection (for TCP only) 38 * o reference counter 39 * o Connection state 40 * o hash table linkage 41 * o interface/ire information 42 * o credentials 43 * o ipsec policy 44 * o send and receive functions. 45 * o mutex lock. 46 * 47 * Connections use a reference counting scheme. They are freed when the 48 * reference counter drops to zero. 
A reference is incremented when connection 49 * is placed in a list or table, when incoming packet for the connection arrives 50 * and when connection is processed via squeue (squeue processing may be 51 * asynchronous and the reference protects the connection from being destroyed 52 * before its processing is finished). 53 * 54 * conn_recv is used to pass up packets to the ULP. 55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 56 * a listener, and changes to tcp_input_listener as the listener has picked a 57 * good squeue. For other cases it is set to tcp_input_data. 58 * 59 * conn_recvicmp is used to pass up ICMP errors to the ULP. 60 * 61 * Classifier uses several hash tables: 62 * 63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 64 * ipcl_bind_fanout: contains all connections in BOUND state 65 * ipcl_proto_fanout: IPv4 protocol fanout 66 * ipcl_proto_fanout_v6: IPv6 protocol fanout 67 * ipcl_udp_fanout: contains all UDP connections 68 * ipcl_iptun_fanout: contains all IP tunnel connections 69 * ipcl_globalhash_fanout: contains all connections 70 * 71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 72 * which need to view all existing connections. 73 * 74 * All tables are protected by per-bucket locks. When both per-bucket lock and 75 * connection lock need to be held, the per-bucket lock should be acquired 76 * first, followed by the connection lock. 77 * 78 * All functions doing search in one of these tables increment a reference 79 * counter on the connection found (if any). This reference should be dropped 80 * when the caller has finished processing the connection. 81 * 82 * 83 * INTERFACES: 84 * =========== 85 * 86 * Connection Lookup: 87 * ------------------ 88 * 89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 91 * 92 * Finds connection for an incoming IPv4 or IPv6 packet. 
Returns NULL if 93 * it can't find any associated connection. If the connection is found, its 94 * reference counter is incremented. 95 * 96 * mp: mblock, containing packet header. The full header should fit 97 * into a single mblock. It should also contain at least full IP 98 * and TCP or UDP header. 99 * 100 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 101 * 102 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 103 * the packet. 104 * 105 * ira->ira_zoneid: The zone in which the returned connection must be; the 106 * zoneid corresponding to the ire_zoneid on the IRE located for 107 * the packet's destination address. 108 * 109 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and 110 * IRAF_TX_SHARED_ADDR flags 111 * 112 * For TCP connections, the lookup order is as follows: 113 * 5-tuple {src, dst, protocol, local port, remote port} 114 * lookup in ipcl_conn_fanout table. 115 * 3-tuple {dst, remote port, protocol} lookup in 116 * ipcl_bind_fanout table. 117 * 118 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 119 * remote port} lookup is done on ipcl_udp_fanout. Note that, 120 * these interfaces do not handle cases where a packet belongs 121 * to multiple UDP clients, which is handled in IP itself. 122 * 123 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 124 * determine which actual zone gets the segment. This is used only in a 125 * labeled environment. The matching rules are: 126 * 127 * - If it's not a multilevel port, then the label on the packet selects 128 * the zone. Unlabeled packets are delivered to the global zone. 129 * 130 * - If it's a multilevel port, then only the zone registered to receive 131 * packets on that port matches. 132 * 133 * Also, in a labeled environment, packet labels need to be checked.
For fully 134 * bound TCP connections, we can assume that the packet label was checked 135 * during connection establishment, and doesn't need to be checked on each 136 * packet. For others, though, we need to check for strict equality or, for 137 * multilevel ports, membership in the range or set. This part currently does 138 * a tnrh lookup on each packet, but could be optimized to use cached results 139 * if that were necessary. (SCTP doesn't come through here, but if it did, 140 * we would apply the same rules as TCP.) 141 * 142 * An implication of the above is that fully-bound TCP sockets must always use 143 * distinct 4-tuples; they can't be discriminated by label alone. 144 * 145 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets, 146 * as there's no connection set-up handshake and no shared state. 147 * 148 * Labels on looped-back packets within a single zone do not need to be 149 * checked, as all processes in the same zone have the same label. 150 * 151 * Finally, for unlabeled packets received by a labeled system, special rules 152 * apply. We consider only the MLP if there is one. Otherwise, we prefer a 153 * socket in the zone whose label matches the default label of the sender, if 154 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the 155 * receiver's label must dominate the sender's default label. 156 * 157 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack); 158 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, 159 * ip_stack); 160 * 161 * Lookup routine to find an exact match for {src, dst, local port, 162 * remote port} for TCP connections in ipcl_conn_fanout. The address and 163 * ports are read from the IP and TCP header respectively.
164 * 165 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol, 166 * zoneid, ip_stack); 167 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex, 168 * zoneid, ip_stack); 169 * 170 * Lookup routine to find a listener with the tuple {lport, laddr, 171 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 172 * parameter interface index is also compared. 173 * 174 * void ipcl_walk(func, arg, ip_stack) 175 * 176 * Apply 'func' to every connection available. The 'func' is called as 177 * (*func)(connp, arg). The walk is non-atomic so connections may be 178 * created and destroyed during the walk. The CONN_CONDEMNED and 179 * CONN_INCIPIENT flags ensure that connections which are newly created 180 * or being destroyed are not selected by the walker. 181 * 182 * Table Updates 183 * ------------- 184 * 185 * int ipcl_conn_insert(connp); 186 * int ipcl_conn_insert_v4(connp); 187 * int ipcl_conn_insert_v6(connp); 188 * 189 * Insert 'connp' in the ipcl_conn_fanout. 190 * Arguments : 191 * connp conn_t to be inserted 192 * 193 * Return value : 194 * 0 if connp was inserted 195 * EADDRINUSE if the connection with the same tuple 196 * already exists. 197 * 198 * int ipcl_bind_insert(connp); 199 * int ipcl_bind_insert_v4(connp); 200 * int ipcl_bind_insert_v6(connp); 201 * 202 * Insert 'connp' in ipcl_bind_fanout. 203 * Arguments : 204 * connp conn_t to be inserted 205 * 206 * 207 * void ipcl_hash_remove(connp); 208 * 209 * Removes the 'connp' from the connection fanout table. 210 * 211 * Connection Creation/Destruction 212 * ------------------------------- 213 * 214 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 215 * 216 * Creates a new conn based on the type flag, inserts it into 217 * globalhash table. 218 * 219 * type: This flag determines the type of conn_t which needs to be 220 * created i.e., which kmem_cache it comes from.
221 * IPCL_TCPCONN indicates a TCP connection 222 * IPCL_SCTPCONN indicates a SCTP connection 223 * IPCL_UDPCONN indicates a UDP conn_t. 224 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 225 * IPCL_RTSCONN indicates a RTS conn_t. 226 * IPCL_IPCCONN indicates all other connections. 227 * 228 * void ipcl_conn_destroy(connp) 229 * 230 * Destroys the connection state, removes it from the global 231 * connection hash table and frees its memory. 232 */ 233 234 #include <sys/types.h> 235 #include <sys/stream.h> 236 #include <sys/stropts.h> 237 #include <sys/sysmacros.h> 238 #include <sys/strsubr.h> 239 #include <sys/strsun.h> 240 #define _SUN_TPI_VERSION 2 241 #include <sys/ddi.h> 242 #include <sys/cmn_err.h> 243 #include <sys/debug.h> 244 245 #include <sys/systm.h> 246 #include <sys/param.h> 247 #include <sys/kmem.h> 248 #include <sys/isa_defs.h> 249 #include <inet/common.h> 250 #include <netinet/ip6.h> 251 #include <netinet/icmp6.h> 252 253 #include <inet/ip.h> 254 #include <inet/ip_if.h> 255 #include <inet/ip_ire.h> 256 #include <inet/ip6.h> 257 #include <inet/ip_ndp.h> 258 #include <inet/ip_impl.h> 259 #include <inet/udp_impl.h> 260 #include <inet/sctp_ip.h> 261 #include <inet/sctp/sctp_impl.h> 262 #include <inet/rawip_impl.h> 263 #include <inet/rts_impl.h> 264 #include <inet/iptun/iptun_impl.h> 265 266 #include <sys/cpuvar.h> 267 268 #include <inet/ipclassifier.h> 269 #include <inet/tcp.h> 270 #include <inet/ipsec_impl.h> 271 272 #include <sys/tsol/tnet.h> 273 #include <sys/sockio.h> 274 275 /* Old value for compatibility. Setable in /etc/system */ 276 uint_t tcp_conn_hash_size = 0; 277 278 /* New value. Zero means choose automatically. Setable in /etc/system */ 279 uint_t ipcl_conn_hash_size = 0; 280 uint_t ipcl_conn_hash_memfactor = 8192; 281 uint_t ipcl_conn_hash_maxsize = 82500; 282 283 /* bind/udp fanout table size */ 284 uint_t ipcl_bind_fanout_size = 512; 285 uint_t ipcl_udp_fanout_size = 16384; 286 287 /* Raw socket fanout size. Must be a power of 2. 
*/ 288 uint_t ipcl_raw_fanout_size = 256; 289 290 /* 291 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 292 * expect that most large deployments would have hundreds of tunnels, and 293 * thousands in the extreme case. 294 */ 295 uint_t ipcl_iptun_fanout_size = 6143; 296 297 /* 298 * Power of 2^N Primes useful for hashing for N of 0-28, 299 * these primes are the nearest prime <= 2^N - 2^(N-2). 300 */ 301 302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 303 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 304 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 305 50331599, 100663291, 201326557, 0} 306 307 /* 308 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 309 * are aligned on cache lines. 310 */ 311 typedef union itc_s { 312 conn_t itc_conn; 313 char itcu_filler[CACHE_ALIGN(conn_s)]; 314 } itc_t; 315 316 struct kmem_cache *tcp_conn_cache; 317 struct kmem_cache *ip_conn_cache; 318 extern struct kmem_cache *sctp_conn_cache; 319 struct kmem_cache *udp_conn_cache; 320 struct kmem_cache *rawip_conn_cache; 321 struct kmem_cache *rts_conn_cache; 322 323 extern void tcp_timermp_free(tcp_t *); 324 extern mblk_t *tcp_timermp_alloc(int); 325 326 static int ip_conn_constructor(void *, void *, int); 327 static void ip_conn_destructor(void *, void *); 328 329 static int tcp_conn_constructor(void *, void *, int); 330 static void tcp_conn_destructor(void *, void *); 331 332 static int udp_conn_constructor(void *, void *, int); 333 static void udp_conn_destructor(void *, void *); 334 335 static int rawip_conn_constructor(void *, void *, int); 336 static void rawip_conn_destructor(void *, void *); 337 338 static int rts_conn_constructor(void *, void *, int); 339 static void rts_conn_destructor(void *, void *); 340 341 /* 342 * Global (for all stack instances) init routine 343 */ 344 void 345 ipcl_g_init(void) 346 { 347 ip_conn_cache = kmem_cache_create("ip_conn_cache", 348 sizeof (conn_t), 
CACHE_ALIGN_SIZE, 349 ip_conn_constructor, ip_conn_destructor, 350 NULL, NULL, NULL, 0); 351 352 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 353 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 354 tcp_conn_constructor, tcp_conn_destructor, 355 tcp_conn_reclaim, NULL, NULL, 0); 356 357 udp_conn_cache = kmem_cache_create("udp_conn_cache", 358 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 359 udp_conn_constructor, udp_conn_destructor, 360 NULL, NULL, NULL, 0); 361 362 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 363 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 364 rawip_conn_constructor, rawip_conn_destructor, 365 NULL, NULL, NULL, 0); 366 367 rts_conn_cache = kmem_cache_create("rts_conn_cache", 368 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 369 rts_conn_constructor, rts_conn_destructor, 370 NULL, NULL, NULL, 0); 371 } 372 373 /* 374 * ipclassifier intialization routine, sets up hash tables. 375 */ 376 void 377 ipcl_init(ip_stack_t *ipst) 378 { 379 int i; 380 int sizes[] = P2Ps(); 381 382 /* 383 * Calculate size of conn fanout table from /etc/system settings 384 */ 385 if (ipcl_conn_hash_size != 0) { 386 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 387 } else if (tcp_conn_hash_size != 0) { 388 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 389 } else { 390 extern pgcnt_t freemem; 391 392 ipst->ips_ipcl_conn_fanout_size = 393 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 394 395 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 396 ipst->ips_ipcl_conn_fanout_size = 397 ipcl_conn_hash_maxsize; 398 } 399 } 400 401 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 402 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 403 break; 404 } 405 } 406 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 407 /* Out of range, use the 2^16 value */ 408 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 409 } 410 411 /* Take values from /etc/system */ 412 ipst->ips_ipcl_bind_fanout_size = 
ipcl_bind_fanout_size; 413 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 414 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 415 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 416 417 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 418 419 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 420 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 421 422 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 423 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 424 MUTEX_DEFAULT, NULL); 425 } 426 427 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 428 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 429 430 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 431 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 432 MUTEX_DEFAULT, NULL); 433 } 434 435 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 436 sizeof (connf_t), KM_SLEEP); 437 for (i = 0; i < IPPROTO_MAX; i++) { 438 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 439 MUTEX_DEFAULT, NULL); 440 } 441 442 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 443 sizeof (connf_t), KM_SLEEP); 444 for (i = 0; i < IPPROTO_MAX; i++) { 445 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 446 MUTEX_DEFAULT, NULL); 447 } 448 449 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 450 mutex_init(&ipst->ips_rts_clients->connf_lock, 451 NULL, MUTEX_DEFAULT, NULL); 452 453 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 454 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 455 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 456 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 457 MUTEX_DEFAULT, NULL); 458 } 459 460 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 461 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 462 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 463 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 464 MUTEX_DEFAULT, NULL); 465 } 466 467 
ipst->ips_ipcl_raw_fanout = kmem_zalloc( 468 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 469 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 470 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 471 MUTEX_DEFAULT, NULL); 472 } 473 474 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 475 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 476 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 477 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 478 NULL, MUTEX_DEFAULT, NULL); 479 } 480 } 481 482 void 483 ipcl_g_destroy(void) 484 { 485 kmem_cache_destroy(ip_conn_cache); 486 kmem_cache_destroy(tcp_conn_cache); 487 kmem_cache_destroy(udp_conn_cache); 488 kmem_cache_destroy(rawip_conn_cache); 489 kmem_cache_destroy(rts_conn_cache); 490 } 491 492 /* 493 * All user-level and kernel use of the stack must be gone 494 * by now. 495 */ 496 void 497 ipcl_destroy(ip_stack_t *ipst) 498 { 499 int i; 500 501 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 502 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 503 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 504 } 505 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 506 sizeof (connf_t)); 507 ipst->ips_ipcl_conn_fanout = NULL; 508 509 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 510 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 511 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 512 } 513 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 514 sizeof (connf_t)); 515 ipst->ips_ipcl_bind_fanout = NULL; 516 517 for (i = 0; i < IPPROTO_MAX; i++) { 518 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 519 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 520 } 521 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 522 IPPROTO_MAX * sizeof (connf_t)); 523 ipst->ips_ipcl_proto_fanout_v4 = NULL; 524 525 for (i = 0; i < IPPROTO_MAX; i++) { 526 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head 
== NULL); 527 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 528 } 529 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 530 IPPROTO_MAX * sizeof (connf_t)); 531 ipst->ips_ipcl_proto_fanout_v6 = NULL; 532 533 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 534 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 535 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 536 } 537 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 538 sizeof (connf_t)); 539 ipst->ips_ipcl_udp_fanout = NULL; 540 541 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 542 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 543 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 544 } 545 kmem_free(ipst->ips_ipcl_iptun_fanout, 546 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 547 ipst->ips_ipcl_iptun_fanout = NULL; 548 549 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 550 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 551 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 552 } 553 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 554 sizeof (connf_t)); 555 ipst->ips_ipcl_raw_fanout = NULL; 556 557 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 558 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 559 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 560 } 561 kmem_free(ipst->ips_ipcl_globalhash_fanout, 562 sizeof (connf_t) * CONN_G_HASH_SIZE); 563 ipst->ips_ipcl_globalhash_fanout = NULL; 564 565 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 566 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 567 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 568 ipst->ips_rts_clients = NULL; 569 } 570 571 /* 572 * conn creation routine. initialize the conn, sets the reference 573 * and inserts it in the global hash table. 
574 */ 575 conn_t * 576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 577 { 578 conn_t *connp; 579 struct kmem_cache *conn_cache; 580 581 switch (type) { 582 case IPCL_SCTPCONN: 583 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 584 return (NULL); 585 sctp_conn_init(connp); 586 netstack_hold(ns); 587 connp->conn_netstack = ns; 588 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 589 ipcl_globalhash_insert(connp); 590 return (connp); 591 592 case IPCL_TCPCONN: 593 conn_cache = tcp_conn_cache; 594 break; 595 596 case IPCL_UDPCONN: 597 conn_cache = udp_conn_cache; 598 break; 599 600 case IPCL_RAWIPCONN: 601 conn_cache = rawip_conn_cache; 602 break; 603 604 case IPCL_RTSCONN: 605 conn_cache = rts_conn_cache; 606 break; 607 608 case IPCL_IPCCONN: 609 conn_cache = ip_conn_cache; 610 break; 611 612 default: 613 connp = NULL; 614 ASSERT(0); 615 } 616 617 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 618 return (NULL); 619 620 connp->conn_ref = 1; 621 netstack_hold(ns); 622 connp->conn_netstack = ns; 623 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 624 ipcl_globalhash_insert(connp); 625 return (connp); 626 } 627 628 void 629 ipcl_conn_destroy(conn_t *connp) 630 { 631 mblk_t *mp; 632 netstack_t *ns = connp->conn_netstack; 633 634 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 635 ASSERT(connp->conn_ref == 0); 636 637 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 638 639 if (connp->conn_cred != NULL) { 640 crfree(connp->conn_cred); 641 connp->conn_cred = NULL; 642 /* ixa_cred done in ipcl_conn_cleanup below */ 643 } 644 645 if (connp->conn_ht_iphc != NULL) { 646 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 647 connp->conn_ht_iphc = NULL; 648 connp->conn_ht_iphc_allocated = 0; 649 connp->conn_ht_iphc_len = 0; 650 connp->conn_ht_ulp = NULL; 651 connp->conn_ht_ulp_len = 0; 652 } 653 ip_pkt_free(&connp->conn_xmit_ipp); 654 655 ipcl_globalhash_remove(connp); 656 657 if (connp->conn_latch != NULL) { 658 
IPLATCH_REFRELE(connp->conn_latch); 659 connp->conn_latch = NULL; 660 } 661 if (connp->conn_latch_in_policy != NULL) { 662 IPPOL_REFRELE(connp->conn_latch_in_policy); 663 connp->conn_latch_in_policy = NULL; 664 } 665 if (connp->conn_latch_in_action != NULL) { 666 IPACT_REFRELE(connp->conn_latch_in_action); 667 connp->conn_latch_in_action = NULL; 668 } 669 if (connp->conn_policy != NULL) { 670 IPPH_REFRELE(connp->conn_policy, ns); 671 connp->conn_policy = NULL; 672 } 673 674 if (connp->conn_ipsec_opt_mp != NULL) { 675 freemsg(connp->conn_ipsec_opt_mp); 676 connp->conn_ipsec_opt_mp = NULL; 677 } 678 679 if (connp->conn_flags & IPCL_TCPCONN) { 680 tcp_t *tcp = connp->conn_tcp; 681 682 tcp_free(tcp); 683 mp = tcp->tcp_timercache; 684 685 tcp->tcp_tcps = NULL; 686 687 /* 688 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 689 * the mblk. 690 */ 691 if (tcp->tcp_rsrv_mp != NULL) { 692 freeb(tcp->tcp_rsrv_mp); 693 tcp->tcp_rsrv_mp = NULL; 694 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 695 } 696 697 ipcl_conn_cleanup(connp); 698 connp->conn_flags = IPCL_TCPCONN; 699 if (ns != NULL) { 700 ASSERT(tcp->tcp_tcps == NULL); 701 connp->conn_netstack = NULL; 702 connp->conn_ixa->ixa_ipst = NULL; 703 netstack_rele(ns); 704 } 705 706 bzero(tcp, sizeof (tcp_t)); 707 708 tcp->tcp_timercache = mp; 709 tcp->tcp_connp = connp; 710 kmem_cache_free(tcp_conn_cache, connp); 711 return; 712 } 713 714 if (connp->conn_flags & IPCL_SCTPCONN) { 715 ASSERT(ns != NULL); 716 sctp_free(connp); 717 return; 718 } 719 720 ipcl_conn_cleanup(connp); 721 if (ns != NULL) { 722 connp->conn_netstack = NULL; 723 connp->conn_ixa->ixa_ipst = NULL; 724 netstack_rele(ns); 725 } 726 727 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 728 if (connp->conn_flags & IPCL_UDPCONN) { 729 connp->conn_flags = IPCL_UDPCONN; 730 kmem_cache_free(udp_conn_cache, connp); 731 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 732 connp->conn_flags = IPCL_RAWIPCONN; 733 connp->conn_proto = IPPROTO_ICMP; 734 connp->conn_ixa->ixa_protocol = connp->conn_proto; 735 kmem_cache_free(rawip_conn_cache, connp); 736 } else if (connp->conn_flags & IPCL_RTSCONN) { 737 connp->conn_flags = IPCL_RTSCONN; 738 kmem_cache_free(rts_conn_cache, connp); 739 } else { 740 connp->conn_flags = IPCL_IPCCONN; 741 ASSERT(connp->conn_flags & IPCL_IPCCONN); 742 ASSERT(connp->conn_priv == NULL); 743 kmem_cache_free(ip_conn_cache, connp); 744 } 745 } 746 747 /* 748 * Running in cluster mode - deregister listener information 749 */ 750 static void 751 ipcl_conn_unlisten(conn_t *connp) 752 { 753 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 754 ASSERT(connp->conn_lport != 0); 755 756 if (cl_inet_unlisten != NULL) { 757 sa_family_t addr_family; 758 uint8_t *laddrp; 759 760 if (connp->conn_ipversion == IPV6_VERSION) { 761 addr_family = AF_INET6; 762 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 763 } else { 764 addr_family = AF_INET; 765 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 766 } 767 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 768 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 769 } 770 connp->conn_flags &= ~IPCL_CL_LISTENER; 771 } 772 773 /* 774 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 775 * which table the conn belonged to). So for debugging we can see which hash 776 * table this connection was in. 
777 */ 778 #define IPCL_HASH_REMOVE(connp) { \ 779 connf_t *connfp = (connp)->conn_fanout; \ 780 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 781 if (connfp != NULL) { \ 782 mutex_enter(&connfp->connf_lock); \ 783 if ((connp)->conn_next != NULL) \ 784 (connp)->conn_next->conn_prev = \ 785 (connp)->conn_prev; \ 786 if ((connp)->conn_prev != NULL) \ 787 (connp)->conn_prev->conn_next = \ 788 (connp)->conn_next; \ 789 else \ 790 connfp->connf_head = (connp)->conn_next; \ 791 (connp)->conn_fanout = NULL; \ 792 (connp)->conn_next = NULL; \ 793 (connp)->conn_prev = NULL; \ 794 (connp)->conn_flags |= IPCL_REMOVED; \ 795 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 796 ipcl_conn_unlisten((connp)); \ 797 CONN_DEC_REF((connp)); \ 798 mutex_exit(&connfp->connf_lock); \ 799 } \ 800 } 801 802 void 803 ipcl_hash_remove(conn_t *connp) 804 { 805 uint8_t protocol = connp->conn_proto; 806 807 IPCL_HASH_REMOVE(connp); 808 if (protocol == IPPROTO_RSVP) 809 ill_set_inputfn_all(connp->conn_netstack->netstack_ip); 810 } 811 812 /* 813 * The whole purpose of this function is allow removal of 814 * a conn_t from the connected hash for timewait reclaim. 815 * This is essentially a TW reclaim fastpath where timewait 816 * collector checks under fanout lock (so no one else can 817 * get access to the conn_t) that refcnt is 2 i.e. one for 818 * TCP and one for the classifier hash list. If ref count 819 * is indeed 2, we can just remove the conn under lock and 820 * avoid cleaning up the conn under squeue. This gives us 821 * improved performance. 
822 */ 823 void 824 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 825 { 826 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 827 ASSERT(MUTEX_HELD(&connp->conn_lock)); 828 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 829 830 if ((connp)->conn_next != NULL) { 831 (connp)->conn_next->conn_prev = (connp)->conn_prev; 832 } 833 if ((connp)->conn_prev != NULL) { 834 (connp)->conn_prev->conn_next = (connp)->conn_next; 835 } else { 836 connfp->connf_head = (connp)->conn_next; 837 } 838 (connp)->conn_fanout = NULL; 839 (connp)->conn_next = NULL; 840 (connp)->conn_prev = NULL; 841 (connp)->conn_flags |= IPCL_REMOVED; 842 ASSERT((connp)->conn_ref == 2); 843 (connp)->conn_ref--; 844 } 845 846 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 847 ASSERT((connp)->conn_fanout == NULL); \ 848 ASSERT((connp)->conn_next == NULL); \ 849 ASSERT((connp)->conn_prev == NULL); \ 850 if ((connfp)->connf_head != NULL) { \ 851 (connfp)->connf_head->conn_prev = (connp); \ 852 (connp)->conn_next = (connfp)->connf_head; \ 853 } \ 854 (connp)->conn_fanout = (connfp); \ 855 (connfp)->connf_head = (connp); \ 856 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 857 IPCL_CONNECTED; \ 858 CONN_INC_REF(connp); \ 859 } 860 861 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 862 IPCL_HASH_REMOVE((connp)); \ 863 mutex_enter(&(connfp)->connf_lock); \ 864 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 865 mutex_exit(&(connfp)->connf_lock); \ 866 } 867 868 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 869 conn_t *pconnp = NULL, *nconnp; \ 870 IPCL_HASH_REMOVE((connp)); \ 871 mutex_enter(&(connfp)->connf_lock); \ 872 nconnp = (connfp)->connf_head; \ 873 while (nconnp != NULL && \ 874 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 875 pconnp = nconnp; \ 876 nconnp = nconnp->conn_next; \ 877 } \ 878 if (pconnp != NULL) { \ 879 pconnp->conn_next = (connp); \ 880 (connp)->conn_prev = pconnp; \ 881 } else { \ 882 (connfp)->connf_head = (connp); \ 883 } \ 884 if 
(nconnp != NULL) { \ 885 (connp)->conn_next = nconnp; \ 886 nconnp->conn_prev = (connp); \ 887 } \ 888 (connp)->conn_fanout = (connfp); \ 889 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 890 IPCL_BOUND; \ 891 CONN_INC_REF(connp); \ 892 mutex_exit(&(connfp)->connf_lock); \ 893 } 894 895 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 896 conn_t **list, *prev, *next; \ 897 boolean_t isv4mapped = \ 898 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 899 IPCL_HASH_REMOVE((connp)); \ 900 mutex_enter(&(connfp)->connf_lock); \ 901 list = &(connfp)->connf_head; \ 902 prev = NULL; \ 903 while ((next = *list) != NULL) { \ 904 if (isv4mapped && \ 905 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 906 connp->conn_zoneid == next->conn_zoneid) { \ 907 (connp)->conn_next = next; \ 908 if (prev != NULL) \ 909 prev = next->conn_prev; \ 910 next->conn_prev = (connp); \ 911 break; \ 912 } \ 913 list = &next->conn_next; \ 914 prev = next; \ 915 } \ 916 (connp)->conn_prev = prev; \ 917 *list = (connp); \ 918 (connp)->conn_fanout = (connfp); \ 919 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 920 IPCL_BOUND; \ 921 CONN_INC_REF((connp)); \ 922 mutex_exit(&(connfp)->connf_lock); \ 923 } 924 925 void 926 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 927 { 928 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 929 } 930 931 /* 932 * Because the classifier is used to classify inbound packets, the destination 933 * address is meant to be our local tunnel address (tunnel source), and the 934 * source the remote tunnel address (tunnel destination). 935 * 936 * Note that conn_proto can't be used for fanout since the upper protocol 937 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
938 */ 939 conn_t * 940 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 941 { 942 connf_t *connfp; 943 conn_t *connp; 944 945 /* first look for IPv4 tunnel links */ 946 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 947 mutex_enter(&connfp->connf_lock); 948 for (connp = connfp->connf_head; connp != NULL; 949 connp = connp->conn_next) { 950 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 951 break; 952 } 953 if (connp != NULL) 954 goto done; 955 956 mutex_exit(&connfp->connf_lock); 957 958 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 959 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 960 INADDR_ANY)]; 961 mutex_enter(&connfp->connf_lock); 962 for (connp = connfp->connf_head; connp != NULL; 963 connp = connp->conn_next) { 964 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 965 break; 966 } 967 done: 968 if (connp != NULL) 969 CONN_INC_REF(connp); 970 mutex_exit(&connfp->connf_lock); 971 return (connp); 972 } 973 974 conn_t * 975 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 976 { 977 connf_t *connfp; 978 conn_t *connp; 979 980 /* Look for an IPv6 tunnel link */ 981 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 982 mutex_enter(&connfp->connf_lock); 983 for (connp = connfp->connf_head; connp != NULL; 984 connp = connp->conn_next) { 985 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 986 CONN_INC_REF(connp); 987 break; 988 } 989 } 990 mutex_exit(&connfp->connf_lock); 991 return (connp); 992 } 993 994 /* 995 * This function is used only for inserting SCTP raw socket now. 996 * This may change later. 997 * 998 * Note that only one raw socket can be bound to a port. The param 999 * lport is in network byte order. 
1000 */ 1001 static int 1002 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1003 { 1004 connf_t *connfp; 1005 conn_t *oconnp; 1006 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1007 1008 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1009 1010 /* Check for existing raw socket already bound to the port. */ 1011 mutex_enter(&connfp->connf_lock); 1012 for (oconnp = connfp->connf_head; oconnp != NULL; 1013 oconnp = oconnp->conn_next) { 1014 if (oconnp->conn_lport == lport && 1015 oconnp->conn_zoneid == connp->conn_zoneid && 1016 oconnp->conn_family == connp->conn_family && 1017 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1018 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || 1019 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || 1020 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || 1021 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, 1022 &connp->conn_laddr_v6))) { 1023 break; 1024 } 1025 } 1026 mutex_exit(&connfp->connf_lock); 1027 if (oconnp != NULL) 1028 return (EADDRNOTAVAIL); 1029 1030 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || 1031 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1032 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1033 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { 1034 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1035 } else { 1036 IPCL_HASH_INSERT_BOUND(connfp, connp); 1037 } 1038 } else { 1039 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1040 } 1041 return (0); 1042 } 1043 1044 static int 1045 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) 1046 { 1047 connf_t *connfp; 1048 conn_t *tconnp; 1049 ipaddr_t laddr = connp->conn_laddr_v4; 1050 ipaddr_t faddr = connp->conn_faddr_v4; 1051 1052 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; 1053 mutex_enter(&connfp->connf_lock); 1054 for (tconnp = connfp->connf_head; tconnp != NULL; 1055 tconnp = tconnp->conn_next) { 1056 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { 1057 /* A tunnel is already bound to 
these addresses. */ 1058 mutex_exit(&connfp->connf_lock); 1059 return (EADDRINUSE); 1060 } 1061 } 1062 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1063 mutex_exit(&connfp->connf_lock); 1064 return (0); 1065 } 1066 1067 static int 1068 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) 1069 { 1070 connf_t *connfp; 1071 conn_t *tconnp; 1072 in6_addr_t *laddr = &connp->conn_laddr_v6; 1073 in6_addr_t *faddr = &connp->conn_faddr_v6; 1074 1075 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; 1076 mutex_enter(&connfp->connf_lock); 1077 for (tconnp = connfp->connf_head; tconnp != NULL; 1078 tconnp = tconnp->conn_next) { 1079 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { 1080 /* A tunnel is already bound to these addresses. */ 1081 mutex_exit(&connfp->connf_lock); 1082 return (EADDRINUSE); 1083 } 1084 } 1085 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1086 mutex_exit(&connfp->connf_lock); 1087 return (0); 1088 } 1089 1090 /* 1091 * Check for a MAC exemption conflict on a labeled system. Note that for 1092 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1093 * transport layer. This check is for binding all other protocols. 1094 * 1095 * Returns true if there's a conflict. 
1096 */ 1097 static boolean_t 1098 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1099 { 1100 connf_t *connfp; 1101 conn_t *tconn; 1102 1103 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1104 mutex_enter(&connfp->connf_lock); 1105 for (tconn = connfp->connf_head; tconn != NULL; 1106 tconn = tconn->conn_next) { 1107 /* We don't allow v4 fallback for v6 raw socket */ 1108 if (connp->conn_family != tconn->conn_family) 1109 continue; 1110 /* If neither is exempt, then there's no conflict */ 1111 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1112 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1113 continue; 1114 /* We are only concerned about sockets for a different zone */ 1115 if (connp->conn_zoneid == tconn->conn_zoneid) 1116 continue; 1117 /* If both are bound to different specific addrs, ok */ 1118 if (connp->conn_laddr_v4 != INADDR_ANY && 1119 tconn->conn_laddr_v4 != INADDR_ANY && 1120 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1121 continue; 1122 /* These two conflict; fail */ 1123 break; 1124 } 1125 mutex_exit(&connfp->connf_lock); 1126 return (tconn != NULL); 1127 } 1128 1129 static boolean_t 1130 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1131 { 1132 connf_t *connfp; 1133 conn_t *tconn; 1134 1135 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1136 mutex_enter(&connfp->connf_lock); 1137 for (tconn = connfp->connf_head; tconn != NULL; 1138 tconn = tconn->conn_next) { 1139 /* We don't allow v4 fallback for v6 raw socket */ 1140 if (connp->conn_family != tconn->conn_family) 1141 continue; 1142 /* If neither is exempt, then there's no conflict */ 1143 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1144 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1145 continue; 1146 /* We are only concerned about sockets for a different zone */ 1147 if (connp->conn_zoneid == tconn->conn_zoneid) 1148 continue; 1149 /* If both are bound to different addrs, ok */ 1150 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1151 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1152 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1153 &tconn->conn_laddr_v6)) 1154 continue; 1155 /* These two conflict; fail */ 1156 break; 1157 } 1158 mutex_exit(&connfp->connf_lock); 1159 return (tconn != NULL); 1160 } 1161 1162 /* 1163 * (v4, v6) bind hash insertion routines 1164 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1165 */ 1166 1167 int 1168 ipcl_bind_insert(conn_t *connp) 1169 { 1170 if (connp->conn_ipversion == IPV6_VERSION) 1171 return (ipcl_bind_insert_v6(connp)); 1172 else 1173 return (ipcl_bind_insert_v4(connp)); 1174 } 1175 1176 int 1177 ipcl_bind_insert_v4(conn_t *connp) 1178 { 1179 connf_t *connfp; 1180 int ret = 0; 1181 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1182 uint16_t lport = connp->conn_lport; 1183 uint8_t protocol = connp->conn_proto; 1184 1185 if (IPCL_IS_IPTUN(connp)) 1186 return (ipcl_iptun_hash_insert(connp, ipst)); 1187 1188 switch (protocol) { 1189 default: 1190 if (is_system_labeled() && 1191 check_exempt_conflict_v4(connp, ipst)) 1192 return (EADDRINUSE); 1193 /* FALLTHROUGH */ 1194 case IPPROTO_UDP: 1195 if (protocol == IPPROTO_UDP) { 1196 connfp = &ipst->ips_ipcl_udp_fanout[ 1197 IPCL_UDP_HASH(lport, ipst)]; 1198 } else { 1199 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1200 } 1201 1202 if (connp->conn_faddr_v4 != INADDR_ANY) { 1203 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1204 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1205 IPCL_HASH_INSERT_BOUND(connfp, connp); 1206 } else { 1207 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1208 } 1209 if (protocol == IPPROTO_RSVP) 1210 ill_set_inputfn_all(ipst); 1211 break; 1212 1213 case IPPROTO_TCP: 1214 /* Insert it in the Bind Hash */ 1215 ASSERT(connp->conn_zoneid != ALL_ZONES); 1216 connfp = &ipst->ips_ipcl_bind_fanout[ 1217 IPCL_BIND_HASH(lport, ipst)]; 1218 if (connp->conn_laddr_v4 != INADDR_ANY) { 1219 IPCL_HASH_INSERT_BOUND(connfp, connp); 1220 } else { 1221 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1222 } 1223 if (cl_inet_listen != NULL) { 1224 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1225 connp->conn_flags |= IPCL_CL_LISTENER; 1226 (*cl_inet_listen)( 1227 connp->conn_netstack->netstack_stackid, 1228 IPPROTO_TCP, AF_INET, 1229 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1230 } 1231 break; 1232 1233 case IPPROTO_SCTP: 1234 ret = ipcl_sctp_hash_insert(connp, lport); 1235 break; 1236 } 1237 1238 return (ret); 1239 } 1240 1241 int 1242 ipcl_bind_insert_v6(conn_t *connp) 1243 { 1244 connf_t *connfp; 1245 int ret = 0; 1246 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1247 uint16_t lport = connp->conn_lport; 1248 uint8_t protocol = connp->conn_proto; 1249 1250 if (IPCL_IS_IPTUN(connp)) { 1251 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1252 } 1253 1254 switch (protocol) { 1255 default: 1256 if (is_system_labeled() && 1257 check_exempt_conflict_v6(connp, ipst)) 1258 return (EADDRINUSE); 1259 /* FALLTHROUGH */ 1260 case IPPROTO_UDP: 1261 if (protocol == IPPROTO_UDP) { 1262 connfp = &ipst->ips_ipcl_udp_fanout[ 1263 IPCL_UDP_HASH(lport, ipst)]; 1264 } else { 1265 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1266 } 1267 1268 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1269 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1270 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1271 IPCL_HASH_INSERT_BOUND(connfp, connp); 1272 } else { 1273 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1274 } 1275 break; 1276 1277 case IPPROTO_TCP: 1278 /* Insert it in the Bind Hash */ 1279 ASSERT(connp->conn_zoneid != ALL_ZONES); 1280 connfp = &ipst->ips_ipcl_bind_fanout[ 1281 IPCL_BIND_HASH(lport, ipst)]; 1282 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1283 IPCL_HASH_INSERT_BOUND(connfp, connp); 1284 } else { 1285 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1286 } 1287 if (cl_inet_listen != NULL) { 1288 sa_family_t addr_family; 1289 uint8_t *laddrp; 1290 1291 if (connp->conn_ipversion == 
IPV6_VERSION) { 1292 addr_family = AF_INET6; 1293 laddrp = 1294 (uint8_t *)&connp->conn_bound_addr_v6; 1295 } else { 1296 addr_family = AF_INET; 1297 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1298 } 1299 connp->conn_flags |= IPCL_CL_LISTENER; 1300 (*cl_inet_listen)( 1301 connp->conn_netstack->netstack_stackid, 1302 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1303 } 1304 break; 1305 1306 case IPPROTO_SCTP: 1307 ret = ipcl_sctp_hash_insert(connp, lport); 1308 break; 1309 } 1310 1311 return (ret); 1312 } 1313 1314 /* 1315 * ipcl_conn_hash insertion routines. 1316 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1317 */ 1318 1319 int 1320 ipcl_conn_insert(conn_t *connp) 1321 { 1322 if (connp->conn_ipversion == IPV6_VERSION) 1323 return (ipcl_conn_insert_v6(connp)); 1324 else 1325 return (ipcl_conn_insert_v4(connp)); 1326 } 1327 1328 int 1329 ipcl_conn_insert_v4(conn_t *connp) 1330 { 1331 connf_t *connfp; 1332 conn_t *tconnp; 1333 int ret = 0; 1334 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1335 uint16_t lport = connp->conn_lport; 1336 uint8_t protocol = connp->conn_proto; 1337 1338 if (IPCL_IS_IPTUN(connp)) 1339 return (ipcl_iptun_hash_insert(connp, ipst)); 1340 1341 switch (protocol) { 1342 case IPPROTO_TCP: 1343 /* 1344 * For TCP, we check whether the connection tuple already 1345 * exists before allowing the connection to proceed. We 1346 * also allow indexing on the zoneid. This is to allow 1347 * multiple shared stack zones to have the same tcp 1348 * connection tuple. In practice this only happens for 1349 * INADDR_LOOPBACK as it's the only local address which 1350 * doesn't have to be unique. 
1351 */ 1352 connfp = &ipst->ips_ipcl_conn_fanout[ 1353 IPCL_CONN_HASH(connp->conn_faddr_v4, 1354 connp->conn_ports, ipst)]; 1355 mutex_enter(&connfp->connf_lock); 1356 for (tconnp = connfp->connf_head; tconnp != NULL; 1357 tconnp = tconnp->conn_next) { 1358 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1359 connp->conn_faddr_v4, connp->conn_laddr_v4, 1360 connp->conn_ports) && 1361 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1362 /* Already have a conn. bail out */ 1363 mutex_exit(&connfp->connf_lock); 1364 return (EADDRINUSE); 1365 } 1366 } 1367 if (connp->conn_fanout != NULL) { 1368 /* 1369 * Probably a XTI/TLI application trying to do a 1370 * rebind. Let it happen. 1371 */ 1372 mutex_exit(&connfp->connf_lock); 1373 IPCL_HASH_REMOVE(connp); 1374 mutex_enter(&connfp->connf_lock); 1375 } 1376 1377 ASSERT(connp->conn_recv != NULL); 1378 ASSERT(connp->conn_recvicmp != NULL); 1379 1380 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1381 mutex_exit(&connfp->connf_lock); 1382 break; 1383 1384 case IPPROTO_SCTP: 1385 /* 1386 * The raw socket may have already been bound, remove it 1387 * from the hash first. 1388 */ 1389 IPCL_HASH_REMOVE(connp); 1390 ret = ipcl_sctp_hash_insert(connp, lport); 1391 break; 1392 1393 default: 1394 /* 1395 * Check for conflicts among MAC exempt bindings. For 1396 * transports with port numbers, this is done by the upper 1397 * level per-transport binding logic. For all others, it's 1398 * done here. 
1399 */ 1400 if (is_system_labeled() && 1401 check_exempt_conflict_v4(connp, ipst)) 1402 return (EADDRINUSE); 1403 /* FALLTHROUGH */ 1404 1405 case IPPROTO_UDP: 1406 if (protocol == IPPROTO_UDP) { 1407 connfp = &ipst->ips_ipcl_udp_fanout[ 1408 IPCL_UDP_HASH(lport, ipst)]; 1409 } else { 1410 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1411 } 1412 1413 if (connp->conn_faddr_v4 != INADDR_ANY) { 1414 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1415 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1416 IPCL_HASH_INSERT_BOUND(connfp, connp); 1417 } else { 1418 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1419 } 1420 break; 1421 } 1422 1423 return (ret); 1424 } 1425 1426 int 1427 ipcl_conn_insert_v6(conn_t *connp) 1428 { 1429 connf_t *connfp; 1430 conn_t *tconnp; 1431 int ret = 0; 1432 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1433 uint16_t lport = connp->conn_lport; 1434 uint8_t protocol = connp->conn_proto; 1435 uint_t ifindex = connp->conn_bound_if; 1436 1437 if (IPCL_IS_IPTUN(connp)) 1438 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1439 1440 switch (protocol) { 1441 case IPPROTO_TCP: 1442 1443 /* 1444 * For tcp, we check whether the connection tuple already 1445 * exists before allowing the connection to proceed. We 1446 * also allow indexing on the zoneid. This is to allow 1447 * multiple shared stack zones to have the same tcp 1448 * connection tuple. In practice this only happens for 1449 * ipv6_loopback as it's the only local address which 1450 * doesn't have to be unique. 1451 */ 1452 connfp = &ipst->ips_ipcl_conn_fanout[ 1453 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1454 ipst)]; 1455 mutex_enter(&connfp->connf_lock); 1456 for (tconnp = connfp->connf_head; tconnp != NULL; 1457 tconnp = tconnp->conn_next) { 1458 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1459 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1460 connp->conn_faddr_v6, connp->conn_laddr_v6, 1461 connp->conn_ports) && 1462 (tconnp->conn_bound_if == 0 || 1463 tconnp->conn_bound_if == ifindex) && 1464 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1465 /* Already have a conn. bail out */ 1466 mutex_exit(&connfp->connf_lock); 1467 return (EADDRINUSE); 1468 } 1469 } 1470 if (connp->conn_fanout != NULL) { 1471 /* 1472 * Probably a XTI/TLI application trying to do a 1473 * rebind. Let it happen. 1474 */ 1475 mutex_exit(&connfp->connf_lock); 1476 IPCL_HASH_REMOVE(connp); 1477 mutex_enter(&connfp->connf_lock); 1478 } 1479 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1480 mutex_exit(&connfp->connf_lock); 1481 break; 1482 1483 case IPPROTO_SCTP: 1484 IPCL_HASH_REMOVE(connp); 1485 ret = ipcl_sctp_hash_insert(connp, lport); 1486 break; 1487 1488 default: 1489 if (is_system_labeled() && 1490 check_exempt_conflict_v6(connp, ipst)) 1491 return (EADDRINUSE); 1492 /* FALLTHROUGH */ 1493 case IPPROTO_UDP: 1494 if (protocol == IPPROTO_UDP) { 1495 connfp = &ipst->ips_ipcl_udp_fanout[ 1496 IPCL_UDP_HASH(lport, ipst)]; 1497 } else { 1498 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1499 } 1500 1501 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1502 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1503 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1504 IPCL_HASH_INSERT_BOUND(connfp, connp); 1505 } else { 1506 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1507 } 1508 break; 1509 } 1510 1511 return (ret); 1512 } 1513 1514 /* 1515 * v4 packet classifying function. looks up the fanout table to 1516 * find the conn, the packet belongs to. returns the conn with 1517 * the reference held, null otherwise. 1518 * 1519 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1520 * Lookup" comment block are applied. Labels are also checked as described 1521 * above. 
If the packet is from the inside (looped back), and is from the same 1522 * zone, then label checks are omitted. 1523 */ 1524 conn_t * 1525 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1526 ip_recv_attr_t *ira, ip_stack_t *ipst) 1527 { 1528 ipha_t *ipha; 1529 connf_t *connfp, *bind_connfp; 1530 uint16_t lport; 1531 uint16_t fport; 1532 uint32_t ports; 1533 conn_t *connp; 1534 uint16_t *up; 1535 zoneid_t zoneid = ira->ira_zoneid; 1536 1537 ipha = (ipha_t *)mp->b_rptr; 1538 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1539 1540 switch (protocol) { 1541 case IPPROTO_TCP: 1542 ports = *(uint32_t *)up; 1543 connfp = 1544 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1545 ports, ipst)]; 1546 mutex_enter(&connfp->connf_lock); 1547 for (connp = connfp->connf_head; connp != NULL; 1548 connp = connp->conn_next) { 1549 if (IPCL_CONN_MATCH(connp, protocol, 1550 ipha->ipha_src, ipha->ipha_dst, ports) && 1551 (connp->conn_zoneid == zoneid || 1552 connp->conn_allzones || 1553 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1554 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1555 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1556 break; 1557 } 1558 1559 if (connp != NULL) { 1560 /* 1561 * We have a fully-bound TCP connection. 1562 * 1563 * For labeled systems, there's no need to check the 1564 * label here. It's known to be good as we checked 1565 * before allowing the connection to become bound. 
1566 */ 1567 CONN_INC_REF(connp); 1568 mutex_exit(&connfp->connf_lock); 1569 return (connp); 1570 } 1571 1572 mutex_exit(&connfp->connf_lock); 1573 lport = up[1]; 1574 bind_connfp = 1575 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1576 mutex_enter(&bind_connfp->connf_lock); 1577 for (connp = bind_connfp->connf_head; connp != NULL; 1578 connp = connp->conn_next) { 1579 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1580 lport) && 1581 (connp->conn_zoneid == zoneid || 1582 connp->conn_allzones || 1583 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1584 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1585 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1586 break; 1587 } 1588 1589 /* 1590 * If the matching connection is SLP on a private address, then 1591 * the label on the packet must match the local zone's label. 1592 * Otherwise, it must be in the label range defined by tnrh. 1593 * This is ensured by tsol_receive_local. 1594 * 1595 * Note that we don't check tsol_receive_local for 1596 * the connected case. 
1597 */ 1598 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1599 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1600 ira, connp)) { 1601 DTRACE_PROBE3(tx__ip__log__info__classify__tcp, 1602 char *, "connp(1) could not receive mp(2)", 1603 conn_t *, connp, mblk_t *, mp); 1604 connp = NULL; 1605 } 1606 1607 if (connp != NULL) { 1608 /* Have a listener at least */ 1609 CONN_INC_REF(connp); 1610 mutex_exit(&bind_connfp->connf_lock); 1611 return (connp); 1612 } 1613 1614 mutex_exit(&bind_connfp->connf_lock); 1615 break; 1616 1617 case IPPROTO_UDP: 1618 lport = up[1]; 1619 fport = up[0]; 1620 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1621 mutex_enter(&connfp->connf_lock); 1622 for (connp = connfp->connf_head; connp != NULL; 1623 connp = connp->conn_next) { 1624 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1625 fport, ipha->ipha_src) && 1626 (connp->conn_zoneid == zoneid || 1627 connp->conn_allzones || 1628 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1629 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) 1630 break; 1631 } 1632 1633 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1634 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1635 ira, connp)) { 1636 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1637 char *, "connp(1) could not receive mp(2)", 1638 conn_t *, connp, mblk_t *, mp); 1639 connp = NULL; 1640 } 1641 1642 if (connp != NULL) { 1643 CONN_INC_REF(connp); 1644 mutex_exit(&connfp->connf_lock); 1645 return (connp); 1646 } 1647 1648 /* 1649 * We shouldn't come here for multicast/broadcast packets 1650 */ 1651 mutex_exit(&connfp->connf_lock); 1652 1653 break; 1654 1655 case IPPROTO_ENCAP: 1656 case IPPROTO_IPV6: 1657 return (ipcl_iptun_classify_v4(&ipha->ipha_src, 1658 &ipha->ipha_dst, ipst)); 1659 } 1660 1661 return (NULL); 1662 } 1663 1664 conn_t * 1665 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1666 ip_recv_attr_t *ira, ip_stack_t *ipst) 1667 { 1668 ip6_t *ip6h; 1669 
connf_t *connfp, *bind_connfp; 1670 uint16_t lport; 1671 uint16_t fport; 1672 tcpha_t *tcpha; 1673 uint32_t ports; 1674 conn_t *connp; 1675 uint16_t *up; 1676 zoneid_t zoneid = ira->ira_zoneid; 1677 1678 ip6h = (ip6_t *)mp->b_rptr; 1679 1680 switch (protocol) { 1681 case IPPROTO_TCP: 1682 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; 1683 up = &tcpha->tha_lport; 1684 ports = *(uint32_t *)up; 1685 1686 connfp = 1687 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1688 ports, ipst)]; 1689 mutex_enter(&connfp->connf_lock); 1690 for (connp = connfp->connf_head; connp != NULL; 1691 connp = connp->conn_next) { 1692 if (IPCL_CONN_MATCH_V6(connp, protocol, 1693 ip6h->ip6_src, ip6h->ip6_dst, ports) && 1694 (connp->conn_zoneid == zoneid || 1695 connp->conn_allzones || 1696 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1697 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1698 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1699 break; 1700 } 1701 1702 if (connp != NULL) { 1703 /* 1704 * We have a fully-bound TCP connection. 1705 * 1706 * For labeled systems, there's no need to check the 1707 * label here. It's known to be good as we checked 1708 * before allowing the connection to become bound. 
1709 */ 1710 CONN_INC_REF(connp); 1711 mutex_exit(&connfp->connf_lock); 1712 return (connp); 1713 } 1714 1715 mutex_exit(&connfp->connf_lock); 1716 1717 lport = up[1]; 1718 bind_connfp = 1719 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1720 mutex_enter(&bind_connfp->connf_lock); 1721 for (connp = bind_connfp->connf_head; connp != NULL; 1722 connp = connp->conn_next) { 1723 if (IPCL_BIND_MATCH_V6(connp, protocol, 1724 ip6h->ip6_dst, lport) && 1725 (connp->conn_zoneid == zoneid || 1726 connp->conn_allzones || 1727 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1728 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1729 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1730 break; 1731 } 1732 1733 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1734 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1735 ira, connp)) { 1736 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1737 char *, "connp(1) could not receive mp(2)", 1738 conn_t *, connp, mblk_t *, mp); 1739 connp = NULL; 1740 } 1741 1742 if (connp != NULL) { 1743 /* Have a listner at least */ 1744 CONN_INC_REF(connp); 1745 mutex_exit(&bind_connfp->connf_lock); 1746 return (connp); 1747 } 1748 1749 mutex_exit(&bind_connfp->connf_lock); 1750 break; 1751 1752 case IPPROTO_UDP: 1753 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1754 lport = up[1]; 1755 fport = up[0]; 1756 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1757 mutex_enter(&connfp->connf_lock); 1758 for (connp = connfp->connf_head; connp != NULL; 1759 connp = connp->conn_next) { 1760 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1761 fport, ip6h->ip6_src) && 1762 (connp->conn_zoneid == zoneid || 1763 connp->conn_allzones || 1764 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1765 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1766 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1767 break; 1768 } 1769 1770 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1771 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 
1772 ira, connp)) { 1773 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1774 char *, "connp(1) could not receive mp(2)", 1775 conn_t *, connp, mblk_t *, mp); 1776 connp = NULL; 1777 } 1778 1779 if (connp != NULL) { 1780 CONN_INC_REF(connp); 1781 mutex_exit(&connfp->connf_lock); 1782 return (connp); 1783 } 1784 1785 /* 1786 * We shouldn't come here for multicast/broadcast packets 1787 */ 1788 mutex_exit(&connfp->connf_lock); 1789 break; 1790 case IPPROTO_ENCAP: 1791 case IPPROTO_IPV6: 1792 return (ipcl_iptun_classify_v6(&ip6h->ip6_src, 1793 &ip6h->ip6_dst, ipst)); 1794 } 1795 1796 return (NULL); 1797 } 1798 1799 /* 1800 * wrapper around ipcl_classify_(v4,v6) routines. 1801 */ 1802 conn_t * 1803 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 1804 { 1805 if (ira->ira_flags & IRAF_IS_IPV4) { 1806 return (ipcl_classify_v4(mp, ira->ira_protocol, 1807 ira->ira_ip_hdr_length, ira, ipst)); 1808 } else { 1809 return (ipcl_classify_v6(mp, ira->ira_protocol, 1810 ira->ira_ip_hdr_length, ira, ipst)); 1811 } 1812 } 1813 1814 /* 1815 * Only used to classify SCTP RAW sockets 1816 */ 1817 conn_t * 1818 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, 1819 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) 1820 { 1821 connf_t *connfp; 1822 conn_t *connp; 1823 in_port_t lport; 1824 int ipversion; 1825 const void *dst; 1826 zoneid_t zoneid = ira->ira_zoneid; 1827 1828 lport = ((uint16_t *)&ports)[1]; 1829 if (ira->ira_flags & IRAF_IS_IPV4) { 1830 dst = (const void *)&ipha->ipha_dst; 1831 ipversion = IPV4_VERSION; 1832 } else { 1833 dst = (const void *)&ip6h->ip6_dst; 1834 ipversion = IPV6_VERSION; 1835 } 1836 1837 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1838 mutex_enter(&connfp->connf_lock); 1839 for (connp = connfp->connf_head; connp != NULL; 1840 connp = connp->conn_next) { 1841 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1842 if (ipversion != connp->conn_ipversion) 1843 continue; 1844 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1845 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1846 if (ipversion == IPV4_VERSION) { 1847 if (!IPCL_CONN_MATCH(connp, protocol, 1848 ipha->ipha_src, ipha->ipha_dst, ports)) 1849 continue; 1850 } else { 1851 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1852 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1853 continue; 1854 } 1855 } else { 1856 if (ipversion == IPV4_VERSION) { 1857 if (!IPCL_BIND_MATCH(connp, protocol, 1858 ipha->ipha_dst, lport)) 1859 continue; 1860 } else { 1861 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1862 ip6h->ip6_dst, lport)) 1863 continue; 1864 } 1865 } 1866 1867 if (connp->conn_zoneid == zoneid || 1868 connp->conn_allzones || 1869 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1870 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1871 (ira->ira_flags & IRAF_TX_SHARED_ADDR))) 1872 break; 1873 } 1874 1875 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1876 !tsol_receive_local(mp, dst, ipversion, ira, connp)) { 1877 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1878 char *, "connp(1) could not receive mp(2)", 1879 conn_t *, connp, mblk_t *, mp); 1880 connp = NULL; 1881 } 1882 1883 if (connp != NULL) 1884 goto found; 1885 mutex_exit(&connfp->connf_lock); 1886 1887 /* Try to look for a wildcard SCTP RAW socket match. */ 1888 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1889 mutex_enter(&connfp->connf_lock); 1890 for (connp = connfp->connf_head; connp != NULL; 1891 connp = connp->conn_next) { 1892 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1893 if (ipversion != connp->conn_ipversion) 1894 continue; 1895 if (!IPCL_ZONE_MATCH(connp, zoneid)) 1896 continue; 1897 1898 if (ipversion == IPV4_VERSION) { 1899 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) 1900 break; 1901 } else { 1902 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { 1903 break; 1904 } 1905 } 1906 } 1907 1908 if (connp != NULL) 1909 goto found; 1910 1911 mutex_exit(&connfp->connf_lock); 1912 return (NULL); 1913 1914 found: 1915 ASSERT(connp != NULL); 1916 CONN_INC_REF(connp); 1917 mutex_exit(&connfp->connf_lock); 1918 return (connp); 1919 } 1920 1921 /* ARGSUSED */ 1922 static int 1923 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 1924 { 1925 itc_t *itc = (itc_t *)buf; 1926 conn_t *connp = &itc->itc_conn; 1927 tcp_t *tcp = (tcp_t *)&itc[1]; 1928 1929 bzero(connp, sizeof (conn_t)); 1930 bzero(tcp, sizeof (tcp_t)); 1931 1932 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1933 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1934 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 1935 tcp->tcp_timercache = tcp_timermp_alloc(kmflags); 1936 if (tcp->tcp_timercache == NULL) 1937 return (ENOMEM); 1938 connp->conn_tcp = tcp; 1939 connp->conn_flags = IPCL_TCPCONN; 1940 connp->conn_proto = IPPROTO_TCP; 1941 tcp->tcp_connp = connp; 1942 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1943 1944 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1945 if (connp->conn_ixa == NULL) { 1946 tcp_timermp_free(tcp); 1947 return (ENOMEM); 1948 } 1949 connp->conn_ixa->ixa_refcnt = 1; 1950 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1951 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1952 return (0); 1953 } 1954 1955 /* ARGSUSED */ 1956 static void 1957 tcp_conn_destructor(void *buf, void *cdrarg) 1958 { 1959 itc_t *itc = (itc_t *)buf; 1960 conn_t *connp = &itc->itc_conn; 1961 tcp_t *tcp = (tcp_t *)&itc[1]; 1962 1963 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1964 
ASSERT(tcp->tcp_connp == connp); 1965 ASSERT(connp->conn_tcp == tcp); 1966 tcp_timermp_free(tcp); 1967 mutex_destroy(&connp->conn_lock); 1968 cv_destroy(&connp->conn_cv); 1969 cv_destroy(&connp->conn_sq_cv); 1970 rw_destroy(&connp->conn_ilg_lock); 1971 1972 /* Can be NULL if constructor failed */ 1973 if (connp->conn_ixa != NULL) { 1974 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 1975 ASSERT(connp->conn_ixa->ixa_ire == NULL); 1976 ASSERT(connp->conn_ixa->ixa_nce == NULL); 1977 ixa_refrele(connp->conn_ixa); 1978 } 1979 } 1980 1981 /* ARGSUSED */ 1982 static int 1983 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 1984 { 1985 itc_t *itc = (itc_t *)buf; 1986 conn_t *connp = &itc->itc_conn; 1987 1988 bzero(connp, sizeof (conn_t)); 1989 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1990 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1991 connp->conn_flags = IPCL_IPCCONN; 1992 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1993 1994 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1995 if (connp->conn_ixa == NULL) 1996 return (ENOMEM); 1997 connp->conn_ixa->ixa_refcnt = 1; 1998 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1999 return (0); 2000 } 2001 2002 /* ARGSUSED */ 2003 static void 2004 ip_conn_destructor(void *buf, void *cdrarg) 2005 { 2006 itc_t *itc = (itc_t *)buf; 2007 conn_t *connp = &itc->itc_conn; 2008 2009 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2010 ASSERT(connp->conn_priv == NULL); 2011 mutex_destroy(&connp->conn_lock); 2012 cv_destroy(&connp->conn_cv); 2013 rw_destroy(&connp->conn_ilg_lock); 2014 2015 /* Can be NULL if constructor failed */ 2016 if (connp->conn_ixa != NULL) { 2017 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2018 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2019 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2020 ixa_refrele(connp->conn_ixa); 2021 } 2022 } 2023 2024 /* ARGSUSED */ 2025 static int 2026 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2027 { 2028 itc_t *itc = 
(itc_t *)buf; 2029 conn_t *connp = &itc->itc_conn; 2030 udp_t *udp = (udp_t *)&itc[1]; 2031 2032 bzero(connp, sizeof (conn_t)); 2033 bzero(udp, sizeof (udp_t)); 2034 2035 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2036 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2037 connp->conn_udp = udp; 2038 connp->conn_flags = IPCL_UDPCONN; 2039 connp->conn_proto = IPPROTO_UDP; 2040 udp->udp_connp = connp; 2041 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2042 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2043 if (connp->conn_ixa == NULL) 2044 return (ENOMEM); 2045 connp->conn_ixa->ixa_refcnt = 1; 2046 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2047 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2048 return (0); 2049 } 2050 2051 /* ARGSUSED */ 2052 static void 2053 udp_conn_destructor(void *buf, void *cdrarg) 2054 { 2055 itc_t *itc = (itc_t *)buf; 2056 conn_t *connp = &itc->itc_conn; 2057 udp_t *udp = (udp_t *)&itc[1]; 2058 2059 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2060 ASSERT(udp->udp_connp == connp); 2061 ASSERT(connp->conn_udp == udp); 2062 mutex_destroy(&connp->conn_lock); 2063 cv_destroy(&connp->conn_cv); 2064 rw_destroy(&connp->conn_ilg_lock); 2065 2066 /* Can be NULL if constructor failed */ 2067 if (connp->conn_ixa != NULL) { 2068 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2069 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2070 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2071 ixa_refrele(connp->conn_ixa); 2072 } 2073 } 2074 2075 /* ARGSUSED */ 2076 static int 2077 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2078 { 2079 itc_t *itc = (itc_t *)buf; 2080 conn_t *connp = &itc->itc_conn; 2081 icmp_t *icmp = (icmp_t *)&itc[1]; 2082 2083 bzero(connp, sizeof (conn_t)); 2084 bzero(icmp, sizeof (icmp_t)); 2085 2086 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2087 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2088 connp->conn_icmp = icmp; 2089 connp->conn_flags = IPCL_RAWIPCONN; 
2090 connp->conn_proto = IPPROTO_ICMP; 2091 icmp->icmp_connp = connp; 2092 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2093 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2094 if (connp->conn_ixa == NULL) 2095 return (ENOMEM); 2096 connp->conn_ixa->ixa_refcnt = 1; 2097 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2098 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2099 return (0); 2100 } 2101 2102 /* ARGSUSED */ 2103 static void 2104 rawip_conn_destructor(void *buf, void *cdrarg) 2105 { 2106 itc_t *itc = (itc_t *)buf; 2107 conn_t *connp = &itc->itc_conn; 2108 icmp_t *icmp = (icmp_t *)&itc[1]; 2109 2110 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2111 ASSERT(icmp->icmp_connp == connp); 2112 ASSERT(connp->conn_icmp == icmp); 2113 mutex_destroy(&connp->conn_lock); 2114 cv_destroy(&connp->conn_cv); 2115 rw_destroy(&connp->conn_ilg_lock); 2116 2117 /* Can be NULL if constructor failed */ 2118 if (connp->conn_ixa != NULL) { 2119 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2120 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2121 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2122 ixa_refrele(connp->conn_ixa); 2123 } 2124 } 2125 2126 /* ARGSUSED */ 2127 static int 2128 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2129 { 2130 itc_t *itc = (itc_t *)buf; 2131 conn_t *connp = &itc->itc_conn; 2132 rts_t *rts = (rts_t *)&itc[1]; 2133 2134 bzero(connp, sizeof (conn_t)); 2135 bzero(rts, sizeof (rts_t)); 2136 2137 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2138 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2139 connp->conn_rts = rts; 2140 connp->conn_flags = IPCL_RTSCONN; 2141 rts->rts_connp = connp; 2142 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2143 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2144 if (connp->conn_ixa == NULL) 2145 return (ENOMEM); 2146 connp->conn_ixa->ixa_refcnt = 1; 2147 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2148 return (0); 2149 } 2150 
2151 /* ARGSUSED */ 2152 static void 2153 rts_conn_destructor(void *buf, void *cdrarg) 2154 { 2155 itc_t *itc = (itc_t *)buf; 2156 conn_t *connp = &itc->itc_conn; 2157 rts_t *rts = (rts_t *)&itc[1]; 2158 2159 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2160 ASSERT(rts->rts_connp == connp); 2161 ASSERT(connp->conn_rts == rts); 2162 mutex_destroy(&connp->conn_lock); 2163 cv_destroy(&connp->conn_cv); 2164 rw_destroy(&connp->conn_ilg_lock); 2165 2166 /* Can be NULL if constructor failed */ 2167 if (connp->conn_ixa != NULL) { 2168 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2169 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2170 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2171 ixa_refrele(connp->conn_ixa); 2172 } 2173 } 2174 2175 /* 2176 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2177 * in the conn_t. 2178 * 2179 * Below we list all the pointers in the conn_t as a documentation aid. 2180 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2181 * If you add any pointers to the conn_t please add an ASSERT here 2182 * and #ifdef it out if it can't be actually asserted to be NULL. 2183 * In any case, we bzero most of the conn_t at the end of the function. 
2184 */ 2185 void 2186 ipcl_conn_cleanup(conn_t *connp) 2187 { 2188 ip_xmit_attr_t *ixa; 2189 2190 ASSERT(connp->conn_latch == NULL); 2191 ASSERT(connp->conn_latch_in_policy == NULL); 2192 ASSERT(connp->conn_latch_in_action == NULL); 2193 #ifdef notdef 2194 ASSERT(connp->conn_rq == NULL); 2195 ASSERT(connp->conn_wq == NULL); 2196 #endif 2197 ASSERT(connp->conn_cred == NULL); 2198 ASSERT(connp->conn_g_fanout == NULL); 2199 ASSERT(connp->conn_g_next == NULL); 2200 ASSERT(connp->conn_g_prev == NULL); 2201 ASSERT(connp->conn_policy == NULL); 2202 ASSERT(connp->conn_fanout == NULL); 2203 ASSERT(connp->conn_next == NULL); 2204 ASSERT(connp->conn_prev == NULL); 2205 ASSERT(connp->conn_oper_pending_ill == NULL); 2206 ASSERT(connp->conn_ilg == NULL); 2207 ASSERT(connp->conn_drain_next == NULL); 2208 ASSERT(connp->conn_drain_prev == NULL); 2209 #ifdef notdef 2210 /* conn_idl is not cleared when removed from idl list */ 2211 ASSERT(connp->conn_idl == NULL); 2212 #endif 2213 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2214 #ifdef notdef 2215 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ 2216 ASSERT(connp->conn_netstack == NULL); 2217 #endif 2218 2219 ASSERT(connp->conn_helper_info == NULL); 2220 ASSERT(connp->conn_ixa != NULL); 2221 ixa = connp->conn_ixa; 2222 ASSERT(ixa->ixa_refcnt == 1); 2223 /* Need to preserve ixa_protocol */ 2224 ixa_cleanup(ixa); 2225 ixa->ixa_flags = 0; 2226 2227 /* Clear out the conn_t fields that are not preserved */ 2228 bzero(&connp->conn_start_clr, 2229 sizeof (conn_t) - 2230 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2231 } 2232 2233 /* 2234 * All conns are inserted in a global multi-list for the benefit of 2235 * walkers. The walk is guaranteed to walk all open conns at the time 2236 * of the start of the walk exactly once. This property is needed to 2237 * achieve some cleanups during unplumb of interfaces. This is achieved 2238 * as follows. 
2239 * 2240 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2241 * call the insert and delete functions below at creation and deletion 2242 * time respectively. The conn never moves or changes its position in this 2243 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2244 * won't increase due to walkers, once the conn deletion has started. Note 2245 * that we can't remove the conn from the global list and then wait for 2246 * the refcnt to drop to zero, since walkers would then see a truncated 2247 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2248 * conns until ip_open is ready to make them globally visible. 2249 * The global round robin multi-list locks are held only to get the 2250 * next member/insertion/deletion and contention should be negligible 2251 * if the multi-list is much greater than the number of cpus. 2252 */ 2253 void 2254 ipcl_globalhash_insert(conn_t *connp) 2255 { 2256 int index; 2257 struct connf_s *connfp; 2258 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2259 2260 /* 2261 * No need for atomic here. Approximate even distribution 2262 * in the global lists is sufficient. 2263 */ 2264 ipst->ips_conn_g_index++; 2265 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2266 2267 connp->conn_g_prev = NULL; 2268 /* 2269 * Mark as INCIPIENT, so that walkers will ignore this 2270 * for now, till ip_open is ready to make it visible globally. 
2271 */ 2272 connp->conn_state_flags |= CONN_INCIPIENT; 2273 2274 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2275 /* Insert at the head of the list */ 2276 mutex_enter(&connfp->connf_lock); 2277 connp->conn_g_next = connfp->connf_head; 2278 if (connp->conn_g_next != NULL) 2279 connp->conn_g_next->conn_g_prev = connp; 2280 connfp->connf_head = connp; 2281 2282 /* The fanout bucket this conn points to */ 2283 connp->conn_g_fanout = connfp; 2284 2285 mutex_exit(&connfp->connf_lock); 2286 } 2287 2288 void 2289 ipcl_globalhash_remove(conn_t *connp) 2290 { 2291 struct connf_s *connfp; 2292 2293 /* 2294 * We were never inserted in the global multi list. 2295 * IPCL_NONE variety is never inserted in the global multilist 2296 * since it is presumed to not need any cleanup and is transient. 2297 */ 2298 if (connp->conn_g_fanout == NULL) 2299 return; 2300 2301 connfp = connp->conn_g_fanout; 2302 mutex_enter(&connfp->connf_lock); 2303 if (connp->conn_g_prev != NULL) 2304 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2305 else 2306 connfp->connf_head = connp->conn_g_next; 2307 if (connp->conn_g_next != NULL) 2308 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2309 mutex_exit(&connfp->connf_lock); 2310 2311 /* Better to stumble on a null pointer than to corrupt memory */ 2312 connp->conn_g_next = NULL; 2313 connp->conn_g_prev = NULL; 2314 connp->conn_g_fanout = NULL; 2315 } 2316 2317 /* 2318 * Walk the list of all conn_t's in the system, calling the function provided 2319 * With the specified argument for each. 2320 * Applies to both IPv4 and IPv6. 2321 * 2322 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2323 * conn_oper_pending_ill). To guard against stale pointers 2324 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2325 * unplumbed or removed. New conn_t's that are created while we are walking 2326 * may be missed by this walk, because they are not necessarily inserted 2327 * at the tail of the list. 
They are new conn_t's and thus don't have any 2328 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2329 * is created to the struct that is going away. 2330 */ 2331 void 2332 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2333 { 2334 int i; 2335 conn_t *connp; 2336 conn_t *prev_connp; 2337 2338 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2339 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2340 prev_connp = NULL; 2341 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2342 while (connp != NULL) { 2343 mutex_enter(&connp->conn_lock); 2344 if (connp->conn_state_flags & 2345 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2346 mutex_exit(&connp->conn_lock); 2347 connp = connp->conn_g_next; 2348 continue; 2349 } 2350 CONN_INC_REF_LOCKED(connp); 2351 mutex_exit(&connp->conn_lock); 2352 mutex_exit( 2353 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2354 (*func)(connp, arg); 2355 if (prev_connp != NULL) 2356 CONN_DEC_REF(prev_connp); 2357 mutex_enter( 2358 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2359 prev_connp = connp; 2360 connp = connp->conn_g_next; 2361 } 2362 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2363 if (prev_connp != NULL) 2364 CONN_DEC_REF(prev_connp); 2365 } 2366 } 2367 2368 /* 2369 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2370 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2371 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2372 * (peer tcp in ESTABLISHED state). 2373 */ 2374 conn_t * 2375 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, 2376 ip_stack_t *ipst) 2377 { 2378 uint32_t ports; 2379 uint16_t *pports = (uint16_t *)&ports; 2380 connf_t *connfp; 2381 conn_t *tconnp; 2382 boolean_t zone_chk; 2383 2384 /* 2385 * If either the source of destination address is loopback, then 2386 * both endpoints must be in the same Zone. 
Otherwise, both of 2387 * the addresses are system-wide unique (tcp is in ESTABLISHED 2388 * state) and the endpoints may reside in different Zones. 2389 */ 2390 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2391 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2392 2393 pports[0] = tcpha->tha_fport; 2394 pports[1] = tcpha->tha_lport; 2395 2396 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2397 ports, ipst)]; 2398 2399 mutex_enter(&connfp->connf_lock); 2400 for (tconnp = connfp->connf_head; tconnp != NULL; 2401 tconnp = tconnp->conn_next) { 2402 2403 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2404 ipha->ipha_dst, ipha->ipha_src, ports) && 2405 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2406 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2407 2408 ASSERT(tconnp != connp); 2409 CONN_INC_REF(tconnp); 2410 mutex_exit(&connfp->connf_lock); 2411 return (tconnp); 2412 } 2413 } 2414 mutex_exit(&connfp->connf_lock); 2415 return (NULL); 2416 } 2417 2418 /* 2419 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2420 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2421 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2422 * (peer tcp in ESTABLISHED state). 2423 */ 2424 conn_t * 2425 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, 2426 ip_stack_t *ipst) 2427 { 2428 uint32_t ports; 2429 uint16_t *pports = (uint16_t *)&ports; 2430 connf_t *connfp; 2431 conn_t *tconnp; 2432 boolean_t zone_chk; 2433 2434 /* 2435 * If either the source of destination address is loopback, then 2436 * both endpoints must be in the same Zone. Otherwise, both of 2437 * the addresses are system-wide unique (tcp is in ESTABLISHED 2438 * state) and the endpoints may reside in different Zones. 
We 2439 * don't do Zone check for link local address(es) because the 2440 * current Zone implementation treats each link local address as 2441 * being unique per system node, i.e. they belong to global Zone. 2442 */ 2443 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2444 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2445 2446 pports[0] = tcpha->tha_fport; 2447 pports[1] = tcpha->tha_lport; 2448 2449 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2450 ports, ipst)]; 2451 2452 mutex_enter(&connfp->connf_lock); 2453 for (tconnp = connfp->connf_head; tconnp != NULL; 2454 tconnp = tconnp->conn_next) { 2455 2456 /* We skip conn_bound_if check here as this is loopback tcp */ 2457 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2458 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2459 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2460 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2461 2462 ASSERT(tconnp != connp); 2463 CONN_INC_REF(tconnp); 2464 mutex_exit(&connfp->connf_lock); 2465 return (tconnp); 2466 } 2467 } 2468 mutex_exit(&connfp->connf_lock); 2469 return (NULL); 2470 } 2471 2472 /* 2473 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2474 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2475 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2476 */ 2477 conn_t * 2478 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, 2479 ip_stack_t *ipst) 2480 { 2481 uint32_t ports; 2482 uint16_t *pports; 2483 connf_t *connfp; 2484 conn_t *tconnp; 2485 2486 pports = (uint16_t *)&ports; 2487 pports[0] = tcpha->tha_fport; 2488 pports[1] = tcpha->tha_lport; 2489 2490 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2491 ports, ipst)]; 2492 2493 mutex_enter(&connfp->connf_lock); 2494 for (tconnp = connfp->connf_head; tconnp != NULL; 2495 tconnp = tconnp->conn_next) { 2496 2497 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2498 ipha->ipha_dst, ipha->ipha_src, ports) && 2499 tconnp->conn_tcp->tcp_state >= min_state) { 2500 2501 CONN_INC_REF(tconnp); 2502 mutex_exit(&connfp->connf_lock); 2503 return (tconnp); 2504 } 2505 } 2506 mutex_exit(&connfp->connf_lock); 2507 return (NULL); 2508 } 2509 2510 /* 2511 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2512 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2513 * Only checks for connected entries i.e. no INADDR_ANY checks. 2514 * Match on ifindex in addition to addresses. 
2515 */ 2516 conn_t * 2517 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2518 uint_t ifindex, ip_stack_t *ipst) 2519 { 2520 tcp_t *tcp; 2521 uint32_t ports; 2522 uint16_t *pports; 2523 connf_t *connfp; 2524 conn_t *tconnp; 2525 2526 pports = (uint16_t *)&ports; 2527 pports[0] = tcpha->tha_fport; 2528 pports[1] = tcpha->tha_lport; 2529 2530 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2531 ports, ipst)]; 2532 2533 mutex_enter(&connfp->connf_lock); 2534 for (tconnp = connfp->connf_head; tconnp != NULL; 2535 tconnp = tconnp->conn_next) { 2536 2537 tcp = tconnp->conn_tcp; 2538 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2539 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2540 tcp->tcp_state >= min_state && 2541 (tconnp->conn_bound_if == 0 || 2542 tconnp->conn_bound_if == ifindex)) { 2543 2544 CONN_INC_REF(tconnp); 2545 mutex_exit(&connfp->connf_lock); 2546 return (tconnp); 2547 } 2548 } 2549 mutex_exit(&connfp->connf_lock); 2550 return (NULL); 2551 } 2552 2553 /* 2554 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2555 * a listener when changing state. 2556 */ 2557 conn_t * 2558 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2559 ip_stack_t *ipst) 2560 { 2561 connf_t *bind_connfp; 2562 conn_t *connp; 2563 tcp_t *tcp; 2564 2565 /* 2566 * Avoid false matches for packets sent to an IP destination of 2567 * all zeros. 
2568 */ 2569 if (laddr == 0) 2570 return (NULL); 2571 2572 ASSERT(zoneid != ALL_ZONES); 2573 2574 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2575 mutex_enter(&bind_connfp->connf_lock); 2576 for (connp = bind_connfp->connf_head; connp != NULL; 2577 connp = connp->conn_next) { 2578 tcp = connp->conn_tcp; 2579 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2580 IPCL_ZONE_MATCH(connp, zoneid) && 2581 (tcp->tcp_listener == NULL)) { 2582 CONN_INC_REF(connp); 2583 mutex_exit(&bind_connfp->connf_lock); 2584 return (connp); 2585 } 2586 } 2587 mutex_exit(&bind_connfp->connf_lock); 2588 return (NULL); 2589 } 2590 2591 /* 2592 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2593 * a listener when changing state. 2594 */ 2595 conn_t * 2596 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2597 zoneid_t zoneid, ip_stack_t *ipst) 2598 { 2599 connf_t *bind_connfp; 2600 conn_t *connp = NULL; 2601 tcp_t *tcp; 2602 2603 /* 2604 * Avoid false matches for packets sent to an IP destination of 2605 * all zeros. 2606 */ 2607 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2608 return (NULL); 2609 2610 ASSERT(zoneid != ALL_ZONES); 2611 2612 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2613 mutex_enter(&bind_connfp->connf_lock); 2614 for (connp = bind_connfp->connf_head; connp != NULL; 2615 connp = connp->conn_next) { 2616 tcp = connp->conn_tcp; 2617 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2618 IPCL_ZONE_MATCH(connp, zoneid) && 2619 (connp->conn_bound_if == 0 || 2620 connp->conn_bound_if == ifindex) && 2621 tcp->tcp_listener == NULL) { 2622 CONN_INC_REF(connp); 2623 mutex_exit(&bind_connfp->connf_lock); 2624 return (connp); 2625 } 2626 } 2627 mutex_exit(&bind_connfp->connf_lock); 2628 return (NULL); 2629 } 2630 2631 /* 2632 * ipcl_get_next_conn 2633 * get the next entry in the conn global list 2634 * and put a reference on the next_conn. 
2635 * decrement the reference on the current conn. 2636 * 2637 * This is an iterator based walker function that also provides for 2638 * some selection by the caller. It walks through the conn_hash bucket 2639 * searching for the next valid connp in the list, and selects connections 2640 * that are neither closed nor condemned. It also REFHOLDS the conn 2641 * thus ensuring that the conn exists when the caller uses the conn. 2642 */ 2643 conn_t * 2644 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2645 { 2646 conn_t *next_connp; 2647 2648 if (connfp == NULL) 2649 return (NULL); 2650 2651 mutex_enter(&connfp->connf_lock); 2652 2653 next_connp = (connp == NULL) ? 2654 connfp->connf_head : connp->conn_g_next; 2655 2656 while (next_connp != NULL) { 2657 mutex_enter(&next_connp->conn_lock); 2658 if (!(next_connp->conn_flags & conn_flags) || 2659 (next_connp->conn_state_flags & 2660 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2661 /* 2662 * This conn has been condemned or 2663 * is closing, or the flags don't match 2664 */ 2665 mutex_exit(&next_connp->conn_lock); 2666 next_connp = next_connp->conn_g_next; 2667 continue; 2668 } 2669 CONN_INC_REF_LOCKED(next_connp); 2670 mutex_exit(&next_connp->conn_lock); 2671 break; 2672 } 2673 2674 mutex_exit(&connfp->connf_lock); 2675 2676 if (connp != NULL) 2677 CONN_DEC_REF(connp); 2678 2679 return (next_connp); 2680 } 2681 2682 #ifdef CONN_DEBUG 2683 /* 2684 * Trace of the last NBUF refhold/refrele 2685 */ 2686 int 2687 conn_trace_ref(conn_t *connp) 2688 { 2689 int last; 2690 conn_trace_t *ctb; 2691 2692 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2693 last = connp->conn_trace_last; 2694 last++; 2695 if (last == CONN_TRACE_MAX) 2696 last = 0; 2697 2698 ctb = &connp->conn_trace_buf[last]; 2699 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2700 connp->conn_trace_last = last; 2701 return (1); 2702 } 2703 2704 int 2705 conn_untrace_ref(conn_t *connp) 2706 { 2707 int last; 2708 conn_trace_t *ctb; 2709 2710 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2711 last = connp->conn_trace_last; 2712 last++; 2713 if (last == CONN_TRACE_MAX) 2714 last = 0; 2715 2716 ctb = &connp->conn_trace_buf[last]; 2717 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2718 connp->conn_trace_last = last; 2719 return (1); 2720 } 2721 #endif 2722