/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * IP PACKET CLASSIFIER
 *
 * The IP packet classifier provides mapping between IP packets and persistent
 * connection state for connection-oriented protocols. It also provides
 * an interface for managing connection states.
 *
 * The connection state is kept in the conn_t data structure and contains,
 * among other things:
 *
 *	o local/remote address and ports
 *	o Transport protocol
 *	o squeue for the connection (for TCP only)
 *	o reference counter
 *	o Connection state
 *	o hash table linkage
 *	o interface/ire information
 *	o credentials
 *	o ipsec policy
 *	o send and receive functions.
 *	o mutex lock.
 *
 * Connections use a reference counting scheme. They are freed when the
 * reference counter drops to zero.
 * A reference is incremented when the connection is placed in a list or
 * table, when an incoming packet for the connection arrives and when the
 * connection is processed via squeue (squeue processing may be asynchronous
 * and the reference protects the connection from being destroyed before its
 * processing is finished).
 *
 * conn_recv is used to pass up packets to the ULP.
 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
 * a listener, and changes to tcp_input_listener as the listener has picked a
 * good squeue. For other cases it is set to tcp_input_data.
 *
 * conn_recvicmp is used to pass up ICMP errors to the ULP.
 *
 * Classifier uses several hash tables:
 *
 *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
 *	ipcl_bind_fanout:	contains all connections in BOUND state
 *	ipcl_proto_fanout_v4:	IPv4 protocol fanout
 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
 *	ipcl_udp_fanout:	contains all UDP connections
 *	ipcl_iptun_fanout:	contains all IP tunnel connections
 *	ipcl_globalhash_fanout:	contains all connections
 *
 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
 * which need to view all existing connections.
 *
 * All tables are protected by per-bucket locks. When both per-bucket lock and
 * connection lock need to be held, the per-bucket lock should be acquired
 * first, followed by the connection lock.
 *
 * All functions doing search in one of these tables increment a reference
 * counter on the connection found (if any). This reference should be dropped
 * when the caller has finished processing the connection.
 *
 *
 * INTERFACES:
 * ===========
 *
 * Connection Lookup:
 * ------------------
 *
 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
 *
 *	Finds connection for an incoming IPv4 or IPv6 packet.
 *	Returns NULL if it can't find any associated connection. If the
 *	connection is found, its reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP
 *		 header in the packet.
 *
 *	ira->ira_zoneid: The zone in which the returned connection must be; the
 *		zoneid corresponding to the ire_zoneid on the IRE located for
 *		the packet's destination address.
 *
 *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 *		IRAF_TX_SHARED_ADDR flags
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment.  This is used only in a
 * labeled environment.  The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone.  Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.
 * For fully bound TCP connections, we can assume that the packet label was
 * checked during connection establishment, and doesn't need to be checked on
 * each packet.  For others, though, we need to check for strict equality or,
 * for multilevel ports, membership in the range or set.  This part currently
 * does a tnrh lookup on each packet, but could be optimized to use cached
 * results if that were necessary.  (SCTP doesn't come through here, but if it
 * did, we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp);
 * int ipcl_conn_insert_v4(connp);
 * int ipcl_conn_insert_v6(connp);
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp);
 * int ipcl_bind_insert_v4(connp);
 * int ipcl_bind_insert_v6(connp);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
222 * IPCL_TCPCONN indicates a TCP connection 223 * IPCL_SCTPCONN indicates a SCTP connection 224 * IPCL_UDPCONN indicates a UDP conn_t. 225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 226 * IPCL_RTSCONN indicates a RTS conn_t. 227 * IPCL_IPCCONN indicates all other connections. 228 * 229 * void ipcl_conn_destroy(connp) 230 * 231 * Destroys the connection state, removes it from the global 232 * connection hash table and frees its memory. 233 */ 234 235 #include <sys/types.h> 236 #include <sys/stream.h> 237 #include <sys/stropts.h> 238 #include <sys/sysmacros.h> 239 #include <sys/strsubr.h> 240 #include <sys/strsun.h> 241 #define _SUN_TPI_VERSION 2 242 #include <sys/ddi.h> 243 #include <sys/cmn_err.h> 244 #include <sys/debug.h> 245 246 #include <sys/systm.h> 247 #include <sys/param.h> 248 #include <sys/kmem.h> 249 #include <sys/isa_defs.h> 250 #include <inet/common.h> 251 #include <netinet/ip6.h> 252 #include <netinet/icmp6.h> 253 254 #include <inet/ip.h> 255 #include <inet/ip_if.h> 256 #include <inet/ip_ire.h> 257 #include <inet/ip6.h> 258 #include <inet/ip_ndp.h> 259 #include <inet/ip_impl.h> 260 #include <inet/udp_impl.h> 261 #include <inet/sctp_ip.h> 262 #include <inet/sctp/sctp_impl.h> 263 #include <inet/rawip_impl.h> 264 #include <inet/rts_impl.h> 265 #include <inet/iptun/iptun_impl.h> 266 267 #include <sys/cpuvar.h> 268 269 #include <inet/ipclassifier.h> 270 #include <inet/tcp.h> 271 #include <inet/ipsec_impl.h> 272 273 #include <sys/tsol/tnet.h> 274 #include <sys/sockio.h> 275 276 /* Old value for compatibility. Setable in /etc/system */ 277 uint_t tcp_conn_hash_size = 0; 278 279 /* New value. Zero means choose automatically. Setable in /etc/system */ 280 uint_t ipcl_conn_hash_size = 0; 281 uint_t ipcl_conn_hash_memfactor = 8192; 282 uint_t ipcl_conn_hash_maxsize = 82500; 283 284 /* bind/udp fanout table size */ 285 uint_t ipcl_bind_fanout_size = 512; 286 uint_t ipcl_udp_fanout_size = 16384; 287 288 /* Raw socket fanout size. Must be a power of 2. 
*/ 289 uint_t ipcl_raw_fanout_size = 256; 290 291 /* 292 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 293 * expect that most large deployments would have hundreds of tunnels, and 294 * thousands in the extreme case. 295 */ 296 uint_t ipcl_iptun_fanout_size = 6143; 297 298 /* 299 * Power of 2^N Primes useful for hashing for N of 0-28, 300 * these primes are the nearest prime <= 2^N - 2^(N-2). 301 */ 302 303 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 304 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 305 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 306 50331599, 100663291, 201326557, 0} 307 308 /* 309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 310 * are aligned on cache lines. 311 */ 312 typedef union itc_s { 313 conn_t itc_conn; 314 char itcu_filler[CACHE_ALIGN(conn_s)]; 315 } itc_t; 316 317 struct kmem_cache *tcp_conn_cache; 318 struct kmem_cache *ip_conn_cache; 319 extern struct kmem_cache *sctp_conn_cache; 320 struct kmem_cache *udp_conn_cache; 321 struct kmem_cache *rawip_conn_cache; 322 struct kmem_cache *rts_conn_cache; 323 324 extern void tcp_timermp_free(tcp_t *); 325 extern mblk_t *tcp_timermp_alloc(int); 326 327 static int ip_conn_constructor(void *, void *, int); 328 static void ip_conn_destructor(void *, void *); 329 330 static int tcp_conn_constructor(void *, void *, int); 331 static void tcp_conn_destructor(void *, void *); 332 333 static int udp_conn_constructor(void *, void *, int); 334 static void udp_conn_destructor(void *, void *); 335 336 static int rawip_conn_constructor(void *, void *, int); 337 static void rawip_conn_destructor(void *, void *); 338 339 static int rts_conn_constructor(void *, void *, int); 340 static void rts_conn_destructor(void *, void *); 341 342 /* 343 * Global (for all stack instances) init routine 344 */ 345 void 346 ipcl_g_init(void) 347 { 348 ip_conn_cache = kmem_cache_create("ip_conn_cache", 349 sizeof (conn_t), 
CACHE_ALIGN_SIZE, 350 ip_conn_constructor, ip_conn_destructor, 351 NULL, NULL, NULL, 0); 352 353 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 354 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 355 tcp_conn_constructor, tcp_conn_destructor, 356 tcp_conn_reclaim, NULL, NULL, 0); 357 358 udp_conn_cache = kmem_cache_create("udp_conn_cache", 359 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 360 udp_conn_constructor, udp_conn_destructor, 361 NULL, NULL, NULL, 0); 362 363 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 364 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 365 rawip_conn_constructor, rawip_conn_destructor, 366 NULL, NULL, NULL, 0); 367 368 rts_conn_cache = kmem_cache_create("rts_conn_cache", 369 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 370 rts_conn_constructor, rts_conn_destructor, 371 NULL, NULL, NULL, 0); 372 } 373 374 /* 375 * ipclassifier intialization routine, sets up hash tables. 376 */ 377 void 378 ipcl_init(ip_stack_t *ipst) 379 { 380 int i; 381 int sizes[] = P2Ps(); 382 383 /* 384 * Calculate size of conn fanout table from /etc/system settings 385 */ 386 if (ipcl_conn_hash_size != 0) { 387 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 388 } else if (tcp_conn_hash_size != 0) { 389 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 390 } else { 391 extern pgcnt_t freemem; 392 393 ipst->ips_ipcl_conn_fanout_size = 394 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 395 396 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 397 ipst->ips_ipcl_conn_fanout_size = 398 ipcl_conn_hash_maxsize; 399 } 400 } 401 402 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 403 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 404 break; 405 } 406 } 407 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 408 /* Out of range, use the 2^16 value */ 409 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 410 } 411 412 /* Take values from /etc/system */ 413 ipst->ips_ipcl_bind_fanout_size = 
ipcl_bind_fanout_size; 414 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 415 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 416 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 417 418 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 419 420 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 421 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 422 423 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 424 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 425 MUTEX_DEFAULT, NULL); 426 } 427 428 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 429 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 430 431 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 432 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 433 MUTEX_DEFAULT, NULL); 434 } 435 436 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 437 sizeof (connf_t), KM_SLEEP); 438 for (i = 0; i < IPPROTO_MAX; i++) { 439 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 440 MUTEX_DEFAULT, NULL); 441 } 442 443 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 444 sizeof (connf_t), KM_SLEEP); 445 for (i = 0; i < IPPROTO_MAX; i++) { 446 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 447 MUTEX_DEFAULT, NULL); 448 } 449 450 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 451 mutex_init(&ipst->ips_rts_clients->connf_lock, 452 NULL, MUTEX_DEFAULT, NULL); 453 454 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 455 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 456 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 457 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 458 MUTEX_DEFAULT, NULL); 459 } 460 461 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 462 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 463 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 464 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 465 MUTEX_DEFAULT, NULL); 466 } 467 468 
ipst->ips_ipcl_raw_fanout = kmem_zalloc( 469 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 470 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 471 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 472 MUTEX_DEFAULT, NULL); 473 } 474 475 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 476 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 477 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 478 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 479 NULL, MUTEX_DEFAULT, NULL); 480 } 481 } 482 483 void 484 ipcl_g_destroy(void) 485 { 486 kmem_cache_destroy(ip_conn_cache); 487 kmem_cache_destroy(tcp_conn_cache); 488 kmem_cache_destroy(udp_conn_cache); 489 kmem_cache_destroy(rawip_conn_cache); 490 kmem_cache_destroy(rts_conn_cache); 491 } 492 493 /* 494 * All user-level and kernel use of the stack must be gone 495 * by now. 496 */ 497 void 498 ipcl_destroy(ip_stack_t *ipst) 499 { 500 int i; 501 502 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 503 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 504 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 505 } 506 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 507 sizeof (connf_t)); 508 ipst->ips_ipcl_conn_fanout = NULL; 509 510 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 511 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 512 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 513 } 514 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 515 sizeof (connf_t)); 516 ipst->ips_ipcl_bind_fanout = NULL; 517 518 for (i = 0; i < IPPROTO_MAX; i++) { 519 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 520 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 521 } 522 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 523 IPPROTO_MAX * sizeof (connf_t)); 524 ipst->ips_ipcl_proto_fanout_v4 = NULL; 525 526 for (i = 0; i < IPPROTO_MAX; i++) { 527 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head 
== NULL); 528 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 529 } 530 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 531 IPPROTO_MAX * sizeof (connf_t)); 532 ipst->ips_ipcl_proto_fanout_v6 = NULL; 533 534 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 535 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 536 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 537 } 538 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 539 sizeof (connf_t)); 540 ipst->ips_ipcl_udp_fanout = NULL; 541 542 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 543 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 544 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 545 } 546 kmem_free(ipst->ips_ipcl_iptun_fanout, 547 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 548 ipst->ips_ipcl_iptun_fanout = NULL; 549 550 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 551 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 552 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 553 } 554 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 555 sizeof (connf_t)); 556 ipst->ips_ipcl_raw_fanout = NULL; 557 558 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 559 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 560 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 561 } 562 kmem_free(ipst->ips_ipcl_globalhash_fanout, 563 sizeof (connf_t) * CONN_G_HASH_SIZE); 564 ipst->ips_ipcl_globalhash_fanout = NULL; 565 566 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 567 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 568 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 569 ipst->ips_rts_clients = NULL; 570 } 571 572 /* 573 * conn creation routine. initialize the conn, sets the reference 574 * and inserts it in the global hash table. 
575 */ 576 conn_t * 577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 578 { 579 conn_t *connp; 580 struct kmem_cache *conn_cache; 581 582 switch (type) { 583 case IPCL_SCTPCONN: 584 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 585 return (NULL); 586 sctp_conn_init(connp); 587 netstack_hold(ns); 588 connp->conn_netstack = ns; 589 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 590 connp->conn_ixa->ixa_conn_id = (long)connp; 591 ipcl_globalhash_insert(connp); 592 return (connp); 593 594 case IPCL_TCPCONN: 595 conn_cache = tcp_conn_cache; 596 break; 597 598 case IPCL_UDPCONN: 599 conn_cache = udp_conn_cache; 600 break; 601 602 case IPCL_RAWIPCONN: 603 conn_cache = rawip_conn_cache; 604 break; 605 606 case IPCL_RTSCONN: 607 conn_cache = rts_conn_cache; 608 break; 609 610 case IPCL_IPCCONN: 611 conn_cache = ip_conn_cache; 612 break; 613 614 default: 615 connp = NULL; 616 ASSERT(0); 617 } 618 619 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 620 return (NULL); 621 622 connp->conn_ref = 1; 623 netstack_hold(ns); 624 connp->conn_netstack = ns; 625 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 626 connp->conn_ixa->ixa_conn_id = (long)connp; 627 ipcl_globalhash_insert(connp); 628 return (connp); 629 } 630 631 void 632 ipcl_conn_destroy(conn_t *connp) 633 { 634 mblk_t *mp; 635 netstack_t *ns = connp->conn_netstack; 636 637 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 638 ASSERT(connp->conn_ref == 0); 639 ASSERT(connp->conn_ioctlref == 0); 640 641 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 642 643 if (connp->conn_cred != NULL) { 644 crfree(connp->conn_cred); 645 connp->conn_cred = NULL; 646 /* ixa_cred done in ipcl_conn_cleanup below */ 647 } 648 649 if (connp->conn_ht_iphc != NULL) { 650 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 651 connp->conn_ht_iphc = NULL; 652 connp->conn_ht_iphc_allocated = 0; 653 connp->conn_ht_iphc_len = 0; 654 connp->conn_ht_ulp = NULL; 655 connp->conn_ht_ulp_len = 0; 656 } 657 
ip_pkt_free(&connp->conn_xmit_ipp); 658 659 ipcl_globalhash_remove(connp); 660 661 if (connp->conn_latch != NULL) { 662 IPLATCH_REFRELE(connp->conn_latch); 663 connp->conn_latch = NULL; 664 } 665 if (connp->conn_latch_in_policy != NULL) { 666 IPPOL_REFRELE(connp->conn_latch_in_policy); 667 connp->conn_latch_in_policy = NULL; 668 } 669 if (connp->conn_latch_in_action != NULL) { 670 IPACT_REFRELE(connp->conn_latch_in_action); 671 connp->conn_latch_in_action = NULL; 672 } 673 if (connp->conn_policy != NULL) { 674 IPPH_REFRELE(connp->conn_policy, ns); 675 connp->conn_policy = NULL; 676 } 677 678 if (connp->conn_ipsec_opt_mp != NULL) { 679 freemsg(connp->conn_ipsec_opt_mp); 680 connp->conn_ipsec_opt_mp = NULL; 681 } 682 683 if (connp->conn_flags & IPCL_TCPCONN) { 684 tcp_t *tcp = connp->conn_tcp; 685 686 tcp_free(tcp); 687 mp = tcp->tcp_timercache; 688 689 tcp->tcp_tcps = NULL; 690 691 /* 692 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 693 * the mblk. 694 */ 695 if (tcp->tcp_rsrv_mp != NULL) { 696 freeb(tcp->tcp_rsrv_mp); 697 tcp->tcp_rsrv_mp = NULL; 698 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 699 } 700 701 ipcl_conn_cleanup(connp); 702 connp->conn_flags = IPCL_TCPCONN; 703 if (ns != NULL) { 704 ASSERT(tcp->tcp_tcps == NULL); 705 connp->conn_netstack = NULL; 706 connp->conn_ixa->ixa_ipst = NULL; 707 netstack_rele(ns); 708 } 709 710 bzero(tcp, sizeof (tcp_t)); 711 712 tcp->tcp_timercache = mp; 713 tcp->tcp_connp = connp; 714 kmem_cache_free(tcp_conn_cache, connp); 715 return; 716 } 717 718 if (connp->conn_flags & IPCL_SCTPCONN) { 719 ASSERT(ns != NULL); 720 sctp_free(connp); 721 return; 722 } 723 724 ipcl_conn_cleanup(connp); 725 if (ns != NULL) { 726 connp->conn_netstack = NULL; 727 connp->conn_ixa->ixa_ipst = NULL; 728 netstack_rele(ns); 729 } 730 731 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 732 if (connp->conn_flags & IPCL_UDPCONN) { 733 connp->conn_flags = IPCL_UDPCONN; 734 kmem_cache_free(udp_conn_cache, connp); 735 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 736 connp->conn_flags = IPCL_RAWIPCONN; 737 connp->conn_proto = IPPROTO_ICMP; 738 connp->conn_ixa->ixa_protocol = connp->conn_proto; 739 kmem_cache_free(rawip_conn_cache, connp); 740 } else if (connp->conn_flags & IPCL_RTSCONN) { 741 connp->conn_flags = IPCL_RTSCONN; 742 kmem_cache_free(rts_conn_cache, connp); 743 } else { 744 connp->conn_flags = IPCL_IPCCONN; 745 ASSERT(connp->conn_flags & IPCL_IPCCONN); 746 ASSERT(connp->conn_priv == NULL); 747 kmem_cache_free(ip_conn_cache, connp); 748 } 749 } 750 751 /* 752 * Running in cluster mode - deregister listener information 753 */ 754 static void 755 ipcl_conn_unlisten(conn_t *connp) 756 { 757 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 758 ASSERT(connp->conn_lport != 0); 759 760 if (cl_inet_unlisten != NULL) { 761 sa_family_t addr_family; 762 uint8_t *laddrp; 763 764 if (connp->conn_ipversion == IPV6_VERSION) { 765 addr_family = AF_INET6; 766 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 767 } else { 768 addr_family = AF_INET; 769 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 770 } 771 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 772 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 773 } 774 connp->conn_flags &= ~IPCL_CL_LISTENER; 775 } 776 777 /* 778 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 779 * which table the conn belonged to). So for debugging we can see which hash 780 * table this connection was in. 
781 */ 782 #define IPCL_HASH_REMOVE(connp) { \ 783 connf_t *connfp = (connp)->conn_fanout; \ 784 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 785 if (connfp != NULL) { \ 786 mutex_enter(&connfp->connf_lock); \ 787 if ((connp)->conn_next != NULL) \ 788 (connp)->conn_next->conn_prev = \ 789 (connp)->conn_prev; \ 790 if ((connp)->conn_prev != NULL) \ 791 (connp)->conn_prev->conn_next = \ 792 (connp)->conn_next; \ 793 else \ 794 connfp->connf_head = (connp)->conn_next; \ 795 (connp)->conn_fanout = NULL; \ 796 (connp)->conn_next = NULL; \ 797 (connp)->conn_prev = NULL; \ 798 (connp)->conn_flags |= IPCL_REMOVED; \ 799 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 800 ipcl_conn_unlisten((connp)); \ 801 CONN_DEC_REF((connp)); \ 802 mutex_exit(&connfp->connf_lock); \ 803 } \ 804 } 805 806 void 807 ipcl_hash_remove(conn_t *connp) 808 { 809 uint8_t protocol = connp->conn_proto; 810 811 IPCL_HASH_REMOVE(connp); 812 if (protocol == IPPROTO_RSVP) 813 ill_set_inputfn_all(connp->conn_netstack->netstack_ip); 814 } 815 816 /* 817 * The whole purpose of this function is allow removal of 818 * a conn_t from the connected hash for timewait reclaim. 819 * This is essentially a TW reclaim fastpath where timewait 820 * collector checks under fanout lock (so no one else can 821 * get access to the conn_t) that refcnt is 2 i.e. one for 822 * TCP and one for the classifier hash list. If ref count 823 * is indeed 2, we can just remove the conn under lock and 824 * avoid cleaning up the conn under squeue. This gives us 825 * improved performance. 
826 */ 827 void 828 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 829 { 830 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 831 ASSERT(MUTEX_HELD(&connp->conn_lock)); 832 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 833 834 if ((connp)->conn_next != NULL) { 835 (connp)->conn_next->conn_prev = (connp)->conn_prev; 836 } 837 if ((connp)->conn_prev != NULL) { 838 (connp)->conn_prev->conn_next = (connp)->conn_next; 839 } else { 840 connfp->connf_head = (connp)->conn_next; 841 } 842 (connp)->conn_fanout = NULL; 843 (connp)->conn_next = NULL; 844 (connp)->conn_prev = NULL; 845 (connp)->conn_flags |= IPCL_REMOVED; 846 ASSERT((connp)->conn_ref == 2); 847 (connp)->conn_ref--; 848 } 849 850 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 851 ASSERT((connp)->conn_fanout == NULL); \ 852 ASSERT((connp)->conn_next == NULL); \ 853 ASSERT((connp)->conn_prev == NULL); \ 854 if ((connfp)->connf_head != NULL) { \ 855 (connfp)->connf_head->conn_prev = (connp); \ 856 (connp)->conn_next = (connfp)->connf_head; \ 857 } \ 858 (connp)->conn_fanout = (connfp); \ 859 (connfp)->connf_head = (connp); \ 860 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 861 IPCL_CONNECTED; \ 862 CONN_INC_REF(connp); \ 863 } 864 865 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 866 IPCL_HASH_REMOVE((connp)); \ 867 mutex_enter(&(connfp)->connf_lock); \ 868 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 869 mutex_exit(&(connfp)->connf_lock); \ 870 } 871 872 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 873 conn_t *pconnp = NULL, *nconnp; \ 874 IPCL_HASH_REMOVE((connp)); \ 875 mutex_enter(&(connfp)->connf_lock); \ 876 nconnp = (connfp)->connf_head; \ 877 while (nconnp != NULL && \ 878 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 879 pconnp = nconnp; \ 880 nconnp = nconnp->conn_next; \ 881 } \ 882 if (pconnp != NULL) { \ 883 pconnp->conn_next = (connp); \ 884 (connp)->conn_prev = pconnp; \ 885 } else { \ 886 (connfp)->connf_head = (connp); \ 887 } \ 888 if 
(nconnp != NULL) { \ 889 (connp)->conn_next = nconnp; \ 890 nconnp->conn_prev = (connp); \ 891 } \ 892 (connp)->conn_fanout = (connfp); \ 893 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 894 IPCL_BOUND; \ 895 CONN_INC_REF(connp); \ 896 mutex_exit(&(connfp)->connf_lock); \ 897 } 898 899 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 900 conn_t **list, *prev, *next; \ 901 boolean_t isv4mapped = \ 902 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 903 IPCL_HASH_REMOVE((connp)); \ 904 mutex_enter(&(connfp)->connf_lock); \ 905 list = &(connfp)->connf_head; \ 906 prev = NULL; \ 907 while ((next = *list) != NULL) { \ 908 if (isv4mapped && \ 909 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 910 connp->conn_zoneid == next->conn_zoneid) { \ 911 (connp)->conn_next = next; \ 912 if (prev != NULL) \ 913 prev = next->conn_prev; \ 914 next->conn_prev = (connp); \ 915 break; \ 916 } \ 917 list = &next->conn_next; \ 918 prev = next; \ 919 } \ 920 (connp)->conn_prev = prev; \ 921 *list = (connp); \ 922 (connp)->conn_fanout = (connfp); \ 923 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 924 IPCL_BOUND; \ 925 CONN_INC_REF((connp)); \ 926 mutex_exit(&(connfp)->connf_lock); \ 927 } 928 929 void 930 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 931 { 932 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 933 } 934 935 /* 936 * Because the classifier is used to classify inbound packets, the destination 937 * address is meant to be our local tunnel address (tunnel source), and the 938 * source the remote tunnel address (tunnel destination). 939 * 940 * Note that conn_proto can't be used for fanout since the upper protocol 941 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
942 */ 943 conn_t * 944 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 945 { 946 connf_t *connfp; 947 conn_t *connp; 948 949 /* first look for IPv4 tunnel links */ 950 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 951 mutex_enter(&connfp->connf_lock); 952 for (connp = connfp->connf_head; connp != NULL; 953 connp = connp->conn_next) { 954 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 955 break; 956 } 957 if (connp != NULL) 958 goto done; 959 960 mutex_exit(&connfp->connf_lock); 961 962 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 963 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 964 INADDR_ANY)]; 965 mutex_enter(&connfp->connf_lock); 966 for (connp = connfp->connf_head; connp != NULL; 967 connp = connp->conn_next) { 968 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 969 break; 970 } 971 done: 972 if (connp != NULL) 973 CONN_INC_REF(connp); 974 mutex_exit(&connfp->connf_lock); 975 return (connp); 976 } 977 978 conn_t * 979 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 980 { 981 connf_t *connfp; 982 conn_t *connp; 983 984 /* Look for an IPv6 tunnel link */ 985 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 986 mutex_enter(&connfp->connf_lock); 987 for (connp = connfp->connf_head; connp != NULL; 988 connp = connp->conn_next) { 989 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 990 CONN_INC_REF(connp); 991 break; 992 } 993 } 994 mutex_exit(&connfp->connf_lock); 995 return (connp); 996 } 997 998 /* 999 * This function is used only for inserting SCTP raw socket now. 1000 * This may change later. 1001 * 1002 * Note that only one raw socket can be bound to a port. The param 1003 * lport is in network byte order. 
1004 */ 1005 static int 1006 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1007 { 1008 connf_t *connfp; 1009 conn_t *oconnp; 1010 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1011 1012 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1013 1014 /* Check for existing raw socket already bound to the port. */ 1015 mutex_enter(&connfp->connf_lock); 1016 for (oconnp = connfp->connf_head; oconnp != NULL; 1017 oconnp = oconnp->conn_next) { 1018 if (oconnp->conn_lport == lport && 1019 oconnp->conn_zoneid == connp->conn_zoneid && 1020 oconnp->conn_family == connp->conn_family && 1021 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1022 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || 1023 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || 1024 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || 1025 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, 1026 &connp->conn_laddr_v6))) { 1027 break; 1028 } 1029 } 1030 mutex_exit(&connfp->connf_lock); 1031 if (oconnp != NULL) 1032 return (EADDRNOTAVAIL); 1033 1034 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || 1035 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1036 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1037 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { 1038 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1039 } else { 1040 IPCL_HASH_INSERT_BOUND(connfp, connp); 1041 } 1042 } else { 1043 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1044 } 1045 return (0); 1046 } 1047 1048 static int 1049 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) 1050 { 1051 connf_t *connfp; 1052 conn_t *tconnp; 1053 ipaddr_t laddr = connp->conn_laddr_v4; 1054 ipaddr_t faddr = connp->conn_faddr_v4; 1055 1056 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; 1057 mutex_enter(&connfp->connf_lock); 1058 for (tconnp = connfp->connf_head; tconnp != NULL; 1059 tconnp = tconnp->conn_next) { 1060 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { 1061 /* A tunnel is already bound to 
these addresses. */ 1062 mutex_exit(&connfp->connf_lock); 1063 return (EADDRINUSE); 1064 } 1065 } 1066 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1067 mutex_exit(&connfp->connf_lock); 1068 return (0); 1069 } 1070 1071 static int 1072 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) 1073 { 1074 connf_t *connfp; 1075 conn_t *tconnp; 1076 in6_addr_t *laddr = &connp->conn_laddr_v6; 1077 in6_addr_t *faddr = &connp->conn_faddr_v6; 1078 1079 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; 1080 mutex_enter(&connfp->connf_lock); 1081 for (tconnp = connfp->connf_head; tconnp != NULL; 1082 tconnp = tconnp->conn_next) { 1083 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { 1084 /* A tunnel is already bound to these addresses. */ 1085 mutex_exit(&connfp->connf_lock); 1086 return (EADDRINUSE); 1087 } 1088 } 1089 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1090 mutex_exit(&connfp->connf_lock); 1091 return (0); 1092 } 1093 1094 /* 1095 * Check for a MAC exemption conflict on a labeled system. Note that for 1096 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1097 * transport layer. This check is for binding all other protocols. 1098 * 1099 * Returns true if there's a conflict. 
1100 */ 1101 static boolean_t 1102 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1103 { 1104 connf_t *connfp; 1105 conn_t *tconn; 1106 1107 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1108 mutex_enter(&connfp->connf_lock); 1109 for (tconn = connfp->connf_head; tconn != NULL; 1110 tconn = tconn->conn_next) { 1111 /* We don't allow v4 fallback for v6 raw socket */ 1112 if (connp->conn_family != tconn->conn_family) 1113 continue; 1114 /* If neither is exempt, then there's no conflict */ 1115 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1116 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1117 continue; 1118 /* We are only concerned about sockets for a different zone */ 1119 if (connp->conn_zoneid == tconn->conn_zoneid) 1120 continue; 1121 /* If both are bound to different specific addrs, ok */ 1122 if (connp->conn_laddr_v4 != INADDR_ANY && 1123 tconn->conn_laddr_v4 != INADDR_ANY && 1124 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1125 continue; 1126 /* These two conflict; fail */ 1127 break; 1128 } 1129 mutex_exit(&connfp->connf_lock); 1130 return (tconn != NULL); 1131 } 1132 1133 static boolean_t 1134 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1135 { 1136 connf_t *connfp; 1137 conn_t *tconn; 1138 1139 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1140 mutex_enter(&connfp->connf_lock); 1141 for (tconn = connfp->connf_head; tconn != NULL; 1142 tconn = tconn->conn_next) { 1143 /* We don't allow v4 fallback for v6 raw socket */ 1144 if (connp->conn_family != tconn->conn_family) 1145 continue; 1146 /* If neither is exempt, then there's no conflict */ 1147 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1148 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1149 continue; 1150 /* We are only concerned about sockets for a different zone */ 1151 if (connp->conn_zoneid == tconn->conn_zoneid) 1152 continue; 1153 /* If both are bound to different addrs, ok */ 1154 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1155 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1156 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1157 &tconn->conn_laddr_v6)) 1158 continue; 1159 /* These two conflict; fail */ 1160 break; 1161 } 1162 mutex_exit(&connfp->connf_lock); 1163 return (tconn != NULL); 1164 } 1165 1166 /* 1167 * (v4, v6) bind hash insertion routines 1168 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1169 */ 1170 1171 int 1172 ipcl_bind_insert(conn_t *connp) 1173 { 1174 if (connp->conn_ipversion == IPV6_VERSION) 1175 return (ipcl_bind_insert_v6(connp)); 1176 else 1177 return (ipcl_bind_insert_v4(connp)); 1178 } 1179 1180 int 1181 ipcl_bind_insert_v4(conn_t *connp) 1182 { 1183 connf_t *connfp; 1184 int ret = 0; 1185 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1186 uint16_t lport = connp->conn_lport; 1187 uint8_t protocol = connp->conn_proto; 1188 1189 if (IPCL_IS_IPTUN(connp)) 1190 return (ipcl_iptun_hash_insert(connp, ipst)); 1191 1192 switch (protocol) { 1193 default: 1194 if (is_system_labeled() && 1195 check_exempt_conflict_v4(connp, ipst)) 1196 return (EADDRINUSE); 1197 /* FALLTHROUGH */ 1198 case IPPROTO_UDP: 1199 if (protocol == IPPROTO_UDP) { 1200 connfp = &ipst->ips_ipcl_udp_fanout[ 1201 IPCL_UDP_HASH(lport, ipst)]; 1202 } else { 1203 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1204 } 1205 1206 if (connp->conn_faddr_v4 != INADDR_ANY) { 1207 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1208 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1209 IPCL_HASH_INSERT_BOUND(connfp, connp); 1210 } else { 1211 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1212 } 1213 if (protocol == IPPROTO_RSVP) 1214 ill_set_inputfn_all(ipst); 1215 break; 1216 1217 case IPPROTO_TCP: 1218 /* Insert it in the Bind Hash */ 1219 ASSERT(connp->conn_zoneid != ALL_ZONES); 1220 connfp = &ipst->ips_ipcl_bind_fanout[ 1221 IPCL_BIND_HASH(lport, ipst)]; 1222 if (connp->conn_laddr_v4 != INADDR_ANY) { 1223 IPCL_HASH_INSERT_BOUND(connfp, connp); 1224 } else { 1225 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1226 } 1227 if (cl_inet_listen != NULL) { 1228 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1229 connp->conn_flags |= IPCL_CL_LISTENER; 1230 (*cl_inet_listen)( 1231 connp->conn_netstack->netstack_stackid, 1232 IPPROTO_TCP, AF_INET, 1233 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1234 } 1235 break; 1236 1237 case IPPROTO_SCTP: 1238 ret = ipcl_sctp_hash_insert(connp, lport); 1239 break; 1240 } 1241 1242 return (ret); 1243 } 1244 1245 int 1246 ipcl_bind_insert_v6(conn_t *connp) 1247 { 1248 connf_t *connfp; 1249 int ret = 0; 1250 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1251 uint16_t lport = connp->conn_lport; 1252 uint8_t protocol = connp->conn_proto; 1253 1254 if (IPCL_IS_IPTUN(connp)) { 1255 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1256 } 1257 1258 switch (protocol) { 1259 default: 1260 if (is_system_labeled() && 1261 check_exempt_conflict_v6(connp, ipst)) 1262 return (EADDRINUSE); 1263 /* FALLTHROUGH */ 1264 case IPPROTO_UDP: 1265 if (protocol == IPPROTO_UDP) { 1266 connfp = &ipst->ips_ipcl_udp_fanout[ 1267 IPCL_UDP_HASH(lport, ipst)]; 1268 } else { 1269 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1270 } 1271 1272 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1273 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1274 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1275 IPCL_HASH_INSERT_BOUND(connfp, connp); 1276 } else { 1277 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1278 } 1279 break; 1280 1281 case IPPROTO_TCP: 1282 /* Insert it in the Bind Hash */ 1283 ASSERT(connp->conn_zoneid != ALL_ZONES); 1284 connfp = &ipst->ips_ipcl_bind_fanout[ 1285 IPCL_BIND_HASH(lport, ipst)]; 1286 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1287 IPCL_HASH_INSERT_BOUND(connfp, connp); 1288 } else { 1289 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1290 } 1291 if (cl_inet_listen != NULL) { 1292 sa_family_t addr_family; 1293 uint8_t *laddrp; 1294 1295 if (connp->conn_ipversion == 
IPV6_VERSION) { 1296 addr_family = AF_INET6; 1297 laddrp = 1298 (uint8_t *)&connp->conn_bound_addr_v6; 1299 } else { 1300 addr_family = AF_INET; 1301 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1302 } 1303 connp->conn_flags |= IPCL_CL_LISTENER; 1304 (*cl_inet_listen)( 1305 connp->conn_netstack->netstack_stackid, 1306 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1307 } 1308 break; 1309 1310 case IPPROTO_SCTP: 1311 ret = ipcl_sctp_hash_insert(connp, lport); 1312 break; 1313 } 1314 1315 return (ret); 1316 } 1317 1318 /* 1319 * ipcl_conn_hash insertion routines. 1320 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1321 */ 1322 1323 int 1324 ipcl_conn_insert(conn_t *connp) 1325 { 1326 if (connp->conn_ipversion == IPV6_VERSION) 1327 return (ipcl_conn_insert_v6(connp)); 1328 else 1329 return (ipcl_conn_insert_v4(connp)); 1330 } 1331 1332 int 1333 ipcl_conn_insert_v4(conn_t *connp) 1334 { 1335 connf_t *connfp; 1336 conn_t *tconnp; 1337 int ret = 0; 1338 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1339 uint16_t lport = connp->conn_lport; 1340 uint8_t protocol = connp->conn_proto; 1341 1342 if (IPCL_IS_IPTUN(connp)) 1343 return (ipcl_iptun_hash_insert(connp, ipst)); 1344 1345 switch (protocol) { 1346 case IPPROTO_TCP: 1347 /* 1348 * For TCP, we check whether the connection tuple already 1349 * exists before allowing the connection to proceed. We 1350 * also allow indexing on the zoneid. This is to allow 1351 * multiple shared stack zones to have the same tcp 1352 * connection tuple. In practice this only happens for 1353 * INADDR_LOOPBACK as it's the only local address which 1354 * doesn't have to be unique. 
1355 */ 1356 connfp = &ipst->ips_ipcl_conn_fanout[ 1357 IPCL_CONN_HASH(connp->conn_faddr_v4, 1358 connp->conn_ports, ipst)]; 1359 mutex_enter(&connfp->connf_lock); 1360 for (tconnp = connfp->connf_head; tconnp != NULL; 1361 tconnp = tconnp->conn_next) { 1362 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1363 connp->conn_faddr_v4, connp->conn_laddr_v4, 1364 connp->conn_ports) && 1365 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1366 /* Already have a conn. bail out */ 1367 mutex_exit(&connfp->connf_lock); 1368 return (EADDRINUSE); 1369 } 1370 } 1371 if (connp->conn_fanout != NULL) { 1372 /* 1373 * Probably a XTI/TLI application trying to do a 1374 * rebind. Let it happen. 1375 */ 1376 mutex_exit(&connfp->connf_lock); 1377 IPCL_HASH_REMOVE(connp); 1378 mutex_enter(&connfp->connf_lock); 1379 } 1380 1381 ASSERT(connp->conn_recv != NULL); 1382 ASSERT(connp->conn_recvicmp != NULL); 1383 1384 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1385 mutex_exit(&connfp->connf_lock); 1386 break; 1387 1388 case IPPROTO_SCTP: 1389 /* 1390 * The raw socket may have already been bound, remove it 1391 * from the hash first. 1392 */ 1393 IPCL_HASH_REMOVE(connp); 1394 ret = ipcl_sctp_hash_insert(connp, lport); 1395 break; 1396 1397 default: 1398 /* 1399 * Check for conflicts among MAC exempt bindings. For 1400 * transports with port numbers, this is done by the upper 1401 * level per-transport binding logic. For all others, it's 1402 * done here. 
1403 */ 1404 if (is_system_labeled() && 1405 check_exempt_conflict_v4(connp, ipst)) 1406 return (EADDRINUSE); 1407 /* FALLTHROUGH */ 1408 1409 case IPPROTO_UDP: 1410 if (protocol == IPPROTO_UDP) { 1411 connfp = &ipst->ips_ipcl_udp_fanout[ 1412 IPCL_UDP_HASH(lport, ipst)]; 1413 } else { 1414 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1415 } 1416 1417 if (connp->conn_faddr_v4 != INADDR_ANY) { 1418 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1419 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1420 IPCL_HASH_INSERT_BOUND(connfp, connp); 1421 } else { 1422 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1423 } 1424 break; 1425 } 1426 1427 return (ret); 1428 } 1429 1430 int 1431 ipcl_conn_insert_v6(conn_t *connp) 1432 { 1433 connf_t *connfp; 1434 conn_t *tconnp; 1435 int ret = 0; 1436 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1437 uint16_t lport = connp->conn_lport; 1438 uint8_t protocol = connp->conn_proto; 1439 uint_t ifindex = connp->conn_bound_if; 1440 1441 if (IPCL_IS_IPTUN(connp)) 1442 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1443 1444 switch (protocol) { 1445 case IPPROTO_TCP: 1446 1447 /* 1448 * For tcp, we check whether the connection tuple already 1449 * exists before allowing the connection to proceed. We 1450 * also allow indexing on the zoneid. This is to allow 1451 * multiple shared stack zones to have the same tcp 1452 * connection tuple. In practice this only happens for 1453 * ipv6_loopback as it's the only local address which 1454 * doesn't have to be unique. 1455 */ 1456 connfp = &ipst->ips_ipcl_conn_fanout[ 1457 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1458 ipst)]; 1459 mutex_enter(&connfp->connf_lock); 1460 for (tconnp = connfp->connf_head; tconnp != NULL; 1461 tconnp = tconnp->conn_next) { 1462 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1463 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1464 connp->conn_faddr_v6, connp->conn_laddr_v6, 1465 connp->conn_ports) && 1466 (tconnp->conn_bound_if == 0 || 1467 tconnp->conn_bound_if == ifindex) && 1468 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1469 /* Already have a conn. bail out */ 1470 mutex_exit(&connfp->connf_lock); 1471 return (EADDRINUSE); 1472 } 1473 } 1474 if (connp->conn_fanout != NULL) { 1475 /* 1476 * Probably a XTI/TLI application trying to do a 1477 * rebind. Let it happen. 1478 */ 1479 mutex_exit(&connfp->connf_lock); 1480 IPCL_HASH_REMOVE(connp); 1481 mutex_enter(&connfp->connf_lock); 1482 } 1483 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1484 mutex_exit(&connfp->connf_lock); 1485 break; 1486 1487 case IPPROTO_SCTP: 1488 IPCL_HASH_REMOVE(connp); 1489 ret = ipcl_sctp_hash_insert(connp, lport); 1490 break; 1491 1492 default: 1493 if (is_system_labeled() && 1494 check_exempt_conflict_v6(connp, ipst)) 1495 return (EADDRINUSE); 1496 /* FALLTHROUGH */ 1497 case IPPROTO_UDP: 1498 if (protocol == IPPROTO_UDP) { 1499 connfp = &ipst->ips_ipcl_udp_fanout[ 1500 IPCL_UDP_HASH(lport, ipst)]; 1501 } else { 1502 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1503 } 1504 1505 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1506 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1507 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1508 IPCL_HASH_INSERT_BOUND(connfp, connp); 1509 } else { 1510 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1511 } 1512 break; 1513 } 1514 1515 return (ret); 1516 } 1517 1518 /* 1519 * v4 packet classifying function. looks up the fanout table to 1520 * find the conn, the packet belongs to. returns the conn with 1521 * the reference held, null otherwise. 1522 * 1523 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1524 * Lookup" comment block are applied. Labels are also checked as described 1525 * above. 
If the packet is from the inside (looped back), and is from the same 1526 * zone, then label checks are omitted. 1527 */ 1528 conn_t * 1529 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1530 ip_recv_attr_t *ira, ip_stack_t *ipst) 1531 { 1532 ipha_t *ipha; 1533 connf_t *connfp, *bind_connfp; 1534 uint16_t lport; 1535 uint16_t fport; 1536 uint32_t ports; 1537 conn_t *connp; 1538 uint16_t *up; 1539 zoneid_t zoneid = ira->ira_zoneid; 1540 1541 ipha = (ipha_t *)mp->b_rptr; 1542 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1543 1544 switch (protocol) { 1545 case IPPROTO_TCP: 1546 ports = *(uint32_t *)up; 1547 connfp = 1548 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1549 ports, ipst)]; 1550 mutex_enter(&connfp->connf_lock); 1551 for (connp = connfp->connf_head; connp != NULL; 1552 connp = connp->conn_next) { 1553 if (IPCL_CONN_MATCH(connp, protocol, 1554 ipha->ipha_src, ipha->ipha_dst, ports) && 1555 (connp->conn_zoneid == zoneid || 1556 connp->conn_allzones || 1557 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1558 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1559 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1560 break; 1561 } 1562 1563 if (connp != NULL) { 1564 /* 1565 * We have a fully-bound TCP connection. 1566 * 1567 * For labeled systems, there's no need to check the 1568 * label here. It's known to be good as we checked 1569 * before allowing the connection to become bound. 
1570 */ 1571 CONN_INC_REF(connp); 1572 mutex_exit(&connfp->connf_lock); 1573 return (connp); 1574 } 1575 1576 mutex_exit(&connfp->connf_lock); 1577 lport = up[1]; 1578 bind_connfp = 1579 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1580 mutex_enter(&bind_connfp->connf_lock); 1581 for (connp = bind_connfp->connf_head; connp != NULL; 1582 connp = connp->conn_next) { 1583 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1584 lport) && 1585 (connp->conn_zoneid == zoneid || 1586 connp->conn_allzones || 1587 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1588 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1589 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1590 break; 1591 } 1592 1593 /* 1594 * If the matching connection is SLP on a private address, then 1595 * the label on the packet must match the local zone's label. 1596 * Otherwise, it must be in the label range defined by tnrh. 1597 * This is ensured by tsol_receive_local. 1598 * 1599 * Note that we don't check tsol_receive_local for 1600 * the connected case. 
1601 */ 1602 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1603 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1604 ira, connp)) { 1605 DTRACE_PROBE3(tx__ip__log__info__classify__tcp, 1606 char *, "connp(1) could not receive mp(2)", 1607 conn_t *, connp, mblk_t *, mp); 1608 connp = NULL; 1609 } 1610 1611 if (connp != NULL) { 1612 /* Have a listener at least */ 1613 CONN_INC_REF(connp); 1614 mutex_exit(&bind_connfp->connf_lock); 1615 return (connp); 1616 } 1617 1618 mutex_exit(&bind_connfp->connf_lock); 1619 break; 1620 1621 case IPPROTO_UDP: 1622 lport = up[1]; 1623 fport = up[0]; 1624 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1625 mutex_enter(&connfp->connf_lock); 1626 for (connp = connfp->connf_head; connp != NULL; 1627 connp = connp->conn_next) { 1628 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1629 fport, ipha->ipha_src) && 1630 (connp->conn_zoneid == zoneid || 1631 connp->conn_allzones || 1632 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1633 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) 1634 break; 1635 } 1636 1637 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1638 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1639 ira, connp)) { 1640 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1641 char *, "connp(1) could not receive mp(2)", 1642 conn_t *, connp, mblk_t *, mp); 1643 connp = NULL; 1644 } 1645 1646 if (connp != NULL) { 1647 CONN_INC_REF(connp); 1648 mutex_exit(&connfp->connf_lock); 1649 return (connp); 1650 } 1651 1652 /* 1653 * We shouldn't come here for multicast/broadcast packets 1654 */ 1655 mutex_exit(&connfp->connf_lock); 1656 1657 break; 1658 1659 case IPPROTO_ENCAP: 1660 case IPPROTO_IPV6: 1661 return (ipcl_iptun_classify_v4(&ipha->ipha_src, 1662 &ipha->ipha_dst, ipst)); 1663 } 1664 1665 return (NULL); 1666 } 1667 1668 conn_t * 1669 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1670 ip_recv_attr_t *ira, ip_stack_t *ipst) 1671 { 1672 ip6_t *ip6h; 1673 
connf_t *connfp, *bind_connfp; 1674 uint16_t lport; 1675 uint16_t fport; 1676 tcpha_t *tcpha; 1677 uint32_t ports; 1678 conn_t *connp; 1679 uint16_t *up; 1680 zoneid_t zoneid = ira->ira_zoneid; 1681 1682 ip6h = (ip6_t *)mp->b_rptr; 1683 1684 switch (protocol) { 1685 case IPPROTO_TCP: 1686 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; 1687 up = &tcpha->tha_lport; 1688 ports = *(uint32_t *)up; 1689 1690 connfp = 1691 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1692 ports, ipst)]; 1693 mutex_enter(&connfp->connf_lock); 1694 for (connp = connfp->connf_head; connp != NULL; 1695 connp = connp->conn_next) { 1696 if (IPCL_CONN_MATCH_V6(connp, protocol, 1697 ip6h->ip6_src, ip6h->ip6_dst, ports) && 1698 (connp->conn_zoneid == zoneid || 1699 connp->conn_allzones || 1700 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1701 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1702 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1703 break; 1704 } 1705 1706 if (connp != NULL) { 1707 /* 1708 * We have a fully-bound TCP connection. 1709 * 1710 * For labeled systems, there's no need to check the 1711 * label here. It's known to be good as we checked 1712 * before allowing the connection to become bound. 
1713 */ 1714 CONN_INC_REF(connp); 1715 mutex_exit(&connfp->connf_lock); 1716 return (connp); 1717 } 1718 1719 mutex_exit(&connfp->connf_lock); 1720 1721 lport = up[1]; 1722 bind_connfp = 1723 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1724 mutex_enter(&bind_connfp->connf_lock); 1725 for (connp = bind_connfp->connf_head; connp != NULL; 1726 connp = connp->conn_next) { 1727 if (IPCL_BIND_MATCH_V6(connp, protocol, 1728 ip6h->ip6_dst, lport) && 1729 (connp->conn_zoneid == zoneid || 1730 connp->conn_allzones || 1731 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1732 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1733 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1734 break; 1735 } 1736 1737 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1738 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1739 ira, connp)) { 1740 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1741 char *, "connp(1) could not receive mp(2)", 1742 conn_t *, connp, mblk_t *, mp); 1743 connp = NULL; 1744 } 1745 1746 if (connp != NULL) { 1747 /* Have a listner at least */ 1748 CONN_INC_REF(connp); 1749 mutex_exit(&bind_connfp->connf_lock); 1750 return (connp); 1751 } 1752 1753 mutex_exit(&bind_connfp->connf_lock); 1754 break; 1755 1756 case IPPROTO_UDP: 1757 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1758 lport = up[1]; 1759 fport = up[0]; 1760 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1761 mutex_enter(&connfp->connf_lock); 1762 for (connp = connfp->connf_head; connp != NULL; 1763 connp = connp->conn_next) { 1764 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1765 fport, ip6h->ip6_src) && 1766 (connp->conn_zoneid == zoneid || 1767 connp->conn_allzones || 1768 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1769 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1770 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1771 break; 1772 } 1773 1774 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1775 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 
1776 ira, connp)) { 1777 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1778 char *, "connp(1) could not receive mp(2)", 1779 conn_t *, connp, mblk_t *, mp); 1780 connp = NULL; 1781 } 1782 1783 if (connp != NULL) { 1784 CONN_INC_REF(connp); 1785 mutex_exit(&connfp->connf_lock); 1786 return (connp); 1787 } 1788 1789 /* 1790 * We shouldn't come here for multicast/broadcast packets 1791 */ 1792 mutex_exit(&connfp->connf_lock); 1793 break; 1794 case IPPROTO_ENCAP: 1795 case IPPROTO_IPV6: 1796 return (ipcl_iptun_classify_v6(&ip6h->ip6_src, 1797 &ip6h->ip6_dst, ipst)); 1798 } 1799 1800 return (NULL); 1801 } 1802 1803 /* 1804 * wrapper around ipcl_classify_(v4,v6) routines. 1805 */ 1806 conn_t * 1807 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 1808 { 1809 if (ira->ira_flags & IRAF_IS_IPV4) { 1810 return (ipcl_classify_v4(mp, ira->ira_protocol, 1811 ira->ira_ip_hdr_length, ira, ipst)); 1812 } else { 1813 return (ipcl_classify_v6(mp, ira->ira_protocol, 1814 ira->ira_ip_hdr_length, ira, ipst)); 1815 } 1816 } 1817 1818 /* 1819 * Only used to classify SCTP RAW sockets 1820 */ 1821 conn_t * 1822 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, 1823 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) 1824 { 1825 connf_t *connfp; 1826 conn_t *connp; 1827 in_port_t lport; 1828 int ipversion; 1829 const void *dst; 1830 zoneid_t zoneid = ira->ira_zoneid; 1831 1832 lport = ((uint16_t *)&ports)[1]; 1833 if (ira->ira_flags & IRAF_IS_IPV4) { 1834 dst = (const void *)&ipha->ipha_dst; 1835 ipversion = IPV4_VERSION; 1836 } else { 1837 dst = (const void *)&ip6h->ip6_dst; 1838 ipversion = IPV6_VERSION; 1839 } 1840 1841 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1842 mutex_enter(&connfp->connf_lock); 1843 for (connp = connfp->connf_head; connp != NULL; 1844 connp = connp->conn_next) { 1845 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1846 if (ipversion != connp->conn_ipversion) 1847 continue; 1848 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1849 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1850 if (ipversion == IPV4_VERSION) { 1851 if (!IPCL_CONN_MATCH(connp, protocol, 1852 ipha->ipha_src, ipha->ipha_dst, ports)) 1853 continue; 1854 } else { 1855 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1856 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1857 continue; 1858 } 1859 } else { 1860 if (ipversion == IPV4_VERSION) { 1861 if (!IPCL_BIND_MATCH(connp, protocol, 1862 ipha->ipha_dst, lport)) 1863 continue; 1864 } else { 1865 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1866 ip6h->ip6_dst, lport)) 1867 continue; 1868 } 1869 } 1870 1871 if (connp->conn_zoneid == zoneid || 1872 connp->conn_allzones || 1873 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1874 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1875 (ira->ira_flags & IRAF_TX_SHARED_ADDR))) 1876 break; 1877 } 1878 1879 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1880 !tsol_receive_local(mp, dst, ipversion, ira, connp)) { 1881 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1882 char *, "connp(1) could not receive mp(2)", 1883 conn_t *, connp, mblk_t *, mp); 1884 connp = NULL; 1885 } 1886 1887 if (connp != NULL) 1888 goto found; 1889 mutex_exit(&connfp->connf_lock); 1890 1891 /* Try to look for a wildcard SCTP RAW socket match. */ 1892 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1893 mutex_enter(&connfp->connf_lock); 1894 for (connp = connfp->connf_head; connp != NULL; 1895 connp = connp->conn_next) { 1896 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1897 if (ipversion != connp->conn_ipversion) 1898 continue; 1899 if (!IPCL_ZONE_MATCH(connp, zoneid)) 1900 continue; 1901 1902 if (ipversion == IPV4_VERSION) { 1903 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) 1904 break; 1905 } else { 1906 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { 1907 break; 1908 } 1909 } 1910 } 1911 1912 if (connp != NULL) 1913 goto found; 1914 1915 mutex_exit(&connfp->connf_lock); 1916 return (NULL); 1917 1918 found: 1919 ASSERT(connp != NULL); 1920 CONN_INC_REF(connp); 1921 mutex_exit(&connfp->connf_lock); 1922 return (connp); 1923 } 1924 1925 /* ARGSUSED */ 1926 static int 1927 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 1928 { 1929 itc_t *itc = (itc_t *)buf; 1930 conn_t *connp = &itc->itc_conn; 1931 tcp_t *tcp = (tcp_t *)&itc[1]; 1932 1933 bzero(connp, sizeof (conn_t)); 1934 bzero(tcp, sizeof (tcp_t)); 1935 1936 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1937 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1938 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 1939 tcp->tcp_timercache = tcp_timermp_alloc(kmflags); 1940 if (tcp->tcp_timercache == NULL) 1941 return (ENOMEM); 1942 connp->conn_tcp = tcp; 1943 connp->conn_flags = IPCL_TCPCONN; 1944 connp->conn_proto = IPPROTO_TCP; 1945 tcp->tcp_connp = connp; 1946 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1947 1948 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1949 if (connp->conn_ixa == NULL) { 1950 tcp_timermp_free(tcp); 1951 return (ENOMEM); 1952 } 1953 connp->conn_ixa->ixa_refcnt = 1; 1954 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1955 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1956 return (0); 1957 } 1958 1959 /* ARGSUSED */ 1960 static void 1961 tcp_conn_destructor(void *buf, void *cdrarg) 1962 { 1963 itc_t *itc = (itc_t *)buf; 1964 conn_t *connp = &itc->itc_conn; 1965 tcp_t *tcp = (tcp_t *)&itc[1]; 1966 1967 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1968 
ASSERT(tcp->tcp_connp == connp); 1969 ASSERT(connp->conn_tcp == tcp); 1970 tcp_timermp_free(tcp); 1971 mutex_destroy(&connp->conn_lock); 1972 cv_destroy(&connp->conn_cv); 1973 cv_destroy(&connp->conn_sq_cv); 1974 rw_destroy(&connp->conn_ilg_lock); 1975 1976 /* Can be NULL if constructor failed */ 1977 if (connp->conn_ixa != NULL) { 1978 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 1979 ASSERT(connp->conn_ixa->ixa_ire == NULL); 1980 ASSERT(connp->conn_ixa->ixa_nce == NULL); 1981 ixa_refrele(connp->conn_ixa); 1982 } 1983 } 1984 1985 /* ARGSUSED */ 1986 static int 1987 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 1988 { 1989 itc_t *itc = (itc_t *)buf; 1990 conn_t *connp = &itc->itc_conn; 1991 1992 bzero(connp, sizeof (conn_t)); 1993 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1994 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1995 connp->conn_flags = IPCL_IPCCONN; 1996 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1997 1998 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1999 if (connp->conn_ixa == NULL) 2000 return (ENOMEM); 2001 connp->conn_ixa->ixa_refcnt = 1; 2002 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2003 return (0); 2004 } 2005 2006 /* ARGSUSED */ 2007 static void 2008 ip_conn_destructor(void *buf, void *cdrarg) 2009 { 2010 itc_t *itc = (itc_t *)buf; 2011 conn_t *connp = &itc->itc_conn; 2012 2013 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2014 ASSERT(connp->conn_priv == NULL); 2015 mutex_destroy(&connp->conn_lock); 2016 cv_destroy(&connp->conn_cv); 2017 rw_destroy(&connp->conn_ilg_lock); 2018 2019 /* Can be NULL if constructor failed */ 2020 if (connp->conn_ixa != NULL) { 2021 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2022 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2023 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2024 ixa_refrele(connp->conn_ixa); 2025 } 2026 } 2027 2028 /* ARGSUSED */ 2029 static int 2030 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2031 { 2032 itc_t *itc = 
(itc_t *)buf; 2033 conn_t *connp = &itc->itc_conn; 2034 udp_t *udp = (udp_t *)&itc[1]; 2035 2036 bzero(connp, sizeof (conn_t)); 2037 bzero(udp, sizeof (udp_t)); 2038 2039 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2040 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2041 connp->conn_udp = udp; 2042 connp->conn_flags = IPCL_UDPCONN; 2043 connp->conn_proto = IPPROTO_UDP; 2044 udp->udp_connp = connp; 2045 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2046 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2047 if (connp->conn_ixa == NULL) 2048 return (ENOMEM); 2049 connp->conn_ixa->ixa_refcnt = 1; 2050 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2051 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2052 return (0); 2053 } 2054 2055 /* ARGSUSED */ 2056 static void 2057 udp_conn_destructor(void *buf, void *cdrarg) 2058 { 2059 itc_t *itc = (itc_t *)buf; 2060 conn_t *connp = &itc->itc_conn; 2061 udp_t *udp = (udp_t *)&itc[1]; 2062 2063 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2064 ASSERT(udp->udp_connp == connp); 2065 ASSERT(connp->conn_udp == udp); 2066 mutex_destroy(&connp->conn_lock); 2067 cv_destroy(&connp->conn_cv); 2068 rw_destroy(&connp->conn_ilg_lock); 2069 2070 /* Can be NULL if constructor failed */ 2071 if (connp->conn_ixa != NULL) { 2072 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2073 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2074 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2075 ixa_refrele(connp->conn_ixa); 2076 } 2077 } 2078 2079 /* ARGSUSED */ 2080 static int 2081 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2082 { 2083 itc_t *itc = (itc_t *)buf; 2084 conn_t *connp = &itc->itc_conn; 2085 icmp_t *icmp = (icmp_t *)&itc[1]; 2086 2087 bzero(connp, sizeof (conn_t)); 2088 bzero(icmp, sizeof (icmp_t)); 2089 2090 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2091 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2092 connp->conn_icmp = icmp; 2093 connp->conn_flags = IPCL_RAWIPCONN; 
2094 connp->conn_proto = IPPROTO_ICMP; 2095 icmp->icmp_connp = connp; 2096 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2097 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2098 if (connp->conn_ixa == NULL) 2099 return (ENOMEM); 2100 connp->conn_ixa->ixa_refcnt = 1; 2101 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2102 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2103 return (0); 2104 } 2105 2106 /* ARGSUSED */ 2107 static void 2108 rawip_conn_destructor(void *buf, void *cdrarg) 2109 { 2110 itc_t *itc = (itc_t *)buf; 2111 conn_t *connp = &itc->itc_conn; 2112 icmp_t *icmp = (icmp_t *)&itc[1]; 2113 2114 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2115 ASSERT(icmp->icmp_connp == connp); 2116 ASSERT(connp->conn_icmp == icmp); 2117 mutex_destroy(&connp->conn_lock); 2118 cv_destroy(&connp->conn_cv); 2119 rw_destroy(&connp->conn_ilg_lock); 2120 2121 /* Can be NULL if constructor failed */ 2122 if (connp->conn_ixa != NULL) { 2123 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2124 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2125 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2126 ixa_refrele(connp->conn_ixa); 2127 } 2128 } 2129 2130 /* ARGSUSED */ 2131 static int 2132 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2133 { 2134 itc_t *itc = (itc_t *)buf; 2135 conn_t *connp = &itc->itc_conn; 2136 rts_t *rts = (rts_t *)&itc[1]; 2137 2138 bzero(connp, sizeof (conn_t)); 2139 bzero(rts, sizeof (rts_t)); 2140 2141 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2142 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2143 connp->conn_rts = rts; 2144 connp->conn_flags = IPCL_RTSCONN; 2145 rts->rts_connp = connp; 2146 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2147 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2148 if (connp->conn_ixa == NULL) 2149 return (ENOMEM); 2150 connp->conn_ixa->ixa_refcnt = 1; 2151 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2152 return (0); 2153 } 2154 
2155 /* ARGSUSED */ 2156 static void 2157 rts_conn_destructor(void *buf, void *cdrarg) 2158 { 2159 itc_t *itc = (itc_t *)buf; 2160 conn_t *connp = &itc->itc_conn; 2161 rts_t *rts = (rts_t *)&itc[1]; 2162 2163 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2164 ASSERT(rts->rts_connp == connp); 2165 ASSERT(connp->conn_rts == rts); 2166 mutex_destroy(&connp->conn_lock); 2167 cv_destroy(&connp->conn_cv); 2168 rw_destroy(&connp->conn_ilg_lock); 2169 2170 /* Can be NULL if constructor failed */ 2171 if (connp->conn_ixa != NULL) { 2172 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2173 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2174 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2175 ixa_refrele(connp->conn_ixa); 2176 } 2177 } 2178 2179 /* 2180 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2181 * in the conn_t. 2182 * 2183 * Below we list all the pointers in the conn_t as a documentation aid. 2184 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2185 * If you add any pointers to the conn_t please add an ASSERT here 2186 * and #ifdef it out if it can't be actually asserted to be NULL. 2187 * In any case, we bzero most of the conn_t at the end of the function. 
2188 */ 2189 void 2190 ipcl_conn_cleanup(conn_t *connp) 2191 { 2192 ip_xmit_attr_t *ixa; 2193 2194 ASSERT(connp->conn_latch == NULL); 2195 ASSERT(connp->conn_latch_in_policy == NULL); 2196 ASSERT(connp->conn_latch_in_action == NULL); 2197 #ifdef notdef 2198 ASSERT(connp->conn_rq == NULL); 2199 ASSERT(connp->conn_wq == NULL); 2200 #endif 2201 ASSERT(connp->conn_cred == NULL); 2202 ASSERT(connp->conn_g_fanout == NULL); 2203 ASSERT(connp->conn_g_next == NULL); 2204 ASSERT(connp->conn_g_prev == NULL); 2205 ASSERT(connp->conn_policy == NULL); 2206 ASSERT(connp->conn_fanout == NULL); 2207 ASSERT(connp->conn_next == NULL); 2208 ASSERT(connp->conn_prev == NULL); 2209 ASSERT(connp->conn_oper_pending_ill == NULL); 2210 ASSERT(connp->conn_ilg == NULL); 2211 ASSERT(connp->conn_drain_next == NULL); 2212 ASSERT(connp->conn_drain_prev == NULL); 2213 #ifdef notdef 2214 /* conn_idl is not cleared when removed from idl list */ 2215 ASSERT(connp->conn_idl == NULL); 2216 #endif 2217 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2218 #ifdef notdef 2219 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ 2220 ASSERT(connp->conn_netstack == NULL); 2221 #endif 2222 2223 ASSERT(connp->conn_helper_info == NULL); 2224 ASSERT(connp->conn_ixa != NULL); 2225 ixa = connp->conn_ixa; 2226 ASSERT(ixa->ixa_refcnt == 1); 2227 /* Need to preserve ixa_protocol */ 2228 ixa_cleanup(ixa); 2229 ixa->ixa_flags = 0; 2230 2231 /* Clear out the conn_t fields that are not preserved */ 2232 bzero(&connp->conn_start_clr, 2233 sizeof (conn_t) - 2234 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2235 } 2236 2237 /* 2238 * All conns are inserted in a global multi-list for the benefit of 2239 * walkers. The walk is guaranteed to walk all open conns at the time 2240 * of the start of the walk exactly once. This property is needed to 2241 * achieve some cleanups during unplumb of interfaces. This is achieved 2242 * as follows. 
2243 * 2244 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2245 * call the insert and delete functions below at creation and deletion 2246 * time respectively. The conn never moves or changes its position in this 2247 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2248 * won't increase due to walkers, once the conn deletion has started. Note 2249 * that we can't remove the conn from the global list and then wait for 2250 * the refcnt to drop to zero, since walkers would then see a truncated 2251 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2252 * conns until ip_open is ready to make them globally visible. 2253 * The global round robin multi-list locks are held only to get the 2254 * next member/insertion/deletion and contention should be negligible 2255 * if the multi-list is much greater than the number of cpus. 2256 */ 2257 void 2258 ipcl_globalhash_insert(conn_t *connp) 2259 { 2260 int index; 2261 struct connf_s *connfp; 2262 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2263 2264 /* 2265 * No need for atomic here. Approximate even distribution 2266 * in the global lists is sufficient. 2267 */ 2268 ipst->ips_conn_g_index++; 2269 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2270 2271 connp->conn_g_prev = NULL; 2272 /* 2273 * Mark as INCIPIENT, so that walkers will ignore this 2274 * for now, till ip_open is ready to make it visible globally. 
2275 */ 2276 connp->conn_state_flags |= CONN_INCIPIENT; 2277 2278 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2279 /* Insert at the head of the list */ 2280 mutex_enter(&connfp->connf_lock); 2281 connp->conn_g_next = connfp->connf_head; 2282 if (connp->conn_g_next != NULL) 2283 connp->conn_g_next->conn_g_prev = connp; 2284 connfp->connf_head = connp; 2285 2286 /* The fanout bucket this conn points to */ 2287 connp->conn_g_fanout = connfp; 2288 2289 mutex_exit(&connfp->connf_lock); 2290 } 2291 2292 void 2293 ipcl_globalhash_remove(conn_t *connp) 2294 { 2295 struct connf_s *connfp; 2296 2297 /* 2298 * We were never inserted in the global multi list. 2299 * IPCL_NONE variety is never inserted in the global multilist 2300 * since it is presumed to not need any cleanup and is transient. 2301 */ 2302 if (connp->conn_g_fanout == NULL) 2303 return; 2304 2305 connfp = connp->conn_g_fanout; 2306 mutex_enter(&connfp->connf_lock); 2307 if (connp->conn_g_prev != NULL) 2308 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2309 else 2310 connfp->connf_head = connp->conn_g_next; 2311 if (connp->conn_g_next != NULL) 2312 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2313 mutex_exit(&connfp->connf_lock); 2314 2315 /* Better to stumble on a null pointer than to corrupt memory */ 2316 connp->conn_g_next = NULL; 2317 connp->conn_g_prev = NULL; 2318 connp->conn_g_fanout = NULL; 2319 } 2320 2321 /* 2322 * Walk the list of all conn_t's in the system, calling the function provided 2323 * With the specified argument for each. 2324 * Applies to both IPv4 and IPv6. 2325 * 2326 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2327 * conn_oper_pending_ill). To guard against stale pointers 2328 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2329 * unplumbed or removed. New conn_t's that are created while we are walking 2330 * may be missed by this walk, because they are not necessarily inserted 2331 * at the tail of the list. 
They are new conn_t's and thus don't have any 2332 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2333 * is created to the struct that is going away. 2334 */ 2335 void 2336 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2337 { 2338 int i; 2339 conn_t *connp; 2340 conn_t *prev_connp; 2341 2342 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2343 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2344 prev_connp = NULL; 2345 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2346 while (connp != NULL) { 2347 mutex_enter(&connp->conn_lock); 2348 if (connp->conn_state_flags & 2349 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2350 mutex_exit(&connp->conn_lock); 2351 connp = connp->conn_g_next; 2352 continue; 2353 } 2354 CONN_INC_REF_LOCKED(connp); 2355 mutex_exit(&connp->conn_lock); 2356 mutex_exit( 2357 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2358 (*func)(connp, arg); 2359 if (prev_connp != NULL) 2360 CONN_DEC_REF(prev_connp); 2361 mutex_enter( 2362 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2363 prev_connp = connp; 2364 connp = connp->conn_g_next; 2365 } 2366 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2367 if (prev_connp != NULL) 2368 CONN_DEC_REF(prev_connp); 2369 } 2370 } 2371 2372 /* 2373 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2374 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2375 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2376 * (peer tcp in ESTABLISHED state). 2377 */ 2378 conn_t * 2379 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, 2380 ip_stack_t *ipst) 2381 { 2382 uint32_t ports; 2383 uint16_t *pports = (uint16_t *)&ports; 2384 connf_t *connfp; 2385 conn_t *tconnp; 2386 boolean_t zone_chk; 2387 2388 /* 2389 * If either the source of destination address is loopback, then 2390 * both endpoints must be in the same Zone. 
Otherwise, both of 2391 * the addresses are system-wide unique (tcp is in ESTABLISHED 2392 * state) and the endpoints may reside in different Zones. 2393 */ 2394 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2395 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2396 2397 pports[0] = tcpha->tha_fport; 2398 pports[1] = tcpha->tha_lport; 2399 2400 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2401 ports, ipst)]; 2402 2403 mutex_enter(&connfp->connf_lock); 2404 for (tconnp = connfp->connf_head; tconnp != NULL; 2405 tconnp = tconnp->conn_next) { 2406 2407 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2408 ipha->ipha_dst, ipha->ipha_src, ports) && 2409 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2410 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2411 2412 ASSERT(tconnp != connp); 2413 CONN_INC_REF(tconnp); 2414 mutex_exit(&connfp->connf_lock); 2415 return (tconnp); 2416 } 2417 } 2418 mutex_exit(&connfp->connf_lock); 2419 return (NULL); 2420 } 2421 2422 /* 2423 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2424 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2425 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2426 * (peer tcp in ESTABLISHED state). 2427 */ 2428 conn_t * 2429 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, 2430 ip_stack_t *ipst) 2431 { 2432 uint32_t ports; 2433 uint16_t *pports = (uint16_t *)&ports; 2434 connf_t *connfp; 2435 conn_t *tconnp; 2436 boolean_t zone_chk; 2437 2438 /* 2439 * If either the source of destination address is loopback, then 2440 * both endpoints must be in the same Zone. Otherwise, both of 2441 * the addresses are system-wide unique (tcp is in ESTABLISHED 2442 * state) and the endpoints may reside in different Zones. 
We 2443 * don't do Zone check for link local address(es) because the 2444 * current Zone implementation treats each link local address as 2445 * being unique per system node, i.e. they belong to global Zone. 2446 */ 2447 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2448 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2449 2450 pports[0] = tcpha->tha_fport; 2451 pports[1] = tcpha->tha_lport; 2452 2453 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2454 ports, ipst)]; 2455 2456 mutex_enter(&connfp->connf_lock); 2457 for (tconnp = connfp->connf_head; tconnp != NULL; 2458 tconnp = tconnp->conn_next) { 2459 2460 /* We skip conn_bound_if check here as this is loopback tcp */ 2461 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2462 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2463 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2464 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2465 2466 ASSERT(tconnp != connp); 2467 CONN_INC_REF(tconnp); 2468 mutex_exit(&connfp->connf_lock); 2469 return (tconnp); 2470 } 2471 } 2472 mutex_exit(&connfp->connf_lock); 2473 return (NULL); 2474 } 2475 2476 /* 2477 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2478 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2479 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2480 */ 2481 conn_t * 2482 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, 2483 ip_stack_t *ipst) 2484 { 2485 uint32_t ports; 2486 uint16_t *pports; 2487 connf_t *connfp; 2488 conn_t *tconnp; 2489 2490 pports = (uint16_t *)&ports; 2491 pports[0] = tcpha->tha_fport; 2492 pports[1] = tcpha->tha_lport; 2493 2494 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2495 ports, ipst)]; 2496 2497 mutex_enter(&connfp->connf_lock); 2498 for (tconnp = connfp->connf_head; tconnp != NULL; 2499 tconnp = tconnp->conn_next) { 2500 2501 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2502 ipha->ipha_dst, ipha->ipha_src, ports) && 2503 tconnp->conn_tcp->tcp_state >= min_state) { 2504 2505 CONN_INC_REF(tconnp); 2506 mutex_exit(&connfp->connf_lock); 2507 return (tconnp); 2508 } 2509 } 2510 mutex_exit(&connfp->connf_lock); 2511 return (NULL); 2512 } 2513 2514 /* 2515 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2516 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2517 * Only checks for connected entries i.e. no INADDR_ANY checks. 2518 * Match on ifindex in addition to addresses. 
2519 */ 2520 conn_t * 2521 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2522 uint_t ifindex, ip_stack_t *ipst) 2523 { 2524 tcp_t *tcp; 2525 uint32_t ports; 2526 uint16_t *pports; 2527 connf_t *connfp; 2528 conn_t *tconnp; 2529 2530 pports = (uint16_t *)&ports; 2531 pports[0] = tcpha->tha_fport; 2532 pports[1] = tcpha->tha_lport; 2533 2534 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2535 ports, ipst)]; 2536 2537 mutex_enter(&connfp->connf_lock); 2538 for (tconnp = connfp->connf_head; tconnp != NULL; 2539 tconnp = tconnp->conn_next) { 2540 2541 tcp = tconnp->conn_tcp; 2542 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2543 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2544 tcp->tcp_state >= min_state && 2545 (tconnp->conn_bound_if == 0 || 2546 tconnp->conn_bound_if == ifindex)) { 2547 2548 CONN_INC_REF(tconnp); 2549 mutex_exit(&connfp->connf_lock); 2550 return (tconnp); 2551 } 2552 } 2553 mutex_exit(&connfp->connf_lock); 2554 return (NULL); 2555 } 2556 2557 /* 2558 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2559 * a listener when changing state. 2560 */ 2561 conn_t * 2562 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2563 ip_stack_t *ipst) 2564 { 2565 connf_t *bind_connfp; 2566 conn_t *connp; 2567 tcp_t *tcp; 2568 2569 /* 2570 * Avoid false matches for packets sent to an IP destination of 2571 * all zeros. 
2572 */ 2573 if (laddr == 0) 2574 return (NULL); 2575 2576 ASSERT(zoneid != ALL_ZONES); 2577 2578 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2579 mutex_enter(&bind_connfp->connf_lock); 2580 for (connp = bind_connfp->connf_head; connp != NULL; 2581 connp = connp->conn_next) { 2582 tcp = connp->conn_tcp; 2583 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2584 IPCL_ZONE_MATCH(connp, zoneid) && 2585 (tcp->tcp_listener == NULL)) { 2586 CONN_INC_REF(connp); 2587 mutex_exit(&bind_connfp->connf_lock); 2588 return (connp); 2589 } 2590 } 2591 mutex_exit(&bind_connfp->connf_lock); 2592 return (NULL); 2593 } 2594 2595 /* 2596 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2597 * a listener when changing state. 2598 */ 2599 conn_t * 2600 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2601 zoneid_t zoneid, ip_stack_t *ipst) 2602 { 2603 connf_t *bind_connfp; 2604 conn_t *connp = NULL; 2605 tcp_t *tcp; 2606 2607 /* 2608 * Avoid false matches for packets sent to an IP destination of 2609 * all zeros. 2610 */ 2611 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2612 return (NULL); 2613 2614 ASSERT(zoneid != ALL_ZONES); 2615 2616 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2617 mutex_enter(&bind_connfp->connf_lock); 2618 for (connp = bind_connfp->connf_head; connp != NULL; 2619 connp = connp->conn_next) { 2620 tcp = connp->conn_tcp; 2621 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2622 IPCL_ZONE_MATCH(connp, zoneid) && 2623 (connp->conn_bound_if == 0 || 2624 connp->conn_bound_if == ifindex) && 2625 tcp->tcp_listener == NULL) { 2626 CONN_INC_REF(connp); 2627 mutex_exit(&bind_connfp->connf_lock); 2628 return (connp); 2629 } 2630 } 2631 mutex_exit(&bind_connfp->connf_lock); 2632 return (NULL); 2633 } 2634 2635 /* 2636 * ipcl_get_next_conn 2637 * get the next entry in the conn global list 2638 * and put a reference on the next_conn. 
2639 * decrement the reference on the current conn. 2640 * 2641 * This is an iterator based walker function that also provides for 2642 * some selection by the caller. It walks through the conn_hash bucket 2643 * searching for the next valid connp in the list, and selects connections 2644 * that are neither closed nor condemned. It also REFHOLDS the conn 2645 * thus ensuring that the conn exists when the caller uses the conn. 2646 */ 2647 conn_t * 2648 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2649 { 2650 conn_t *next_connp; 2651 2652 if (connfp == NULL) 2653 return (NULL); 2654 2655 mutex_enter(&connfp->connf_lock); 2656 2657 next_connp = (connp == NULL) ? 2658 connfp->connf_head : connp->conn_g_next; 2659 2660 while (next_connp != NULL) { 2661 mutex_enter(&next_connp->conn_lock); 2662 if (!(next_connp->conn_flags & conn_flags) || 2663 (next_connp->conn_state_flags & 2664 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2665 /* 2666 * This conn has been condemned or 2667 * is closing, or the flags don't match 2668 */ 2669 mutex_exit(&next_connp->conn_lock); 2670 next_connp = next_connp->conn_g_next; 2671 continue; 2672 } 2673 CONN_INC_REF_LOCKED(next_connp); 2674 mutex_exit(&next_connp->conn_lock); 2675 break; 2676 } 2677 2678 mutex_exit(&connfp->connf_lock); 2679 2680 if (connp != NULL) 2681 CONN_DEC_REF(connp); 2682 2683 return (next_connp); 2684 } 2685 2686 #ifdef CONN_DEBUG 2687 /* 2688 * Trace of the last NBUF refhold/refrele 2689 */ 2690 int 2691 conn_trace_ref(conn_t *connp) 2692 { 2693 int last; 2694 conn_trace_t *ctb; 2695 2696 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2697 last = connp->conn_trace_last; 2698 last++; 2699 if (last == CONN_TRACE_MAX) 2700 last = 0; 2701 2702 ctb = &connp->conn_trace_buf[last]; 2703 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2704 connp->conn_trace_last = last; 2705 return (1); 2706 } 2707 2708 int 2709 conn_untrace_ref(conn_t *connp) 2710 { 2711 int last; 2712 conn_trace_t *ctb; 2713 2714 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2715 last = connp->conn_trace_last; 2716 last++; 2717 if (last == CONN_TRACE_MAX) 2718 last = 0; 2719 2720 ctb = &connp->conn_trace_buf[last]; 2721 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2722 connp->conn_trace_last = last; 2723 return (1); 2724 } 2725 #endif 2726 2727 mib2_socketInfoEntry_t * 2728 conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie) 2729 { 2730 vnode_t *vn = NULL; 2731 vattr_t attr; 2732 uint64_t flags = 0; 2733 2734 /* 2735 * If the connection is closing, it is not safe to make an upcall or 2736 * access the stream associated with the connection. 2737 * The callers of this function have a reference on connp itself 2738 * so, as long as it is not closing, it's safe to continue. 2739 */ 2740 mutex_enter(&connp->conn_lock); 2741 2742 if ((connp->conn_state_flags & CONN_CLOSING)) { 2743 mutex_exit(&connp->conn_lock); 2744 return (NULL); 2745 } 2746 2747 mutex_exit(&connp->conn_lock); 2748 2749 if (connp->conn_upper_handle != NULL) { 2750 vn = (*connp->conn_upcalls->su_get_vnode) 2751 (connp->conn_upper_handle); 2752 } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) { 2753 vn = STREAM(connp->conn_rq)->sd_pvnode; 2754 if (vn != NULL) 2755 VN_HOLD(vn); 2756 flags |= MIB2_SOCKINFO_STREAM; 2757 } 2758 2759 if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) { 2760 if (vn != NULL) 2761 VN_RELE(vn); 2762 return (NULL); 2763 } 2764 2765 VN_RELE(vn); 2766 2767 bzero(sie, sizeof (*sie)); 2768 2769 sie->sie_flags = flags; 2770 sie->sie_inode = attr.va_nodeid; 2771 sie->sie_dev = attr.va_rdev; 2772 2773 return (sie); 2774 } 2775