1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
A reference is incremented when connection 50 * is placed in a list or table, when incoming packet for the connection arrives 51 * and when connection is processed via squeue (squeue processing may be 52 * asynchronous and the reference protects the connection from being destroyed 53 * before its processing is finished). 54 * 55 * conn_recv is used to pass up packets to the ULP. 56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 57 * a listener, and changes to tcp_input_listener as the listener has picked a 58 * good squeue. For other cases it is set to tcp_input_data. 59 * 60 * conn_recvicmp is used to pass up ICMP errors to the ULP. 61 * 62 * Classifier uses several hash tables: 63 * 64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 65 * ipcl_bind_fanout: contains all connections in BOUND state 66 * ipcl_proto_fanout: IPv4 protocol fanout 67 * ipcl_proto_fanout_v6: IPv6 protocol fanout 68 * ipcl_udp_fanout: contains all UDP connections 69 * ipcl_iptun_fanout: contains all IP tunnel connections 70 * ipcl_globalhash_fanout: contains all connections 71 * 72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 73 * which need to view all existing connections. 74 * 75 * All tables are protected by per-bucket locks. When both per-bucket lock and 76 * connection lock need to be held, the per-bucket lock should be acquired 77 * first, followed by the connection lock. 78 * 79 * All functions doing search in one of these tables increment a reference 80 * counter on the connection found (if any). This reference should be dropped 81 * when the caller has finished processing the connection. 82 * 83 * 84 * INTERFACES: 85 * =========== 86 * 87 * Connection Lookup: 88 * ------------------ 89 * 90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 92 * 93 * Finds connection for an incoming IPv4 or IPv6 packet. 
Returns NULL if 94 * it can't find any associated connection. If the connection is found, its 95 * reference counter is incremented. 96 * 97 * mp: mblock, containing packet header. The full header should fit 98 * into a single mblock. It should also contain at least full IP 99 * and TCP or UDP header. 100 * 101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 102 * 103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 104 * the packet. 105 * 106 * ira->ira_zoneid: The zone in which the returned connection must be; the 107 * zoneid corresponding to the ire_zoneid on the IRE located for 108 * the packet's destination address. 109 * 110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and 111 * IRAF_TX_SHARED_ADDR flags 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packets belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 130 * 131 * - If it's a multilevel port, then only the zone registered to receive 132 * packets on that port matches. 133 * 134 * Also, in a labeled environment, packet labels need to be checked. 
For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 * Lookup routine to find an exact match for {src, dst, local port,
 * remote port} for TCP connections in ipcl_conn_fanout. The address and
 * ports are read from the IP and TCP header respectively.
165 * 166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol, 167 * zoneid, ip_stack); 168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex, 169 * zoneid, ip_stack); 170 * 171 * Lookup routine to find a listener with the tuple {lport, laddr, 172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 173 * parameter interface index is also compared. 174 * 175 * void ipcl_walk(func, arg, ip_stack) 176 * 177 * Apply 'func' to every connection available. The 'func' is called as 178 * (*func)(connp, arg). The walk is non-atomic so connections may be 179 * created and destroyed during the walk. The CONN_CONDEMNED and 180 * CONN_INCIPIENT flags ensure that connections which are newly created 181 * or being destroyed are not selected by the walker. 182 * 183 * Table Updates 184 * ------------- 185 * 186 * int ipcl_conn_insert(connp); 187 * int ipcl_conn_insert_v4(connp); 188 * int ipcl_conn_insert_v6(connp); 189 * 190 * Insert 'connp' in the ipcl_conn_fanout. 191 * Arguments : 192 * connp conn_t to be inserted 193 * 194 * Return value : 195 * 0 if connp was inserted 196 * EADDRINUSE if the connection with the same tuple 197 * already exists. 198 * 199 * int ipcl_bind_insert(connp); 200 * int ipcl_bind_insert_v4(connp); 201 * int ipcl_bind_insert_v6(connp); 202 * 203 * Insert 'connp' in ipcl_bind_fanout. 204 * Arguments : 205 * connp conn_t to be inserted 206 * 207 * 208 * void ipcl_hash_remove(connp); 209 * 210 * Removes the 'connp' from the connection fanout table. 211 * 212 * Connection Creation/Destruction 213 * ------------------------------- 214 * 215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 216 * 217 * Creates a new conn based on the type flag, inserts it into 218 * globalhash table. 219 * 220 * type: This flag determines the type of conn_t which needs to be 221 * created i.e., which kmem_cache it comes from. 
222 * IPCL_TCPCONN indicates a TCP connection 223 * IPCL_SCTPCONN indicates a SCTP connection 224 * IPCL_UDPCONN indicates a UDP conn_t. 225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 226 * IPCL_RTSCONN indicates a RTS conn_t. 227 * IPCL_IPCCONN indicates all other connections. 228 * 229 * void ipcl_conn_destroy(connp) 230 * 231 * Destroys the connection state, removes it from the global 232 * connection hash table and frees its memory. 233 */ 234 235 #include <sys/types.h> 236 #include <sys/stream.h> 237 #include <sys/stropts.h> 238 #include <sys/sysmacros.h> 239 #include <sys/strsubr.h> 240 #include <sys/strsun.h> 241 #define _SUN_TPI_VERSION 2 242 #include <sys/ddi.h> 243 #include <sys/cmn_err.h> 244 #include <sys/debug.h> 245 246 #include <sys/systm.h> 247 #include <sys/param.h> 248 #include <sys/kmem.h> 249 #include <sys/isa_defs.h> 250 #include <inet/common.h> 251 #include <netinet/ip6.h> 252 #include <netinet/icmp6.h> 253 254 #include <inet/ip.h> 255 #include <inet/ip_if.h> 256 #include <inet/ip_ire.h> 257 #include <inet/ip6.h> 258 #include <inet/ip_ndp.h> 259 #include <inet/ip_impl.h> 260 #include <inet/udp_impl.h> 261 #include <inet/sctp_ip.h> 262 #include <inet/sctp/sctp_impl.h> 263 #include <inet/rawip_impl.h> 264 #include <inet/rts_impl.h> 265 #include <inet/iptun/iptun_impl.h> 266 267 #include <sys/cpuvar.h> 268 269 #include <inet/ipclassifier.h> 270 #include <inet/tcp.h> 271 #include <inet/ipsec_impl.h> 272 273 #include <sys/tsol/tnet.h> 274 #include <sys/sockio.h> 275 276 /* Old value for compatibility. Setable in /etc/system */ 277 uint_t tcp_conn_hash_size = 0; 278 279 /* New value. Zero means choose automatically. Setable in /etc/system */ 280 uint_t ipcl_conn_hash_size = 0; 281 uint_t ipcl_conn_hash_memfactor = 8192; 282 uint_t ipcl_conn_hash_maxsize = 82500; 283 284 /* bind/udp fanout table size */ 285 uint_t ipcl_bind_fanout_size = 512; 286 uint_t ipcl_udp_fanout_size = 16384; 287 288 /* Raw socket fanout size. Must be a power of 2. 
 */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
typedef union itc_s {
	conn_t	itc_conn;
	/* pads the union out to a whole number of cache lines */
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;

/* Per-conn-type kmem caches; created once for all stacks in ipcl_g_init() */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* kmem cache constructor/destructor pairs for each conn type */
static	int	ip_conn_constructor(void *, void *, int);
static	void	ip_conn_destructor(void *, void *);

static	int	tcp_conn_constructor(void *, void *, int);
static	void	tcp_conn_destructor(void *, void *);

static	int	udp_conn_constructor(void *, void *, int);
static	void	udp_conn_destructor(void *, void *);

static	int	rawip_conn_constructor(void *, void *, int);
static	void	rawip_conn_destructor(void *, void *);

static	int	rts_conn_constructor(void *, void *, int);
static	void	rts_conn_destructor(void *, void *);

/*
 * Global (for all stack instances) init routine.
 * Creates the per-type conn_t kmem caches.  sctp_conn_cache is created by
 * SCTP itself; only tcp_conn_cache registers a reclaim callback
 * (tcp_conn_reclaim).
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}

/*
 * ipclassifier initialization routine, sets up hash tables.
 * Called once per IP stack instance; allocates and initializes every fanout
 * table described at the top of this file, plus the rts_clients list.
 */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		/* old tunable kept for compatibility */
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* auto-size from available memory, capped at the maximum */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the computed size up to a prime from the P2Ps() table;
	 * starting the scan at index 9 imposes a minimum table size.
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* The protocol fanouts are indexed directly by IP protocol number */
	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Routing-socket clients live on a single, unhashed list */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

/*
 * Global (for all stack instances) destroy routine: tears down the kmem
 * caches created by ipcl_g_init().  sctp_conn_cache is SCTP's to destroy.
 */
void
ipcl_g_destroy(void)
{
	kmem_cache_destroy(ip_conn_cache);
	kmem_cache_destroy(tcp_conn_cache);
	kmem_cache_destroy(udp_conn_cache);
	kmem_cache_destroy(rawip_conn_cache);
	kmem_cache_destroy(rts_conn_cache);
}

/*
 * All user-level and kernel use of the stack must be gone
 * by now.
 * Frees every table allocated by ipcl_init(); each bucket is asserted
 * empty before its lock is destroyed and the table freed.
 */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v4 = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_iptun_fanout,
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
	ipst->ips_ipcl_iptun_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}

/*
 * conn creation routine. initialize the conn, sets the reference
 * and inserts it in the global hash table.
 */
conn_t *
ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
{
	conn_t	*connp;
	struct kmem_cache *conn_cache;

	switch (type) {
	case IPCL_SCTPCONN:
		/*
		 * SCTP conn_ts come from SCTP's own cache and need
		 * sctp-specific setup, so they are handled fully here
		 * rather than falling through to the common path.
		 */
		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
			return (NULL);
		sctp_conn_init(connp);
		netstack_hold(ns);
		connp->conn_netstack = ns;
		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
		connp->conn_ixa->ixa_conn_id = (long)connp;
		ipcl_globalhash_insert(connp);
		return (connp);

	case IPCL_TCPCONN:
		conn_cache = tcp_conn_cache;
		break;

	case IPCL_UDPCONN:
		conn_cache = udp_conn_cache;
		break;

	case IPCL_RAWIPCONN:
		conn_cache = rawip_conn_cache;
		break;

	case IPCL_RTSCONN:
		conn_cache = rts_conn_cache;
		break;

	case IPCL_IPCCONN:
		conn_cache = ip_conn_cache;
		break;

	default:
		conn_cache = NULL;
		connp = NULL;
		ASSERT(0);
	}

	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
		return (NULL);

	/* the caller receives the initial reference */
	connp->conn_ref = 1;
	netstack_hold(ns);
	connp->conn_netstack = ns;
	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
	connp->conn_ixa->ixa_conn_id = (long)connp;
	ipcl_globalhash_insert(connp);
	return (connp);
}

/*
 * Destroys the connection state: releases credentials, cached headers and
 * IPsec state, removes the conn from the global hash, drops the netstack
 * hold and returns the conn_t to its kmem cache.  Caller must already have
 * driven conn_ref to zero.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* the timer mblk survives the bzero below and is reused */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		/* SCTP owns the freeing of its conn_ts */
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* rawip conn_ts are cached with ICMP as the default proto */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}

/*
 * Running in cluster mode - deregister listener information
 */
static void
ipcl_conn_unlisten(conn_t *connp)
{
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
	ASSERT(connp->conn_lport != 0);

	if (cl_inet_unlisten != NULL) {
		sa_family_t	addr_family;
		uint8_t		*laddrp;

		if (connp->conn_ipversion == IPV6_VERSION) {
			addr_family = AF_INET6;
			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
		} else {
			addr_family = AF_INET;
			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
		}
		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
	}
	connp->conn_flags &= ~IPCL_CL_LISTENER;
}

/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/*
 * Remove 'connp' from whichever fanout table it is currently in (no-op if
 * it is in none).  RSVP conns additionally reset the ill input functions,
 * since ill_set_inputfn_all() depends on whether RSVP clients exist.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	uint8_t protocol = connp->conn_proto;

	IPCL_HASH_REMOVE(connp);
	if (protocol == IPPROTO_RSVP)
		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
}

/*
 * The whole purpose of this function is to allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * Drop the hash list's reference directly; CONN_DEC_REF cannot be
	 * used here as both locks are already held (see block comment).
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}

/*
 * Insert at the head of 'connfp'; the conn becomes CONNECTED and the hash
 * list takes a reference.  Caller holds the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

/* As above, but removes the conn from its old table and takes the lock. */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a BOUND conn ahead of any wildcard (_IPCL_V4_MATCH_ANY) entries
 * so that more specific binds are found first on lookup.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp)	{			\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard bind.  A v4-mapped wildcard is placed before any
 * unspecified-address entry of the same zone so it is matched first;
 * otherwise the conn goes to the end of the list.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/* Function wrapper around IPCL_HASH_INSERT_WILDCARD for external callers. */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * Because the classifier is used to classify inbound packets, the destination
 * address is meant to be our local tunnel address (tunnel source), and the
 * source the remote tunnel address (tunnel destination).
 *
 * Note that conn_proto can't be used for fanout since the upper protocol
 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
943 */ 944 conn_t * 945 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 946 { 947 connf_t *connfp; 948 conn_t *connp; 949 950 /* first look for IPv4 tunnel links */ 951 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 952 mutex_enter(&connfp->connf_lock); 953 for (connp = connfp->connf_head; connp != NULL; 954 connp = connp->conn_next) { 955 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 956 break; 957 } 958 if (connp != NULL) 959 goto done; 960 961 mutex_exit(&connfp->connf_lock); 962 963 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 964 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 965 INADDR_ANY)]; 966 mutex_enter(&connfp->connf_lock); 967 for (connp = connfp->connf_head; connp != NULL; 968 connp = connp->conn_next) { 969 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 970 break; 971 } 972 done: 973 if (connp != NULL) 974 CONN_INC_REF(connp); 975 mutex_exit(&connfp->connf_lock); 976 return (connp); 977 } 978 979 conn_t * 980 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 981 { 982 connf_t *connfp; 983 conn_t *connp; 984 985 /* Look for an IPv6 tunnel link */ 986 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 987 mutex_enter(&connfp->connf_lock); 988 for (connp = connfp->connf_head; connp != NULL; 989 connp = connp->conn_next) { 990 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 991 CONN_INC_REF(connp); 992 break; 993 } 994 } 995 mutex_exit(&connfp->connf_lock); 996 return (connp); 997 } 998 999 /* 1000 * This function is used only for inserting SCTP raw socket now. 1001 * This may change later. 1002 * 1003 * Note that only one raw socket can be bound to a port. The param 1004 * lport is in network byte order. 
1005 */ 1006 static int 1007 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1008 { 1009 connf_t *connfp; 1010 conn_t *oconnp; 1011 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1012 1013 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1014 1015 /* Check for existing raw socket already bound to the port. */ 1016 mutex_enter(&connfp->connf_lock); 1017 for (oconnp = connfp->connf_head; oconnp != NULL; 1018 oconnp = oconnp->conn_next) { 1019 if (oconnp->conn_lport == lport && 1020 oconnp->conn_zoneid == connp->conn_zoneid && 1021 oconnp->conn_family == connp->conn_family && 1022 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1023 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || 1024 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || 1025 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || 1026 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, 1027 &connp->conn_laddr_v6))) { 1028 break; 1029 } 1030 } 1031 mutex_exit(&connfp->connf_lock); 1032 if (oconnp != NULL) 1033 return (EADDRNOTAVAIL); 1034 1035 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || 1036 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1037 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1038 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { 1039 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1040 } else { 1041 IPCL_HASH_INSERT_BOUND(connfp, connp); 1042 } 1043 } else { 1044 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1045 } 1046 return (0); 1047 } 1048 1049 static int 1050 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) 1051 { 1052 connf_t *connfp; 1053 conn_t *tconnp; 1054 ipaddr_t laddr = connp->conn_laddr_v4; 1055 ipaddr_t faddr = connp->conn_faddr_v4; 1056 1057 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; 1058 mutex_enter(&connfp->connf_lock); 1059 for (tconnp = connfp->connf_head; tconnp != NULL; 1060 tconnp = tconnp->conn_next) { 1061 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { 1062 /* A tunnel is already bound to 
these addresses. */ 1063 mutex_exit(&connfp->connf_lock); 1064 return (EADDRINUSE); 1065 } 1066 } 1067 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1068 mutex_exit(&connfp->connf_lock); 1069 return (0); 1070 } 1071 1072 static int 1073 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) 1074 { 1075 connf_t *connfp; 1076 conn_t *tconnp; 1077 in6_addr_t *laddr = &connp->conn_laddr_v6; 1078 in6_addr_t *faddr = &connp->conn_faddr_v6; 1079 1080 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; 1081 mutex_enter(&connfp->connf_lock); 1082 for (tconnp = connfp->connf_head; tconnp != NULL; 1083 tconnp = tconnp->conn_next) { 1084 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { 1085 /* A tunnel is already bound to these addresses. */ 1086 mutex_exit(&connfp->connf_lock); 1087 return (EADDRINUSE); 1088 } 1089 } 1090 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1091 mutex_exit(&connfp->connf_lock); 1092 return (0); 1093 } 1094 1095 /* 1096 * Check for a MAC exemption conflict on a labeled system. Note that for 1097 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1098 * transport layer. This check is for binding all other protocols. 1099 * 1100 * Returns true if there's a conflict. 
1101 */ 1102 static boolean_t 1103 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1104 { 1105 connf_t *connfp; 1106 conn_t *tconn; 1107 1108 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1109 mutex_enter(&connfp->connf_lock); 1110 for (tconn = connfp->connf_head; tconn != NULL; 1111 tconn = tconn->conn_next) { 1112 /* We don't allow v4 fallback for v6 raw socket */ 1113 if (connp->conn_family != tconn->conn_family) 1114 continue; 1115 /* If neither is exempt, then there's no conflict */ 1116 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1117 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1118 continue; 1119 /* We are only concerned about sockets for a different zone */ 1120 if (connp->conn_zoneid == tconn->conn_zoneid) 1121 continue; 1122 /* If both are bound to different specific addrs, ok */ 1123 if (connp->conn_laddr_v4 != INADDR_ANY && 1124 tconn->conn_laddr_v4 != INADDR_ANY && 1125 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1126 continue; 1127 /* These two conflict; fail */ 1128 break; 1129 } 1130 mutex_exit(&connfp->connf_lock); 1131 return (tconn != NULL); 1132 } 1133 1134 static boolean_t 1135 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1136 { 1137 connf_t *connfp; 1138 conn_t *tconn; 1139 1140 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1141 mutex_enter(&connfp->connf_lock); 1142 for (tconn = connfp->connf_head; tconn != NULL; 1143 tconn = tconn->conn_next) { 1144 /* We don't allow v4 fallback for v6 raw socket */ 1145 if (connp->conn_family != tconn->conn_family) 1146 continue; 1147 /* If neither is exempt, then there's no conflict */ 1148 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1149 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1150 continue; 1151 /* We are only concerned about sockets for a different zone */ 1152 if (connp->conn_zoneid == tconn->conn_zoneid) 1153 continue; 1154 /* If both are bound to different addrs, ok */ 1155 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1156 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1157 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1158 &tconn->conn_laddr_v6)) 1159 continue; 1160 /* These two conflict; fail */ 1161 break; 1162 } 1163 mutex_exit(&connfp->connf_lock); 1164 return (tconn != NULL); 1165 } 1166 1167 /* 1168 * (v4, v6) bind hash insertion routines 1169 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1170 */ 1171 1172 int 1173 ipcl_bind_insert(conn_t *connp) 1174 { 1175 if (connp->conn_ipversion == IPV6_VERSION) 1176 return (ipcl_bind_insert_v6(connp)); 1177 else 1178 return (ipcl_bind_insert_v4(connp)); 1179 } 1180 1181 int 1182 ipcl_bind_insert_v4(conn_t *connp) 1183 { 1184 connf_t *connfp; 1185 int ret = 0; 1186 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1187 uint16_t lport = connp->conn_lport; 1188 uint8_t protocol = connp->conn_proto; 1189 1190 if (IPCL_IS_IPTUN(connp)) 1191 return (ipcl_iptun_hash_insert(connp, ipst)); 1192 1193 switch (protocol) { 1194 default: 1195 if (is_system_labeled() && 1196 check_exempt_conflict_v4(connp, ipst)) 1197 return (EADDRINUSE); 1198 /* FALLTHROUGH */ 1199 case IPPROTO_UDP: 1200 if (protocol == IPPROTO_UDP) { 1201 connfp = &ipst->ips_ipcl_udp_fanout[ 1202 IPCL_UDP_HASH(lport, ipst)]; 1203 } else { 1204 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1205 } 1206 1207 if (connp->conn_faddr_v4 != INADDR_ANY) { 1208 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1209 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1210 IPCL_HASH_INSERT_BOUND(connfp, connp); 1211 } else { 1212 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1213 } 1214 if (protocol == IPPROTO_RSVP) 1215 ill_set_inputfn_all(ipst); 1216 break; 1217 1218 case IPPROTO_TCP: 1219 /* Insert it in the Bind Hash */ 1220 ASSERT(connp->conn_zoneid != ALL_ZONES); 1221 connfp = &ipst->ips_ipcl_bind_fanout[ 1222 IPCL_BIND_HASH(lport, ipst)]; 1223 if (connp->conn_laddr_v4 != INADDR_ANY) { 1224 IPCL_HASH_INSERT_BOUND(connfp, connp); 1225 } else { 1226 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1227 } 1228 if (cl_inet_listen != NULL) { 1229 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1230 connp->conn_flags |= IPCL_CL_LISTENER; 1231 (*cl_inet_listen)( 1232 connp->conn_netstack->netstack_stackid, 1233 IPPROTO_TCP, AF_INET, 1234 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1235 } 1236 break; 1237 1238 case IPPROTO_SCTP: 1239 ret = ipcl_sctp_hash_insert(connp, lport); 1240 break; 1241 } 1242 1243 return (ret); 1244 } 1245 1246 int 1247 ipcl_bind_insert_v6(conn_t *connp) 1248 { 1249 connf_t *connfp; 1250 int ret = 0; 1251 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1252 uint16_t lport = connp->conn_lport; 1253 uint8_t protocol = connp->conn_proto; 1254 1255 if (IPCL_IS_IPTUN(connp)) { 1256 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1257 } 1258 1259 switch (protocol) { 1260 default: 1261 if (is_system_labeled() && 1262 check_exempt_conflict_v6(connp, ipst)) 1263 return (EADDRINUSE); 1264 /* FALLTHROUGH */ 1265 case IPPROTO_UDP: 1266 if (protocol == IPPROTO_UDP) { 1267 connfp = &ipst->ips_ipcl_udp_fanout[ 1268 IPCL_UDP_HASH(lport, ipst)]; 1269 } else { 1270 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1271 } 1272 1273 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1274 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1275 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1276 IPCL_HASH_INSERT_BOUND(connfp, connp); 1277 } else { 1278 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1279 } 1280 break; 1281 1282 case IPPROTO_TCP: 1283 /* Insert it in the Bind Hash */ 1284 ASSERT(connp->conn_zoneid != ALL_ZONES); 1285 connfp = &ipst->ips_ipcl_bind_fanout[ 1286 IPCL_BIND_HASH(lport, ipst)]; 1287 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1288 IPCL_HASH_INSERT_BOUND(connfp, connp); 1289 } else { 1290 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1291 } 1292 if (cl_inet_listen != NULL) { 1293 sa_family_t addr_family; 1294 uint8_t *laddrp; 1295 1296 if (connp->conn_ipversion == 
IPV6_VERSION) { 1297 addr_family = AF_INET6; 1298 laddrp = 1299 (uint8_t *)&connp->conn_bound_addr_v6; 1300 } else { 1301 addr_family = AF_INET; 1302 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1303 } 1304 connp->conn_flags |= IPCL_CL_LISTENER; 1305 (*cl_inet_listen)( 1306 connp->conn_netstack->netstack_stackid, 1307 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1308 } 1309 break; 1310 1311 case IPPROTO_SCTP: 1312 ret = ipcl_sctp_hash_insert(connp, lport); 1313 break; 1314 } 1315 1316 return (ret); 1317 } 1318 1319 /* 1320 * ipcl_conn_hash insertion routines. 1321 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1322 */ 1323 1324 int 1325 ipcl_conn_insert(conn_t *connp) 1326 { 1327 if (connp->conn_ipversion == IPV6_VERSION) 1328 return (ipcl_conn_insert_v6(connp)); 1329 else 1330 return (ipcl_conn_insert_v4(connp)); 1331 } 1332 1333 int 1334 ipcl_conn_insert_v4(conn_t *connp) 1335 { 1336 connf_t *connfp; 1337 conn_t *tconnp; 1338 int ret = 0; 1339 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1340 uint16_t lport = connp->conn_lport; 1341 uint8_t protocol = connp->conn_proto; 1342 1343 if (IPCL_IS_IPTUN(connp)) 1344 return (ipcl_iptun_hash_insert(connp, ipst)); 1345 1346 switch (protocol) { 1347 case IPPROTO_TCP: 1348 /* 1349 * For TCP, we check whether the connection tuple already 1350 * exists before allowing the connection to proceed. We 1351 * also allow indexing on the zoneid. This is to allow 1352 * multiple shared stack zones to have the same tcp 1353 * connection tuple. In practice this only happens for 1354 * INADDR_LOOPBACK as it's the only local address which 1355 * doesn't have to be unique. 
1356 */ 1357 connfp = &ipst->ips_ipcl_conn_fanout[ 1358 IPCL_CONN_HASH(connp->conn_faddr_v4, 1359 connp->conn_ports, ipst)]; 1360 mutex_enter(&connfp->connf_lock); 1361 for (tconnp = connfp->connf_head; tconnp != NULL; 1362 tconnp = tconnp->conn_next) { 1363 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1364 connp->conn_faddr_v4, connp->conn_laddr_v4, 1365 connp->conn_ports) && 1366 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1367 /* Already have a conn. bail out */ 1368 mutex_exit(&connfp->connf_lock); 1369 return (EADDRINUSE); 1370 } 1371 } 1372 if (connp->conn_fanout != NULL) { 1373 /* 1374 * Probably a XTI/TLI application trying to do a 1375 * rebind. Let it happen. 1376 */ 1377 mutex_exit(&connfp->connf_lock); 1378 IPCL_HASH_REMOVE(connp); 1379 mutex_enter(&connfp->connf_lock); 1380 } 1381 1382 ASSERT(connp->conn_recv != NULL); 1383 ASSERT(connp->conn_recvicmp != NULL); 1384 1385 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1386 mutex_exit(&connfp->connf_lock); 1387 break; 1388 1389 case IPPROTO_SCTP: 1390 /* 1391 * The raw socket may have already been bound, remove it 1392 * from the hash first. 1393 */ 1394 IPCL_HASH_REMOVE(connp); 1395 ret = ipcl_sctp_hash_insert(connp, lport); 1396 break; 1397 1398 default: 1399 /* 1400 * Check for conflicts among MAC exempt bindings. For 1401 * transports with port numbers, this is done by the upper 1402 * level per-transport binding logic. For all others, it's 1403 * done here. 
1404 */ 1405 if (is_system_labeled() && 1406 check_exempt_conflict_v4(connp, ipst)) 1407 return (EADDRINUSE); 1408 /* FALLTHROUGH */ 1409 1410 case IPPROTO_UDP: 1411 if (protocol == IPPROTO_UDP) { 1412 connfp = &ipst->ips_ipcl_udp_fanout[ 1413 IPCL_UDP_HASH(lport, ipst)]; 1414 } else { 1415 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1416 } 1417 1418 if (connp->conn_faddr_v4 != INADDR_ANY) { 1419 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1420 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1421 IPCL_HASH_INSERT_BOUND(connfp, connp); 1422 } else { 1423 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1424 } 1425 break; 1426 } 1427 1428 return (ret); 1429 } 1430 1431 int 1432 ipcl_conn_insert_v6(conn_t *connp) 1433 { 1434 connf_t *connfp; 1435 conn_t *tconnp; 1436 int ret = 0; 1437 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1438 uint16_t lport = connp->conn_lport; 1439 uint8_t protocol = connp->conn_proto; 1440 uint_t ifindex = connp->conn_bound_if; 1441 1442 if (IPCL_IS_IPTUN(connp)) 1443 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1444 1445 switch (protocol) { 1446 case IPPROTO_TCP: 1447 1448 /* 1449 * For tcp, we check whether the connection tuple already 1450 * exists before allowing the connection to proceed. We 1451 * also allow indexing on the zoneid. This is to allow 1452 * multiple shared stack zones to have the same tcp 1453 * connection tuple. In practice this only happens for 1454 * ipv6_loopback as it's the only local address which 1455 * doesn't have to be unique. 1456 */ 1457 connfp = &ipst->ips_ipcl_conn_fanout[ 1458 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1459 ipst)]; 1460 mutex_enter(&connfp->connf_lock); 1461 for (tconnp = connfp->connf_head; tconnp != NULL; 1462 tconnp = tconnp->conn_next) { 1463 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1464 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1465 connp->conn_faddr_v6, connp->conn_laddr_v6, 1466 connp->conn_ports) && 1467 (tconnp->conn_bound_if == 0 || 1468 tconnp->conn_bound_if == ifindex) && 1469 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1470 /* Already have a conn. bail out */ 1471 mutex_exit(&connfp->connf_lock); 1472 return (EADDRINUSE); 1473 } 1474 } 1475 if (connp->conn_fanout != NULL) { 1476 /* 1477 * Probably a XTI/TLI application trying to do a 1478 * rebind. Let it happen. 1479 */ 1480 mutex_exit(&connfp->connf_lock); 1481 IPCL_HASH_REMOVE(connp); 1482 mutex_enter(&connfp->connf_lock); 1483 } 1484 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1485 mutex_exit(&connfp->connf_lock); 1486 break; 1487 1488 case IPPROTO_SCTP: 1489 IPCL_HASH_REMOVE(connp); 1490 ret = ipcl_sctp_hash_insert(connp, lport); 1491 break; 1492 1493 default: 1494 if (is_system_labeled() && 1495 check_exempt_conflict_v6(connp, ipst)) 1496 return (EADDRINUSE); 1497 /* FALLTHROUGH */ 1498 case IPPROTO_UDP: 1499 if (protocol == IPPROTO_UDP) { 1500 connfp = &ipst->ips_ipcl_udp_fanout[ 1501 IPCL_UDP_HASH(lport, ipst)]; 1502 } else { 1503 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1504 } 1505 1506 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1507 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1508 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1509 IPCL_HASH_INSERT_BOUND(connfp, connp); 1510 } else { 1511 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1512 } 1513 break; 1514 } 1515 1516 return (ret); 1517 } 1518 1519 /* 1520 * v4 packet classifying function. looks up the fanout table to 1521 * find the conn, the packet belongs to. returns the conn with 1522 * the reference held, null otherwise. 1523 * 1524 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1525 * Lookup" comment block are applied. Labels are also checked as described 1526 * above. 
 * If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try the fully-connected conn fanout ... */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* ... then fall back to the bind fanout (listeners). */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}

/*
 * v6 counterpart of ipcl_classify_v4: same connected-then-bound lookup
 * order for TCP, direct fanout lookup for UDP, and iptun classification
 * for encapsulated protocols.  Returns the conn with a reference held,
 * or NULL.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t		*tcpha;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	if (ira->ira_flags & IRAF_IS_IPV4) {
		return (ipcl_classify_v4(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	} else {
		return (ipcl_classify_v6(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	}
}

/*
 * Only used to classify SCTP RAW sockets
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		/*
		 * Connected raw sockets are matched on the full tuple;
		 * others on the bound local address/port.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* `connfp' is still locked; take the caller's reference. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * kmem cache constructor for TCP conns: the cache object is an itc_t
 * holding the conn_t with the tcp_t placed immediately after it.
 * Returns ENOMEM if the timer mblk or transmit attributes can't be
 * allocated.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for TCP conns; undoes tcp_conn_constructor.
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for plain IP conns (no transport state).
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for plain IP conns.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for UDP conns: udp_t follows the conn_t in
 * the itc_t cache object.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for UDP conns.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for raw IP (ICMP) conns: icmp_t follows the
 * conn_t in the itc_t cache object.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for raw IP (ICMP) conns.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for routing socket conns: rts_t follows the
 * conn_t in the itc_t cache object.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/*
 * kmem cache destructor for routing socket conn_t's; mirror of
 * rts_conn_constructor.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	/* The transmit attributes are reset and reused, not freed */
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/* Clear out the conn_t fields that are not preserved */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
2244 * 2245 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2246 * call the insert and delete functions below at creation and deletion 2247 * time respectively. The conn never moves or changes its position in this 2248 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2249 * won't increase due to walkers, once the conn deletion has started. Note 2250 * that we can't remove the conn from the global list and then wait for 2251 * the refcnt to drop to zero, since walkers would then see a truncated 2252 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2253 * conns until ip_open is ready to make them globally visible. 2254 * The global round robin multi-list locks are held only to get the 2255 * next member/insertion/deletion and contention should be negligible 2256 * if the multi-list is much greater than the number of cpus. 2257 */ 2258 void 2259 ipcl_globalhash_insert(conn_t *connp) 2260 { 2261 int index; 2262 struct connf_s *connfp; 2263 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2264 2265 /* 2266 * No need for atomic here. Approximate even distribution 2267 * in the global lists is sufficient. 2268 */ 2269 ipst->ips_conn_g_index++; 2270 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2271 2272 connp->conn_g_prev = NULL; 2273 /* 2274 * Mark as INCIPIENT, so that walkers will ignore this 2275 * for now, till ip_open is ready to make it visible globally. 
2276 */ 2277 connp->conn_state_flags |= CONN_INCIPIENT; 2278 2279 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2280 /* Insert at the head of the list */ 2281 mutex_enter(&connfp->connf_lock); 2282 connp->conn_g_next = connfp->connf_head; 2283 if (connp->conn_g_next != NULL) 2284 connp->conn_g_next->conn_g_prev = connp; 2285 connfp->connf_head = connp; 2286 2287 /* The fanout bucket this conn points to */ 2288 connp->conn_g_fanout = connfp; 2289 2290 mutex_exit(&connfp->connf_lock); 2291 } 2292 2293 void 2294 ipcl_globalhash_remove(conn_t *connp) 2295 { 2296 struct connf_s *connfp; 2297 2298 /* 2299 * We were never inserted in the global multi list. 2300 * IPCL_NONE variety is never inserted in the global multilist 2301 * since it is presumed to not need any cleanup and is transient. 2302 */ 2303 if (connp->conn_g_fanout == NULL) 2304 return; 2305 2306 connfp = connp->conn_g_fanout; 2307 mutex_enter(&connfp->connf_lock); 2308 if (connp->conn_g_prev != NULL) 2309 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2310 else 2311 connfp->connf_head = connp->conn_g_next; 2312 if (connp->conn_g_next != NULL) 2313 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2314 mutex_exit(&connfp->connf_lock); 2315 2316 /* Better to stumble on a null pointer than to corrupt memory */ 2317 connp->conn_g_next = NULL; 2318 connp->conn_g_prev = NULL; 2319 connp->conn_g_fanout = NULL; 2320 } 2321 2322 /* 2323 * Walk the list of all conn_t's in the system, calling the function provided 2324 * With the specified argument for each. 2325 * Applies to both IPv4 and IPv6. 2326 * 2327 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2328 * conn_oper_pending_ill). To guard against stale pointers 2329 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2330 * unplumbed or removed. New conn_t's that are created while we are walking 2331 * may be missed by this walk, because they are not necessarily inserted 2332 * at the tail of the list. 
 * They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold the conn so it cannot go away while func runs,
			 * and drop both locks before the callback.  The conn
			 * never moves in this list, so the walk can resume
			 * from it after func returns.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * Release the hold taken on the conn processed in the
			 * previous iteration; done outside the bucket lock.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reverse lookup: the peer's local port is our foreign port */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 * We don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reverse lookup: the peer's local port is our foreign port */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Reverse lookup: the peer's local port is our foreign port */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Caller bounds acceptable states from below via min_state */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Reverse lookup: the peer's local port is our foreign port */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* A conn bound to an interface only matches that ifindex */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 * Returns with a conn reference held; caller must CONN_DEC_REF it.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * NOTE(review): tcp_listener == NULL appears to select conns
		 * that are not attached to some other listener — confirm
		 * against tcp_input_listener users.
		 */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 * Returns with a conn reference held; caller must CONN_DEC_REF it.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* A conn bound to an interface only matches that ifindex */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start the iteration from the bucket head */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold it under conn_lock so it can't be condemned first */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the hold the caller had on the current conn */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;	/* circular trace buffer wraps around */

	/* Record the current kernel stack in the next trace slot */
	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/*
 * Record the stack at refrele time; uses the same circular trace buffer
 * as conn_trace_ref.
 */
int
conn_untrace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;	/* circular trace buffer wraps around */

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif

/*
 * Fill in *sie with the identity (inode, device) of the socket underlying
 * connp, for MIB2 socket information reporting.  Returns sie on success,
 * or NULL if the conn is closing or no vnode/attributes could be obtained.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	mutex_exit(&connp->conn_lock);

	/*
	 * Obtain the backing vnode: via the su_get_vnode upcall for sockets
	 * with an upper handle (the vnode is VN_RELE'd below, so the upcall
	 * is expected to return it held), otherwise from the stream head,
	 * taking an explicit hold.
	 */
	if (connp->conn_upper_handle != NULL) {
		vn = (*connp->conn_upcalls->su_get_vnode)
		    (connp->conn_upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}