1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 24 * Copyright 2022 Joyent, Inc. 25 * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org> 26 */ 27 28 /* 29 * IP PACKET CLASSIFIER 30 * 31 * The IP packet classifier provides mapping between IP packets and persistent 32 * connection state for connection-oriented protocols. It also provides 33 * interface for managing connection states. 34 * 35 * The connection state is kept in conn_t data structure and contains, among 36 * other things: 37 * 38 * o local/remote address and ports 39 * o Transport protocol 40 * o squeue for the connection (for TCP only) 41 * o reference counter 42 * o Connection state 43 * o hash table linkage 44 * o interface/ire information 45 * o credentials 46 * o ipsec policy 47 * o send and receive functions. 48 * o mutex lock. 49 * 50 * Connections use a reference counting scheme. They are freed when the 51 * reference counter drops to zero. 
A reference is incremented when connection 52 * is placed in a list or table, when incoming packet for the connection arrives 53 * and when connection is processed via squeue (squeue processing may be 54 * asynchronous and the reference protects the connection from being destroyed 55 * before its processing is finished). 56 * 57 * conn_recv is used to pass up packets to the ULP. 58 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 59 * a listener, and changes to tcp_input_listener as the listener has picked a 60 * good squeue. For other cases it is set to tcp_input_data. 61 * 62 * conn_recvicmp is used to pass up ICMP errors to the ULP. 63 * 64 * Classifier uses several hash tables: 65 * 66 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 67 * ipcl_bind_fanout: contains all connections in BOUND state 68 * ipcl_proto_fanout: IPv4 protocol fanout 69 * ipcl_proto_fanout_v6: IPv6 protocol fanout 70 * ipcl_udp_fanout: contains all UDP connections 71 * ipcl_iptun_fanout: contains all IP tunnel connections 72 * ipcl_globalhash_fanout: contains all connections 73 * 74 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 75 * which need to view all existing connections. 76 * 77 * All tables are protected by per-bucket locks. When both per-bucket lock and 78 * connection lock need to be held, the per-bucket lock should be acquired 79 * first, followed by the connection lock. 80 * 81 * All functions doing search in one of these tables increment a reference 82 * counter on the connection found (if any). This reference should be dropped 83 * when the caller has finished processing the connection. 84 * 85 * 86 * INTERFACES: 87 * =========== 88 * 89 * Connection Lookup: 90 * ------------------ 91 * 92 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 93 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 94 * 95 * Finds connection for an incoming IPv4 or IPv6 packet. 
Returns NULL if 96 * it can't find any associated connection. If the connection is found, its 97 * reference counter is incremented. 98 * 99 * mp: mblock, containing packet header. The full header should fit 100 * into a single mblock. It should also contain at least full IP 101 * and TCP or UDP header. 102 * 103 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 104 * 105 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 106 * the packet. 107 * 108 * ira->ira_zoneid: The zone in which the returned connection must be; the 109 * zoneid corresponding to the ire_zoneid on the IRE located for 110 * the packet's destination address. 111 * 112 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and 113 * IRAF_TX_SHARED_ADDR flags 114 * 115 * For TCP connections, the lookup order is as follows: 116 * 5-tuple {src, dst, protocol, local port, remote port} 117 * lookup in ipcl_conn_fanout table. 118 * 3-tuple {dst, remote port, protocol} lookup in 119 * ipcl_bind_fanout table. 120 * 121 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 122 * remote port} lookup is done on ipcl_udp_fanout. Note that, 123 * these interfaces do not handle cases where a packets belongs 124 * to multiple UDP clients, which is handled in IP itself. 125 * 126 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 127 * determine which actual zone gets the segment. This is used only in a 128 * labeled environment. The matching rules are: 129 * 130 * - If it's not a multilevel port, then the label on the packet selects 131 * the zone. Unlabeled packets are delivered to the global zone. 132 * 133 * - If it's a multilevel port, then only the zone registered to receive 134 * packets on that port matches. 135 * 136 * Also, in a labeled environment, packet labels need to be checked. 
For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
167 * 168 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol, 169 * zoneid, ip_stack); 170 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex, 171 * zoneid, ip_stack); 172 * 173 * Lookup routine to find a listener with the tuple {lport, laddr, 174 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 175 * parameter interface index is also compared. 176 * 177 * void ipcl_walk(func, arg, ip_stack) 178 * 179 * Apply 'func' to every connection available. The 'func' is called as 180 * (*func)(connp, arg). The walk is non-atomic so connections may be 181 * created and destroyed during the walk. The CONN_CONDEMNED and 182 * CONN_INCIPIENT flags ensure that connections which are newly created 183 * or being destroyed are not selected by the walker. 184 * 185 * Table Updates 186 * ------------- 187 * 188 * int ipcl_conn_insert(connp); 189 * int ipcl_conn_insert_v4(connp); 190 * int ipcl_conn_insert_v6(connp); 191 * 192 * Insert 'connp' in the ipcl_conn_fanout. 193 * Arguments : 194 * connp conn_t to be inserted 195 * 196 * Return value : 197 * 0 if connp was inserted 198 * EADDRINUSE if the connection with the same tuple 199 * already exists. 200 * 201 * int ipcl_bind_insert(connp); 202 * int ipcl_bind_insert_v4(connp); 203 * int ipcl_bind_insert_v6(connp); 204 * 205 * Insert 'connp' in ipcl_bind_fanout. 206 * Arguments : 207 * connp conn_t to be inserted 208 * 209 * 210 * void ipcl_hash_remove(connp); 211 * 212 * Removes the 'connp' from the connection fanout table. 213 * 214 * Connection Creation/Destruction 215 * ------------------------------- 216 * 217 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 218 * 219 * Creates a new conn based on the type flag, inserts it into 220 * globalhash table. 221 * 222 * type: This flag determines the type of conn_t which needs to be 223 * created i.e., which kmem_cache it comes from. 
224 * IPCL_TCPCONN indicates a TCP connection 225 * IPCL_SCTPCONN indicates a SCTP connection 226 * IPCL_UDPCONN indicates a UDP conn_t. 227 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 228 * IPCL_RTSCONN indicates a RTS conn_t. 229 * IPCL_IPCCONN indicates all other connections. 230 * 231 * void ipcl_conn_destroy(connp) 232 * 233 * Destroys the connection state, removes it from the global 234 * connection hash table and frees its memory. 235 */ 236 237 #include <sys/types.h> 238 #include <sys/stream.h> 239 #include <sys/stropts.h> 240 #include <sys/sysmacros.h> 241 #include <sys/strsubr.h> 242 #include <sys/strsun.h> 243 #define _SUN_TPI_VERSION 2 244 #include <sys/ddi.h> 245 #include <sys/cmn_err.h> 246 #include <sys/debug.h> 247 248 #include <sys/systm.h> 249 #include <sys/param.h> 250 #include <sys/kmem.h> 251 #include <sys/isa_defs.h> 252 #include <inet/common.h> 253 #include <netinet/ip6.h> 254 #include <netinet/icmp6.h> 255 256 #include <inet/ip.h> 257 #include <inet/ip_if.h> 258 #include <inet/ip_ire.h> 259 #include <inet/ip6.h> 260 #include <inet/ip_ndp.h> 261 #include <inet/ip_impl.h> 262 #include <inet/udp_impl.h> 263 #include <inet/sctp_ip.h> 264 #include <inet/sctp/sctp_impl.h> 265 #include <inet/rawip_impl.h> 266 #include <inet/rts_impl.h> 267 #include <inet/iptun/iptun_impl.h> 268 269 #include <sys/cpuvar.h> 270 271 #include <inet/ipclassifier.h> 272 #include <inet/tcp.h> 273 #include <inet/ipsec_impl.h> 274 275 #include <sys/tsol/tnet.h> 276 #include <sys/sockio.h> 277 278 /* Old value for compatibility. Setable in /etc/system */ 279 uint_t tcp_conn_hash_size = 0; 280 281 /* New value. Zero means choose automatically. Setable in /etc/system */ 282 uint_t ipcl_conn_hash_size = 0; 283 uint_t ipcl_conn_hash_memfactor = 8192; 284 uint_t ipcl_conn_hash_maxsize = 82500; 285 286 /* bind/udp fanout table size */ 287 uint_t ipcl_bind_fanout_size = 512; 288 uint_t ipcl_udp_fanout_size = 16384; 289 290 /* Raw socket fanout size. Must be a power of 2. 
 */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;

/* One kmem cache per conn_t flavor; each entry also holds the ULP state. */
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
extern struct kmem_cache *sctp_conn_cache;
struct kmem_cache *udp_conn_cache;
struct kmem_cache *rawip_conn_cache;
struct kmem_cache *rts_conn_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* Per-flavor kmem cache constructor/destructor callbacks. */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);

/*
 * Global (for all stack instances) init routine
 *
 * Creates the kmem caches used by ipcl_conn_create().  For TCP, UDP,
 * RAWIP and RTS each cache object is sized itc_t + ULP state so that a
 * single allocation covers both the conn_t and the protocol structure;
 * only the TCP cache registers a reclaim callback (tcp_conn_reclaim).
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}

/*
 * ipclassifier initialization routine, sets up hash tables.
 *
 * Sizes the conn fanout from /etc/system overrides (ipcl_conn_hash_size,
 * else the legacy tcp_conn_hash_size) or from physical memory, then
 * rounds it to a prime from P2Ps().  Allocates and lock-initializes all
 * per-stack fanout tables.
 */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* Default: scale with physical memory, capped below. */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the request up to the nearest prime in P2Ps().  Starting
	 * at index 9 imposes a floor of 383 buckets; the trailing 0
	 * sentinel in the table triggers the fallback below if the
	 * request exceeds the largest listed prime.
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Protocol fanouts are indexed directly by IP protocol number. */
	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Routing-socket clients live on a single shared list. */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Global hash: every conn_t is present here, for walkers. */
	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

/*
 * Global destroy routine: tears down the kmem caches created by
 * ipcl_g_init() once no netstack can allocate from them any more.
 */
void
ipcl_g_destroy(void)
{
	kmem_cache_destroy(ip_conn_cache);
	kmem_cache_destroy(tcp_conn_cache);
	kmem_cache_destroy(udp_conn_cache);
	kmem_cache_destroy(rawip_conn_cache);
	kmem_cache_destroy(rts_conn_cache);
}

/*
 * All user-level and kernel use of the stack must be gone
 * by now.
 *
 * Frees every fanout table allocated by ipcl_init(); the ASSERTs check
 * that each bucket has been emptied before its lock is destroyed.
 */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v4 = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_iptun_fanout,
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
	ipst->ips_ipcl_iptun_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}

/*
 * conn creation routine. initialize the conn, sets the reference
 * and inserts it in the global hash table.
 */
conn_t *
ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
{
	conn_t	*connp;
	struct kmem_cache *conn_cache;

	switch (type) {
	case IPCL_SCTPCONN:
		/*
		 * SCTP conn_ts are initialized by sctp_conn_init() rather
		 * than by the generic path below; set up netstack linkage
		 * and return directly.
		 */
		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
			return (NULL);
		sctp_conn_init(connp);
		netstack_hold(ns);
		connp->conn_netstack = ns;
		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
		connp->conn_ixa->ixa_conn_id = (long)connp;
		ipcl_globalhash_insert(connp);
		return (connp);

	case IPCL_TCPCONN:
		conn_cache = tcp_conn_cache;
		break;

	case IPCL_UDPCONN:
		conn_cache = udp_conn_cache;
		break;

	case IPCL_RAWIPCONN:
		conn_cache = rawip_conn_cache;
		break;

	case IPCL_RTSCONN:
		conn_cache = rts_conn_cache;
		break;

	case IPCL_IPCCONN:
		conn_cache = ip_conn_cache;
		break;

	default:
		/*
		 * NOTE(review): ASSERT() is a no-op on non-DEBUG kernels,
		 * so an unknown type would pass the NULL conn_cache to
		 * kmem_cache_alloc() below — callers must only pass the
		 * IPCL_* types listed above.
		 */
		conn_cache = NULL;
		connp = NULL;
		ASSERT(0);
	}

	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
		return (NULL);

	/* The caller receives the initial reference. */
	connp->conn_ref = 1;
	netstack_hold(ns);
	connp->conn_netstack = ns;
	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
	connp->conn_ixa->ixa_conn_id = (long)connp;
	ipcl_globalhash_insert(connp);
	return (connp);
}

/*
 * Final teardown of a conn_t whose reference count has reached zero:
 * releases credentials, cached headers, IPsec state and netstack hold,
 * then returns the conn_t to the kmem cache matching its IPCL_* flavor.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached transmit header template, if any. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release IPsec latch/policy references. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Preserve the timer mblk across the bzero() below. */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/* Scrub the tcp_t, then restore what the cache reuses. */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP frees through its own cache/teardown path. */
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* Reset to the cache's constructed default protocol. */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}

/*
 * Running in cluster mode - deregister listener information
 */
static void
ipcl_conn_unlisten(conn_t *connp)
{
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
	ASSERT(connp->conn_lport != 0);

	if (cl_inet_unlisten != NULL) {
		sa_family_t	addr_family;
		uint8_t		*laddrp;

		if (connp->conn_ipversion == IPV6_VERSION) {
			addr_family = AF_INET6;
			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
		} else {
			addr_family = AF_INET;
			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
		}
		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
	}
	connp->conn_flags &= ~IPCL_CL_LISTENER;
}

/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/*
 * Remove a conn from whatever fanout it is currently on.  For RSVP the
 * IP input functions must be recomputed since RSVP interception depends
 * on whether any RSVP client remains.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	uint8_t		protocol = connp->conn_proto;

	IPCL_HASH_REMOVE(connp);
	if (protocol == IPPROTO_RSVP)
		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
}

/*
 * The whole purpose of this function is allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	/* Unlink from the bucket's doubly-linked list. */
	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/* Drop the hash-list reference directly; caller holds conn_lock. */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}

/* Insert at the head of a connected-fanout bucket; bucket lock held. */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a bound conn ahead of any V4 wildcard-bound entries so that
 * more specific bindings are found first during lookup.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard-bound conn.  A v4-mapped wildcard is placed before
 * any unspecified-address (v6 wildcard) entry of the same zone so that
 * AF_INET binds are matched ahead of AF_INET6 wildcard binds.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/* Exported wrapper around IPCL_HASH_INSERT_WILDCARD. */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * Because the classifier is used to classify inbound packets, the destination
 * address is meant to be our local tunnel address (tunnel source), and the
 * source the remote tunnel address (tunnel destination).
 *
 * Note that conn_proto can't be used for fanout since the upper protocol
 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
945 */ 946 conn_t * 947 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 948 { 949 connf_t *connfp; 950 conn_t *connp; 951 952 /* first look for IPv4 tunnel links */ 953 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 954 mutex_enter(&connfp->connf_lock); 955 for (connp = connfp->connf_head; connp != NULL; 956 connp = connp->conn_next) { 957 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 958 break; 959 } 960 if (connp != NULL) 961 goto done; 962 963 mutex_exit(&connfp->connf_lock); 964 965 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 966 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 967 INADDR_ANY)]; 968 mutex_enter(&connfp->connf_lock); 969 for (connp = connfp->connf_head; connp != NULL; 970 connp = connp->conn_next) { 971 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 972 break; 973 } 974 done: 975 if (connp != NULL) 976 CONN_INC_REF(connp); 977 mutex_exit(&connfp->connf_lock); 978 return (connp); 979 } 980 981 conn_t * 982 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 983 { 984 connf_t *connfp; 985 conn_t *connp; 986 987 /* Look for an IPv6 tunnel link */ 988 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 989 mutex_enter(&connfp->connf_lock); 990 for (connp = connfp->connf_head; connp != NULL; 991 connp = connp->conn_next) { 992 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 993 CONN_INC_REF(connp); 994 break; 995 } 996 } 997 mutex_exit(&connfp->connf_lock); 998 return (connp); 999 } 1000 1001 /* 1002 * This function is used only for inserting SCTP raw socket now. 1003 * This may change later. 1004 * 1005 * Note that only one raw socket can be bound to a port. The param 1006 * lport is in network byte order. 
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict if same port/zone/family and either side is
		 * bound to a wildcard address, or both are bound to the
		 * exact same local address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Pick the insertion flavor from the addresses already set up by
	 * the caller: connected (faddr set), bound (laddr set) or wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}

/*
 * Insert an IPv4 IP tunnel conn in the iptun fanout, failing with
 * EADDRINUSE if a tunnel is already bound to the same address pair.
 */
static int
ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	ipaddr_t laddr = connp->conn_laddr_v4;
	ipaddr_t faddr = connp->conn_faddr_v4;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/* IPv6 counterpart of ipcl_iptun_hash_insert(). */
static int
ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	in6_addr_t *laddr = &connp->conn_laddr_v6;
	in6_addr_t *faddr = &connp->conn_faddr_v6;

	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {
		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
			/* A tunnel is already bound to these addresses. */
			mutex_exit(&connfp->connf_lock);
			return (EADDRINUSE);
		}
	}
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
	mutex_exit(&connfp->connf_lock);
	return (0);
}

/*
 * Check for a MAC exemption conflict on a labeled system. Note that for
 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
 * transport layer. This check is for binding all other protocols.
 *
 * Returns true if there's a conflict.
 */
static boolean_t
check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t *tconn;

	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
	mutex_enter(&connfp->connf_lock);
	for (tconn = connfp->connf_head; tconn != NULL;
	    tconn = tconn->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket */
		if (connp->conn_family != tconn->conn_family)
			continue;
		/* If neither is exempt, then there's no conflict */
		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
			continue;
		/* We are only concerned about sockets for a different zone */
		if (connp->conn_zoneid == tconn->conn_zoneid)
			continue;
		/* If both are bound to different specific addrs, ok */
		if (connp->conn_laddr_v4 != INADDR_ANY &&
		    tconn->conn_laddr_v4 != INADDR_ANY &&
		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
			continue;
		/* These two conflict; fail */
		break;
	}
	mutex_exit(&connfp->connf_lock);
	/* Loop ended early (tconn != NULL) iff a conflict was found. */
	return (tconn != NULL);
}

/* IPv6 counterpart of check_exempt_conflict_v4(). */
static boolean_t
check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t *tconn;

	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
	mutex_enter(&connfp->connf_lock);
	for (tconn = connfp->connf_head; tconn != NULL;
	    tconn = tconn->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket */
		if (connp->conn_family != tconn->conn_family)
			continue;
		/* If neither is exempt, then there's no conflict */
		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
			continue;
		/* We are only concerned about sockets for a different zone */
		if (connp->conn_zoneid == tconn->conn_zoneid)
			continue;
		/* If both are bound to different addrs, ok */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
		    &tconn->conn_laddr_v6))
			continue;
		/* These two conflict; fail */
		break;
	}
	mutex_exit(&connfp->connf_lock);
	return (tconn != NULL);
}

/*
 * (v4, v6) bind hash insertion routines
 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
 */

int
ipcl_bind_insert(conn_t *connp)
{
	/* Dispatch on the conn's IP version. */
	if (connp->conn_ipversion == IPV6_VERSION)
		return (ipcl_bind_insert_v6(connp));
	else
		return (ipcl_bind_insert_v4(connp));
}

int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels go in their own fanout, keyed by address pair. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	/*
	 * Note the ordering: "default" precedes IPPROTO_UDP so that the
	 * labeled-system conflict check falls through into the shared
	 * UDP/protocol-fanout insertion code.
	 */
	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the clustering subsystem, if loaded. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}

int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * Notify the clustering subsystem; an AF_INET6 socket may
		 * still be bound to a v4 address, hence the version check.
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}

/*
 * ipcl_conn_hash insertion routines.
 * The caller has already set conn_proto and the addresses/ports in the conn_t.
 */

int
ipcl_conn_insert(conn_t *connp)
{
	/* Dispatch on the conn's IP version. */
	if (connp->conn_ipversion == IPV6_VERSION)
		return (ipcl_conn_insert_v6(connp));
	else
		return (ipcl_conn_insert_v4(connp));
}

int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* Remove any previous raw-socket binding first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}

/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied. Labels are also checked as described
 * above. If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;
	int	ifindex = ira->ira_ruifindex;

	ipha = (ipha_t *)mp->b_rptr;
	/* Ports sit right after the IP header for both TCP and UDP. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		/* First try the conn fanout for a fully-connected match. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);
		/* No connected match; fall back to the bind (listener) hash */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		/*
		 * NOTE(review): unlike the TCP cases above and the v6 UDP
		 * case, this match does not also require
		 * IRAF_TX_SHARED_ADDR for MAC-exempt conns — confirm
		 * whether that asymmetry is intentional.
		 */
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}

/* IPv6 counterpart of ipcl_classify_v4(); same search rules. */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t	*tcpha;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;
	int	ifindex = ira->ira_ruifindex;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		/* First try the conn fanout for a fully-connected match. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected match; fall back to the bind (listener) hash */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_incoming_ifindex == 0 ||
			    connp->conn_incoming_ifindex == ifindex) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	if (ira->ira_flags & IRAF_IS_IPV4) {
		return (ipcl_classify_v4(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	} else {
		return (ipcl_classify_v6(mp, ira->ira_protocol,
		    ira->ira_ip_hdr_length, ira, ipst));
	}
}

/*
 * Only used to classify SCTP RAW sockets
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t	*connfp;
	conn_t	*connp;
	in_port_t	lport;
	int	ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/* First pass: look for an exact (connected or bound) match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Connected socket: match the full 5-tuple. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Bound-only socket: match local address and port. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Reference for the caller; bucket lock still held here. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * kmem cache constructor for TCP conns: an itc_t holds the conn_t with the
 * tcp_t immediately following it.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/* kmem cache destructor for TCP conns. */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/* kmem cache constructor for plain IP conns. */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/* kmem cache destructor for plain IP conns. */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * kmem cache constructor for UDP conns: the udp_t immediately follows
 * the conn_t in the itc_t buffer.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}

/* kmem cache destructor for UDP conns. */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/* kmem cache constructor for raw IP (ICMP) conns. */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
2111 connp->conn_proto = IPPROTO_ICMP; 2112 icmp->icmp_connp = connp; 2113 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2114 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2115 if (connp->conn_ixa == NULL) 2116 return (ENOMEM); 2117 connp->conn_ixa->ixa_refcnt = 1; 2118 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2119 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2120 return (0); 2121 } 2122 2123 /* ARGSUSED */ 2124 static void 2125 rawip_conn_destructor(void *buf, void *cdrarg) 2126 { 2127 itc_t *itc = (itc_t *)buf; 2128 conn_t *connp = &itc->itc_conn; 2129 icmp_t *icmp = (icmp_t *)&itc[1]; 2130 2131 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2132 ASSERT(icmp->icmp_connp == connp); 2133 ASSERT(connp->conn_icmp == icmp); 2134 mutex_destroy(&connp->conn_lock); 2135 cv_destroy(&connp->conn_cv); 2136 rw_destroy(&connp->conn_ilg_lock); 2137 2138 /* Can be NULL if constructor failed */ 2139 if (connp->conn_ixa != NULL) { 2140 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2141 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2142 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2143 ixa_refrele(connp->conn_ixa); 2144 } 2145 } 2146 2147 /* ARGSUSED */ 2148 static int 2149 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2150 { 2151 itc_t *itc = (itc_t *)buf; 2152 conn_t *connp = &itc->itc_conn; 2153 rts_t *rts = (rts_t *)&itc[1]; 2154 2155 bzero(connp, sizeof (conn_t)); 2156 bzero(rts, sizeof (rts_t)); 2157 2158 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2159 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2160 connp->conn_rts = rts; 2161 connp->conn_flags = IPCL_RTSCONN; 2162 rts->rts_connp = connp; 2163 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2164 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2165 if (connp->conn_ixa == NULL) 2166 return (ENOMEM); 2167 connp->conn_ixa->ixa_refcnt = 1; 2168 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2169 return (0); 2170 } 2171 
/*
 * kmem cache destructor for routing-socket conns; mirror image of
 * rts_conn_constructor.  conn_ixa may be NULL if the constructor failed.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	rts_t *rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}

/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t *ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.  Everything
	 * from conn_start_clr to the end of the structure is zeroed; the
	 * fields before it (including the cache-constructed locks, CVs
	 * and conn_ixa pointer) survive for reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round robin multi-list locks are held only to get the
 * next member/insertion/deletion and contention should be negligible
 * if the multi-list is much greater than the number of cpus.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int index;
	struct connf_s *connfp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}

/*
 * Unlink connp from the global multi-list; no-op if it was never
 * inserted.  The conn_g_* pointers are deliberately NULLed afterwards.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s *connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}

/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list.
 * They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int i;
	conn_t *connp;
	conn_t *prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			/* Skip conns that are going away or not yet visible */
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference on connp so it survives while the
			 * bucket lock is dropped for the callback; the
			 * previous conn's reference is released only after
			 * the callback returns.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Build the ports key with local/foreign swapped (reverse lookup). */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* A conn can never be its own peer. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.  We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Build the ports key with local/foreign swapped (reverse lookup). */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	/* Build the ports key with local/foreign swapped (reverse lookup). */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t *tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	/* Build the ports key with local/foreign swapped (reverse lookup). */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/* conn_bound_if == 0 means not bound to any interface */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Returns with a conn reference held;
 * caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* tcp_listener == NULL selects the listener itself */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Also matches on the bound interface
 * (conn_bound_if) when the listener is bound to one.  Returns with a conn
 * reference held; caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp = NULL;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t *next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL connp means start from the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the reference the caller held on the previous conn. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the trace ring-buffer index, wrapping at CONN_TRACE_MAX. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	/* Record the current kernel stack in the trace slot. */
	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/* Same as conn_trace_ref, but recorded on refrele. */
int
conn_untrace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif

/*
 * Fill in 'sie' with information (flags, inode, device) about the socket
 * or stream underlying 'connp', for MIB reporting.  Returns 'sie' on
 * success, or NULL if the conn is closing, has no associated vnode, or
 * the vnode's attributes cannot be obtained.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;
	sock_upcalls_t *upcalls;
	sock_upper_handle_t upper_handle;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	/*
	 * Continue to hold conn_lock because we don't want to race with an
	 * in-progress close, which will have set-to-NULL (and destroyed
	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
	 *
	 * There is still a race with an in-progress OPEN, however, where
	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
	 * codepaths) WITHOUT conn_lock being held. We address that race
	 * HERE, however, given that both are going from NULL to non-NULL,
	 * if we lose the race, we don't get any data for the in-progress-OPEN
	 * socket.
	 */

	upcalls = connp->conn_upcalls;
	upper_handle = connp->conn_upper_handle;
	/* Check BOTH for non-NULL before attempting an upcall. */
	if (upper_handle != NULL && upcalls != NULL) {
		/* su_get_vnode() returns one with VN_HOLD() already done. */
		vn = upcalls->su_get_vnode(upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	mutex_exit(&connp->conn_lock);

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}