1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
 * A reference is taken when the connection
 * is placed in a list or table, when an incoming packet for the connection
 * arrives, and when the connection is processed via squeue (squeue processing
 * may be asynchronous and the reference protects the connection from being
 * destroyed before its processing is finished).
 *
 * send and receive functions are currently used for TCP only. The send function
 * determines the IP entry point for the packet once it leaves TCP to be sent to
 * the destination address. The receive function is used by IP when the packet
 * should be passed for TCP processing. When a new connection is created these
 * are set to ip_output() and tcp_input() respectively. During the lifetime of
 * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
 * tcp_conn_request(). This allows incoming SYNs to go directly into the
 * listener SYN processing function without going to tcp_input() first.
 *
 * Classifier uses several hash tables:
 *
 *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
 *	ipcl_bind_fanout:	contains all connections in BOUND state
 *	ipcl_proto_fanout:	IPv4 protocol fanout
 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
 *	ipcl_udp_fanout:	contains all UDP connections
 *	ipcl_globalhash_fanout:	contains all connections
 *
 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
 * which need to view all existing connections.
 *
 * All tables are protected by per-bucket locks. When both per-bucket lock and
 * connection lock need to be held, the per-bucket lock should be acquired
 * first, followed by the connection lock.
 *
 * All functions doing search in one of these tables increment a reference
 * counter on the connection found (if any).
 * This reference should be dropped
 * when the caller has finished processing the connection.
 *
 *
 * INTERFACES:
 * ===========
 *
 * Connection Lookup:
 * ------------------
 *
 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
 *
 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
 * it can't find any associated connection. If the connection is found, its
 * reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
 *		 the packet.
 *
 *	zoneid: The zone in which the returned connection must be; the zoneid
 *		corresponding to the ire_zoneid on the IRE located for the
 *		packet's destination address.
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment. This is used only in a
 * labeled environment. The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone. Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked. For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
 *				   zoneid, ip_stack);
 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *				   zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
 *		IPCL_TCPCONN	indicates a TCP connection
 *		IPCL_SCTPCONN	indicates a SCTP connection
 *		IPCL_UDPCONN	indicates a UDP conn_t.
 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
 *		IPCL_RTSCONN	indicates a RTS conn_t.
 *		IPCL_IPCCONN	indicates all other connections.
 *
 * void ipcl_conn_destroy(connp)
 *
 *	Destroys the connection state, removes it from the global
 *	connection hash table and frees its memory.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#define	_SUN_TPI_VERSION 2
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/isa_defs.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/ip_ndp.h>
#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>

#include <sys/cpuvar.h>

#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>

#include <sys/tsol/tnet.h>

/* Classifier debug tracing is compiled in only on DEBUG kernels. */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args): when IPCL_DEBUG is defined, printf 'args'
 * (a parenthesized printf argument list) if any bit of 'level' is set in
 * ipcl_debug_level; otherwise expand to an empty block.
 *
 * The debug form expands to a bare 'if' statement, so it must not be used
 * as the unbraced body of an if/else.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The table is indexed by N; entries 0-2 and the final entry are 0 and
 * act as "no value" markers.
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
320 */ 321 typedef union itc_s { 322 conn_t itc_conn; 323 char itcu_filler[CACHE_ALIGN(conn_s)]; 324 } itc_t; 325 326 struct kmem_cache *tcp_conn_cache; 327 struct kmem_cache *ip_conn_cache; 328 extern struct kmem_cache *sctp_conn_cache; 329 extern struct kmem_cache *tcp_sack_info_cache; 330 extern struct kmem_cache *tcp_iphc_cache; 331 struct kmem_cache *udp_conn_cache; 332 struct kmem_cache *rawip_conn_cache; 333 struct kmem_cache *rts_conn_cache; 334 335 extern void tcp_timermp_free(tcp_t *); 336 extern mblk_t *tcp_timermp_alloc(int); 337 338 static int ip_conn_constructor(void *, void *, int); 339 static void ip_conn_destructor(void *, void *); 340 341 static int tcp_conn_constructor(void *, void *, int); 342 static void tcp_conn_destructor(void *, void *); 343 344 static int udp_conn_constructor(void *, void *, int); 345 static void udp_conn_destructor(void *, void *); 346 347 static int rawip_conn_constructor(void *, void *, int); 348 static void rawip_conn_destructor(void *, void *); 349 350 static int rts_conn_constructor(void *, void *, int); 351 static void rts_conn_destructor(void *, void *); 352 353 #ifdef IPCL_DEBUG 354 #define INET_NTOA_BUFSIZE 18 355 356 static char * 357 inet_ntoa_r(uint32_t in, char *b) 358 { 359 unsigned char *p; 360 361 p = (unsigned char *)∈ 362 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 363 return (b); 364 } 365 #endif 366 367 /* 368 * Global (for all stack instances) init routine 369 */ 370 void 371 ipcl_g_init(void) 372 { 373 ip_conn_cache = kmem_cache_create("ip_conn_cache", 374 sizeof (conn_t), CACHE_ALIGN_SIZE, 375 ip_conn_constructor, ip_conn_destructor, 376 NULL, NULL, NULL, 0); 377 378 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 379 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 380 tcp_conn_constructor, tcp_conn_destructor, 381 NULL, NULL, NULL, 0); 382 383 udp_conn_cache = kmem_cache_create("udp_conn_cache", 384 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 385 udp_conn_constructor, 
udp_conn_destructor, 386 NULL, NULL, NULL, 0); 387 388 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 389 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 390 rawip_conn_constructor, rawip_conn_destructor, 391 NULL, NULL, NULL, 0); 392 393 rts_conn_cache = kmem_cache_create("rts_conn_cache", 394 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 395 rts_conn_constructor, rts_conn_destructor, 396 NULL, NULL, NULL, 0); 397 } 398 399 /* 400 * ipclassifier intialization routine, sets up hash tables. 401 */ 402 void 403 ipcl_init(ip_stack_t *ipst) 404 { 405 int i; 406 int sizes[] = P2Ps(); 407 408 /* 409 * Calculate size of conn fanout table from /etc/system settings 410 */ 411 if (ipcl_conn_hash_size != 0) { 412 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 413 } else if (tcp_conn_hash_size != 0) { 414 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 415 } else { 416 extern pgcnt_t freemem; 417 418 ipst->ips_ipcl_conn_fanout_size = 419 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 420 421 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 422 ipst->ips_ipcl_conn_fanout_size = 423 ipcl_conn_hash_maxsize; 424 } 425 } 426 427 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 428 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 429 break; 430 } 431 } 432 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 433 /* Out of range, use the 2^16 value */ 434 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 435 } 436 437 /* Take values from /etc/system */ 438 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 439 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 440 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 441 442 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 443 444 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 445 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 446 447 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 448 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 
449 MUTEX_DEFAULT, NULL); 450 } 451 452 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 453 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 454 455 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 456 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 457 MUTEX_DEFAULT, NULL); 458 } 459 460 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 461 sizeof (connf_t), KM_SLEEP); 462 for (i = 0; i < IPPROTO_MAX; i++) { 463 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 464 MUTEX_DEFAULT, NULL); 465 } 466 467 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 468 sizeof (connf_t), KM_SLEEP); 469 for (i = 0; i < IPPROTO_MAX; i++) { 470 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 471 MUTEX_DEFAULT, NULL); 472 } 473 474 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 475 mutex_init(&ipst->ips_rts_clients->connf_lock, 476 NULL, MUTEX_DEFAULT, NULL); 477 478 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 479 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 480 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 481 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 482 MUTEX_DEFAULT, NULL); 483 } 484 485 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 486 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 487 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 488 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 489 MUTEX_DEFAULT, NULL); 490 } 491 492 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 493 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 494 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 495 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 496 NULL, MUTEX_DEFAULT, NULL); 497 } 498 } 499 500 void 501 ipcl_g_destroy(void) 502 { 503 kmem_cache_destroy(ip_conn_cache); 504 kmem_cache_destroy(tcp_conn_cache); 505 kmem_cache_destroy(udp_conn_cache); 506 kmem_cache_destroy(rawip_conn_cache); 507 kmem_cache_destroy(rts_conn_cache); 508 } 509 510 /* 
511 * All user-level and kernel use of the stack must be gone 512 * by now. 513 */ 514 void 515 ipcl_destroy(ip_stack_t *ipst) 516 { 517 int i; 518 519 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 520 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 521 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 522 } 523 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 524 sizeof (connf_t)); 525 ipst->ips_ipcl_conn_fanout = NULL; 526 527 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 528 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 529 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 530 } 531 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 532 sizeof (connf_t)); 533 ipst->ips_ipcl_bind_fanout = NULL; 534 535 for (i = 0; i < IPPROTO_MAX; i++) { 536 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 537 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 538 } 539 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 540 ipst->ips_ipcl_proto_fanout = NULL; 541 542 for (i = 0; i < IPPROTO_MAX; i++) { 543 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 544 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 545 } 546 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 547 IPPROTO_MAX * sizeof (connf_t)); 548 ipst->ips_ipcl_proto_fanout_v6 = NULL; 549 550 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 551 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 552 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 553 } 554 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 555 sizeof (connf_t)); 556 ipst->ips_ipcl_udp_fanout = NULL; 557 558 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 559 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 560 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 561 } 562 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 
563 sizeof (connf_t)); 564 ipst->ips_ipcl_raw_fanout = NULL; 565 566 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 567 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 568 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 569 } 570 kmem_free(ipst->ips_ipcl_globalhash_fanout, 571 sizeof (connf_t) * CONN_G_HASH_SIZE); 572 ipst->ips_ipcl_globalhash_fanout = NULL; 573 574 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 575 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 576 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 577 ipst->ips_rts_clients = NULL; 578 } 579 580 /* 581 * conn creation routine. initialize the conn, sets the reference 582 * and inserts it in the global hash table. 583 */ 584 conn_t * 585 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 586 { 587 conn_t *connp; 588 sctp_stack_t *sctps; 589 struct kmem_cache *conn_cache; 590 591 switch (type) { 592 case IPCL_SCTPCONN: 593 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 594 return (NULL); 595 sctp_conn_init(connp); 596 sctps = ns->netstack_sctp; 597 SCTP_G_Q_REFHOLD(sctps); 598 netstack_hold(ns); 599 connp->conn_netstack = ns; 600 return (connp); 601 602 case IPCL_TCPCONN: 603 conn_cache = tcp_conn_cache; 604 break; 605 606 case IPCL_UDPCONN: 607 conn_cache = udp_conn_cache; 608 break; 609 610 case IPCL_RAWIPCONN: 611 conn_cache = rawip_conn_cache; 612 break; 613 614 case IPCL_RTSCONN: 615 conn_cache = rts_conn_cache; 616 break; 617 618 case IPCL_IPCCONN: 619 conn_cache = ip_conn_cache; 620 break; 621 622 default: 623 connp = NULL; 624 ASSERT(0); 625 } 626 627 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 628 return (NULL); 629 630 connp->conn_ref = 1; 631 netstack_hold(ns); 632 connp->conn_netstack = ns; 633 ipcl_globalhash_insert(connp); 634 return (connp); 635 } 636 637 void 638 ipcl_conn_destroy(conn_t *connp) 639 { 640 mblk_t *mp; 641 netstack_t *ns = connp->conn_netstack; 642 643 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 644 
ASSERT(connp->conn_ref == 0); 645 ASSERT(connp->conn_ire_cache == NULL); 646 647 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 648 649 if (connp->conn_peercred != NULL && 650 connp->conn_peercred != connp->conn_cred) 651 crfree(connp->conn_peercred); 652 connp->conn_peercred = NULL; 653 654 if (connp->conn_cred != NULL) { 655 crfree(connp->conn_cred); 656 connp->conn_cred = NULL; 657 } 658 659 ipcl_globalhash_remove(connp); 660 661 /* FIXME: add separate tcp_conn_free()? */ 662 if (connp->conn_flags & IPCL_TCPCONN) { 663 tcp_t *tcp = connp->conn_tcp; 664 tcp_stack_t *tcps; 665 666 ASSERT(tcp != NULL); 667 tcps = tcp->tcp_tcps; 668 if (tcps != NULL) { 669 if (connp->conn_latch != NULL) { 670 IPLATCH_REFRELE(connp->conn_latch, ns); 671 connp->conn_latch = NULL; 672 } 673 if (connp->conn_policy != NULL) { 674 IPPH_REFRELE(connp->conn_policy, ns); 675 connp->conn_policy = NULL; 676 } 677 tcp->tcp_tcps = NULL; 678 TCPS_REFRELE(tcps); 679 } 680 681 tcp_free(tcp); 682 mp = tcp->tcp_timercache; 683 tcp->tcp_cred = NULL; 684 685 if (tcp->tcp_sack_info != NULL) { 686 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 687 kmem_cache_free(tcp_sack_info_cache, 688 tcp->tcp_sack_info); 689 } 690 if (tcp->tcp_iphc != NULL) { 691 if (tcp->tcp_hdr_grown) { 692 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 693 } else { 694 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 695 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 696 } 697 tcp->tcp_iphc_len = 0; 698 } 699 ASSERT(tcp->tcp_iphc_len == 0); 700 701 if (tcp->tcp_ordrel_mp != NULL) { 702 freeb(tcp->tcp_ordrel_mp); 703 tcp->tcp_ordrel_mp = NULL; 704 } 705 706 /* 707 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 708 * the mblk. 
709 */ 710 if (tcp->tcp_rsrv_mp != NULL) { 711 freeb(tcp->tcp_rsrv_mp); 712 tcp->tcp_rsrv_mp = NULL; 713 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 714 } 715 716 ASSERT(connp->conn_latch == NULL); 717 ASSERT(connp->conn_policy == NULL); 718 719 if (ns != NULL) { 720 ASSERT(tcp->tcp_tcps == NULL); 721 connp->conn_netstack = NULL; 722 netstack_rele(ns); 723 } 724 725 ipcl_conn_cleanup(connp); 726 connp->conn_flags = IPCL_TCPCONN; 727 bzero(tcp, sizeof (tcp_t)); 728 729 tcp->tcp_timercache = mp; 730 tcp->tcp_connp = connp; 731 kmem_cache_free(tcp_conn_cache, connp); 732 return; 733 } 734 if (connp->conn_latch != NULL) { 735 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 736 connp->conn_latch = NULL; 737 } 738 if (connp->conn_policy != NULL) { 739 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 740 connp->conn_policy = NULL; 741 } 742 if (connp->conn_ipsec_opt_mp != NULL) { 743 freemsg(connp->conn_ipsec_opt_mp); 744 connp->conn_ipsec_opt_mp = NULL; 745 } 746 747 if (connp->conn_flags & IPCL_SCTPCONN) { 748 ASSERT(ns != NULL); 749 sctp_free(connp); 750 return; 751 } 752 753 if (ns != NULL) { 754 connp->conn_netstack = NULL; 755 netstack_rele(ns); 756 } 757 ipcl_conn_cleanup(connp); 758 759 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 760 if (connp->conn_flags & IPCL_UDPCONN) { 761 connp->conn_flags = IPCL_UDPCONN; 762 kmem_cache_free(udp_conn_cache, connp); 763 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 764 connp->conn_flags = IPCL_RAWIPCONN; 765 connp->conn_ulp = IPPROTO_ICMP; 766 kmem_cache_free(rawip_conn_cache, connp); 767 } else if (connp->conn_flags & IPCL_RTSCONN) { 768 connp->conn_flags = IPCL_RTSCONN; 769 kmem_cache_free(rts_conn_cache, connp); 770 } else { 771 connp->conn_flags = IPCL_IPCCONN; 772 ASSERT(connp->conn_flags & IPCL_IPCCONN); 773 ASSERT(connp->conn_priv == NULL); 774 kmem_cache_free(ip_conn_cache, connp); 775 } 776 } 777 778 /* 779 * Running in cluster mode - deregister listener information 780 */ 781 782 static void 783 ipcl_conn_unlisten(conn_t *connp) 784 { 785 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 786 ASSERT(connp->conn_lport != 0); 787 788 if (cl_inet_unlisten != NULL) { 789 sa_family_t addr_family; 790 uint8_t *laddrp; 791 792 if (connp->conn_pkt_isv6) { 793 addr_family = AF_INET6; 794 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 795 } else { 796 addr_family = AF_INET; 797 laddrp = (uint8_t *)&connp->conn_bound_source; 798 } 799 (*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp, 800 connp->conn_lport); 801 } 802 connp->conn_flags &= ~IPCL_CL_LISTENER; 803 } 804 805 /* 806 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 807 * which table the conn belonged to). So for debugging we can see which hash 808 * table this connection was in. 
 *
 * IPCL_HASH_REMOVE(connp) unlinks 'connp' from whatever fanout bucket
 * conn_fanout points at; it does nothing if conn_fanout is NULL.  The
 * caller must not hold conn_lock.  Under the bucket lock the conn is taken
 * off the doubly linked list, its linkage fields are cleared, cluster
 * listener state is deregistered if IPCL_CL_LISTENER is set, and the
 * reference that the fanout held on the conn is dropped.
 *
 * Note that conn_fanout is sampled before the bucket lock is taken.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/*
 * Function form of IPCL_HASH_REMOVE for callers outside this file.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}

/*
 * The whole purpose of this function is to allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
852 */ 853 void 854 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 855 { 856 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 857 ASSERT(MUTEX_HELD(&connp->conn_lock)); 858 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 859 860 if ((connp)->conn_next != NULL) { 861 (connp)->conn_next->conn_prev = (connp)->conn_prev; 862 } 863 if ((connp)->conn_prev != NULL) { 864 (connp)->conn_prev->conn_next = (connp)->conn_next; 865 } else { 866 connfp->connf_head = (connp)->conn_next; 867 } 868 (connp)->conn_fanout = NULL; 869 (connp)->conn_next = NULL; 870 (connp)->conn_prev = NULL; 871 (connp)->conn_flags |= IPCL_REMOVED; 872 ASSERT((connp)->conn_ref == 2); 873 (connp)->conn_ref--; 874 } 875 876 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 877 ASSERT((connp)->conn_fanout == NULL); \ 878 ASSERT((connp)->conn_next == NULL); \ 879 ASSERT((connp)->conn_prev == NULL); \ 880 if ((connfp)->connf_head != NULL) { \ 881 (connfp)->connf_head->conn_prev = (connp); \ 882 (connp)->conn_next = (connfp)->connf_head; \ 883 } \ 884 (connp)->conn_fanout = (connfp); \ 885 (connfp)->connf_head = (connp); \ 886 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 887 IPCL_CONNECTED; \ 888 CONN_INC_REF(connp); \ 889 } 890 891 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 892 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 893 "connp %p", (void *)(connfp), (void *)(connp))); \ 894 IPCL_HASH_REMOVE((connp)); \ 895 mutex_enter(&(connfp)->connf_lock); \ 896 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 897 mutex_exit(&(connfp)->connf_lock); \ 898 } 899 900 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 901 conn_t *pconnp = NULL, *nconnp; \ 902 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 903 "connp %p", (void *)connfp, (void *)(connp))); \ 904 IPCL_HASH_REMOVE((connp)); \ 905 mutex_enter(&(connfp)->connf_lock); \ 906 nconnp = (connfp)->connf_head; \ 907 while (nconnp != NULL && \ 908 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { 
\ 909 pconnp = nconnp; \ 910 nconnp = nconnp->conn_next; \ 911 } \ 912 if (pconnp != NULL) { \ 913 pconnp->conn_next = (connp); \ 914 (connp)->conn_prev = pconnp; \ 915 } else { \ 916 (connfp)->connf_head = (connp); \ 917 } \ 918 if (nconnp != NULL) { \ 919 (connp)->conn_next = nconnp; \ 920 nconnp->conn_prev = (connp); \ 921 } \ 922 (connp)->conn_fanout = (connfp); \ 923 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 924 IPCL_BOUND; \ 925 CONN_INC_REF(connp); \ 926 mutex_exit(&(connfp)->connf_lock); \ 927 } 928 929 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 930 conn_t **list, *prev, *next; \ 931 boolean_t isv4mapped = \ 932 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 933 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 934 "connp %p", (void *)(connfp), (void *)(connp))); \ 935 IPCL_HASH_REMOVE((connp)); \ 936 mutex_enter(&(connfp)->connf_lock); \ 937 list = &(connfp)->connf_head; \ 938 prev = NULL; \ 939 while ((next = *list) != NULL) { \ 940 if (isv4mapped && \ 941 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 942 connp->conn_zoneid == next->conn_zoneid) { \ 943 (connp)->conn_next = next; \ 944 if (prev != NULL) \ 945 prev = next->conn_prev; \ 946 next->conn_prev = (connp); \ 947 break; \ 948 } \ 949 list = &next->conn_next; \ 950 prev = next; \ 951 } \ 952 (connp)->conn_prev = prev; \ 953 *list = (connp); \ 954 (connp)->conn_fanout = (connfp); \ 955 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 956 IPCL_BOUND; \ 957 CONN_INC_REF((connp)); \ 958 mutex_exit(&(connfp)->connf_lock); \ 959 } 960 961 void 962 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 963 { 964 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 965 } 966 967 void 968 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 969 { 970 connf_t *connfp; 971 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 972 973 ASSERT(connp != NULL); 974 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 975 protocol == IPPROTO_ESP); 976 977 
connp->conn_ulp = protocol; 978 979 /* Insert it in the protocol hash */ 980 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 981 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 982 } 983 984 void 985 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 986 { 987 connf_t *connfp; 988 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 989 990 ASSERT(connp != NULL); 991 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 992 protocol == IPPROTO_ESP); 993 994 connp->conn_ulp = protocol; 995 996 /* Insert it in the Bind Hash */ 997 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 998 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 999 } 1000 1001 /* 1002 * This function is used only for inserting SCTP raw socket now. 1003 * This may change later. 1004 * 1005 * Note that only one raw socket can be bound to a port. The param 1006 * lport is in network byte order. 1007 */ 1008 static int 1009 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1010 { 1011 connf_t *connfp; 1012 conn_t *oconnp; 1013 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1014 1015 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1016 1017 /* Check for existing raw socket already bound to the port. 
*/ 1018 mutex_enter(&connfp->connf_lock); 1019 for (oconnp = connfp->connf_head; oconnp != NULL; 1020 oconnp = oconnp->conn_next) { 1021 if (oconnp->conn_lport == lport && 1022 oconnp->conn_zoneid == connp->conn_zoneid && 1023 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1024 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1025 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1026 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1027 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1028 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1029 &connp->conn_srcv6))) { 1030 break; 1031 } 1032 } 1033 mutex_exit(&connfp->connf_lock); 1034 if (oconnp != NULL) 1035 return (EADDRNOTAVAIL); 1036 1037 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1038 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1039 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1040 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1041 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1042 } else { 1043 IPCL_HASH_INSERT_BOUND(connfp, connp); 1044 } 1045 } else { 1046 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1047 } 1048 return (0); 1049 } 1050 1051 /* 1052 * Check for a MAC exemption conflict on a labeled system. Note that for 1053 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1054 * transport layer. This check is for binding all other protocols. 1055 * 1056 * Returns true if there's a conflict. 
1057 */ 1058 static boolean_t 1059 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1060 { 1061 connf_t *connfp; 1062 conn_t *tconn; 1063 1064 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1065 mutex_enter(&connfp->connf_lock); 1066 for (tconn = connfp->connf_head; tconn != NULL; 1067 tconn = tconn->conn_next) { 1068 /* We don't allow v4 fallback for v6 raw socket */ 1069 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1070 continue; 1071 /* If neither is exempt, then there's no conflict */ 1072 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1073 continue; 1074 /* If both are bound to different specific addrs, ok */ 1075 if (connp->conn_src != INADDR_ANY && 1076 tconn->conn_src != INADDR_ANY && 1077 connp->conn_src != tconn->conn_src) 1078 continue; 1079 /* These two conflict; fail */ 1080 break; 1081 } 1082 mutex_exit(&connfp->connf_lock); 1083 return (tconn != NULL); 1084 } 1085 1086 static boolean_t 1087 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1088 { 1089 connf_t *connfp; 1090 conn_t *tconn; 1091 1092 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1093 mutex_enter(&connfp->connf_lock); 1094 for (tconn = connfp->connf_head; tconn != NULL; 1095 tconn = tconn->conn_next) { 1096 /* We don't allow v4 fallback for v6 raw socket */ 1097 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1098 continue; 1099 /* If neither is exempt, then there's no conflict */ 1100 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1101 continue; 1102 /* If both are bound to different addrs, ok */ 1103 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1104 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1105 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1106 continue; 1107 /* These two conflict; fail */ 1108 break; 1109 } 1110 mutex_exit(&connfp->connf_lock); 1111 return (tconn != NULL); 1112 } 1113 1114 /* 1115 * (v4, v6) bind hash insertion routines 1116 */ 1117 int 1118 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1119 { 1120 connf_t *connfp; 1121 #ifdef IPCL_DEBUG 1122 char buf[INET_NTOA_BUFSIZE]; 1123 #endif 1124 int ret = 0; 1125 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1126 1127 ASSERT(connp); 1128 1129 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1130 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1131 1132 connp->conn_ulp = protocol; 1133 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1134 connp->conn_lport = lport; 1135 1136 switch (protocol) { 1137 default: 1138 if (is_system_labeled() && 1139 check_exempt_conflict_v4(connp, ipst)) 1140 return (EADDRINUSE); 1141 /* FALLTHROUGH */ 1142 case IPPROTO_UDP: 1143 if (protocol == IPPROTO_UDP) { 1144 IPCL_DEBUG_LVL(64, 1145 ("ipcl_bind_insert: connp %p - udp\n", 1146 (void *)connp)); 1147 connfp = &ipst->ips_ipcl_udp_fanout[ 1148 IPCL_UDP_HASH(lport, ipst)]; 1149 } else { 1150 IPCL_DEBUG_LVL(64, 1151 ("ipcl_bind_insert: connp %p - protocol\n", 1152 (void *)connp)); 1153 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1154 } 1155 1156 if (connp->conn_rem != INADDR_ANY) { 1157 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1158 } else if (connp->conn_src != INADDR_ANY) { 1159 IPCL_HASH_INSERT_BOUND(connfp, connp); 1160 } else { 1161 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1162 } 1163 break; 1164 1165 case IPPROTO_TCP: 1166 1167 /* Insert it in the Bind Hash */ 1168 ASSERT(connp->conn_zoneid != ALL_ZONES); 1169 connfp = &ipst->ips_ipcl_bind_fanout[ 1170 IPCL_BIND_HASH(lport, ipst)]; 1171 if (connp->conn_src != INADDR_ANY) { 1172 IPCL_HASH_INSERT_BOUND(connfp, connp); 1173 } else { 1174 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1175 } 1176 if (cl_inet_listen != NULL) { 1177 ASSERT(!connp->conn_pkt_isv6); 1178 connp->conn_flags |= IPCL_CL_LISTENER; 1179 (*cl_inet_listen)(IPPROTO_TCP, AF_INET, 1180 (uint8_t *)&connp->conn_bound_source, lport); 1181 } 1182 break; 1183 1184 case IPPROTO_SCTP: 1185 ret = ipcl_sctp_hash_insert(connp, lport); 1186 
break; 1187 } 1188 1189 return (ret); 1190 } 1191 1192 int 1193 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1194 uint16_t lport) 1195 { 1196 connf_t *connfp; 1197 int ret = 0; 1198 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1199 1200 ASSERT(connp); 1201 1202 connp->conn_ulp = protocol; 1203 connp->conn_srcv6 = *src; 1204 connp->conn_lport = lport; 1205 1206 switch (protocol) { 1207 default: 1208 if (is_system_labeled() && 1209 check_exempt_conflict_v6(connp, ipst)) 1210 return (EADDRINUSE); 1211 /* FALLTHROUGH */ 1212 case IPPROTO_UDP: 1213 if (protocol == IPPROTO_UDP) { 1214 IPCL_DEBUG_LVL(128, 1215 ("ipcl_bind_insert_v6: connp %p - udp\n", 1216 (void *)connp)); 1217 connfp = &ipst->ips_ipcl_udp_fanout[ 1218 IPCL_UDP_HASH(lport, ipst)]; 1219 } else { 1220 IPCL_DEBUG_LVL(128, 1221 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1222 (void *)connp)); 1223 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1224 } 1225 1226 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1227 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1228 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1229 IPCL_HASH_INSERT_BOUND(connfp, connp); 1230 } else { 1231 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1232 } 1233 break; 1234 1235 case IPPROTO_TCP: 1236 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1237 1238 /* Insert it in the Bind Hash */ 1239 ASSERT(connp->conn_zoneid != ALL_ZONES); 1240 connfp = &ipst->ips_ipcl_bind_fanout[ 1241 IPCL_BIND_HASH(lport, ipst)]; 1242 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1243 IPCL_HASH_INSERT_BOUND(connfp, connp); 1244 } else { 1245 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1246 } 1247 if (cl_inet_listen != NULL) { 1248 sa_family_t addr_family; 1249 uint8_t *laddrp; 1250 1251 if (connp->conn_pkt_isv6) { 1252 addr_family = AF_INET6; 1253 laddrp = 1254 (uint8_t *)&connp->conn_bound_source_v6; 1255 } else { 1256 addr_family = AF_INET; 1257 laddrp = (uint8_t *)&connp->conn_bound_source; 1258 } 1259 connp->conn_flags |= IPCL_CL_LISTENER; 1260 (*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp, 1261 lport); 1262 } 1263 break; 1264 1265 case IPPROTO_SCTP: 1266 ret = ipcl_sctp_hash_insert(connp, lport); 1267 break; 1268 } 1269 1270 return (ret); 1271 } 1272 1273 /* 1274 * ipcl_conn_hash insertion routines. 1275 */ 1276 int 1277 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1278 ipaddr_t rem, uint32_t ports) 1279 { 1280 connf_t *connfp; 1281 uint16_t *up; 1282 conn_t *tconnp; 1283 #ifdef IPCL_DEBUG 1284 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1285 #endif 1286 in_port_t lport; 1287 int ret = 0; 1288 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1289 1290 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1291 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1292 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1293 ports, protocol)); 1294 1295 switch (protocol) { 1296 case IPPROTO_TCP: 1297 if (!(connp->conn_flags & IPCL_EAGER)) { 1298 /* 1299 * for a eager connection, i.e connections which 1300 * have just been created, the initialization is 1301 * already done in ip at conn_creation time, so 1302 * we can skip the checks here. 
1303 */ 1304 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1305 } 1306 connfp = &ipst->ips_ipcl_conn_fanout[ 1307 IPCL_CONN_HASH(connp->conn_rem, 1308 connp->conn_ports, ipst)]; 1309 mutex_enter(&connfp->connf_lock); 1310 for (tconnp = connfp->connf_head; tconnp != NULL; 1311 tconnp = tconnp->conn_next) { 1312 if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1313 connp->conn_rem, connp->conn_src, 1314 connp->conn_ports)) { 1315 1316 /* Already have a conn. bail out */ 1317 mutex_exit(&connfp->connf_lock); 1318 return (EADDRINUSE); 1319 } 1320 } 1321 if (connp->conn_fanout != NULL) { 1322 /* 1323 * Probably a XTI/TLI application trying to do a 1324 * rebind. Let it happen. 1325 */ 1326 mutex_exit(&connfp->connf_lock); 1327 IPCL_HASH_REMOVE(connp); 1328 mutex_enter(&connfp->connf_lock); 1329 } 1330 1331 ASSERT(connp->conn_recv != NULL); 1332 1333 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1334 mutex_exit(&connfp->connf_lock); 1335 break; 1336 1337 case IPPROTO_SCTP: 1338 /* 1339 * The raw socket may have already been bound, remove it 1340 * from the hash first. 1341 */ 1342 IPCL_HASH_REMOVE(connp); 1343 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1344 ret = ipcl_sctp_hash_insert(connp, lport); 1345 break; 1346 1347 default: 1348 /* 1349 * Check for conflicts among MAC exempt bindings. For 1350 * transports with port numbers, this is done by the upper 1351 * level per-transport binding logic. For all others, it's 1352 * done here. 
1353 */ 1354 if (is_system_labeled() && 1355 check_exempt_conflict_v4(connp, ipst)) 1356 return (EADDRINUSE); 1357 /* FALLTHROUGH */ 1358 1359 case IPPROTO_UDP: 1360 up = (uint16_t *)&ports; 1361 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1362 if (protocol == IPPROTO_UDP) { 1363 connfp = &ipst->ips_ipcl_udp_fanout[ 1364 IPCL_UDP_HASH(up[1], ipst)]; 1365 } else { 1366 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1367 } 1368 1369 if (connp->conn_rem != INADDR_ANY) { 1370 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1371 } else if (connp->conn_src != INADDR_ANY) { 1372 IPCL_HASH_INSERT_BOUND(connfp, connp); 1373 } else { 1374 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1375 } 1376 break; 1377 } 1378 1379 return (ret); 1380 } 1381 1382 int 1383 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1384 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1385 { 1386 connf_t *connfp; 1387 uint16_t *up; 1388 conn_t *tconnp; 1389 in_port_t lport; 1390 int ret = 0; 1391 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1392 1393 switch (protocol) { 1394 case IPPROTO_TCP: 1395 /* Just need to insert a conn struct */ 1396 if (!(connp->conn_flags & IPCL_EAGER)) { 1397 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1398 } 1399 connfp = &ipst->ips_ipcl_conn_fanout[ 1400 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1401 ipst)]; 1402 mutex_enter(&connfp->connf_lock); 1403 for (tconnp = connfp->connf_head; tconnp != NULL; 1404 tconnp = tconnp->conn_next) { 1405 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1406 connp->conn_remv6, connp->conn_srcv6, 1407 connp->conn_ports) && 1408 (tconnp->conn_tcp->tcp_bound_if == 0 || 1409 tconnp->conn_tcp->tcp_bound_if == ifindex)) { 1410 /* Already have a conn. bail out */ 1411 mutex_exit(&connfp->connf_lock); 1412 return (EADDRINUSE); 1413 } 1414 } 1415 if (connp->conn_fanout != NULL) { 1416 /* 1417 * Probably a XTI/TLI application trying to do a 1418 * rebind. Let it happen. 
1419 */ 1420 mutex_exit(&connfp->connf_lock); 1421 IPCL_HASH_REMOVE(connp); 1422 mutex_enter(&connfp->connf_lock); 1423 } 1424 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1425 mutex_exit(&connfp->connf_lock); 1426 break; 1427 1428 case IPPROTO_SCTP: 1429 IPCL_HASH_REMOVE(connp); 1430 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1431 ret = ipcl_sctp_hash_insert(connp, lport); 1432 break; 1433 1434 default: 1435 if (is_system_labeled() && 1436 check_exempt_conflict_v6(connp, ipst)) 1437 return (EADDRINUSE); 1438 /* FALLTHROUGH */ 1439 case IPPROTO_UDP: 1440 up = (uint16_t *)&ports; 1441 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1442 if (protocol == IPPROTO_UDP) { 1443 connfp = &ipst->ips_ipcl_udp_fanout[ 1444 IPCL_UDP_HASH(up[1], ipst)]; 1445 } else { 1446 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1447 } 1448 1449 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1450 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1451 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1452 IPCL_HASH_INSERT_BOUND(connfp, connp); 1453 } else { 1454 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1455 } 1456 break; 1457 } 1458 1459 return (ret); 1460 } 1461 1462 /* 1463 * v4 packet classifying function. looks up the fanout table to 1464 * find the conn, the packet belongs to. returns the conn with 1465 * the reference held, null otherwise. 1466 * 1467 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1468 * Lookup" comment block are applied. Labels are also checked as described 1469 * above. If the packet is from the inside (looped back), and is from the same 1470 * zone, then label checks are omitted. 
1471 */ 1472 conn_t * 1473 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1474 ip_stack_t *ipst) 1475 { 1476 ipha_t *ipha; 1477 connf_t *connfp, *bind_connfp; 1478 uint16_t lport; 1479 uint16_t fport; 1480 uint32_t ports; 1481 conn_t *connp; 1482 uint16_t *up; 1483 boolean_t shared_addr; 1484 boolean_t unlabeled; 1485 1486 ipha = (ipha_t *)mp->b_rptr; 1487 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1488 1489 switch (protocol) { 1490 case IPPROTO_TCP: 1491 ports = *(uint32_t *)up; 1492 connfp = 1493 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1494 ports, ipst)]; 1495 mutex_enter(&connfp->connf_lock); 1496 for (connp = connfp->connf_head; connp != NULL; 1497 connp = connp->conn_next) { 1498 if (IPCL_CONN_MATCH(connp, protocol, 1499 ipha->ipha_src, ipha->ipha_dst, ports)) 1500 break; 1501 } 1502 1503 if (connp != NULL) { 1504 /* 1505 * We have a fully-bound TCP connection. 1506 * 1507 * For labeled systems, there's no need to check the 1508 * label here. It's known to be good as we checked 1509 * before allowing the connection to become bound. 1510 */ 1511 CONN_INC_REF(connp); 1512 mutex_exit(&connfp->connf_lock); 1513 return (connp); 1514 } 1515 1516 mutex_exit(&connfp->connf_lock); 1517 1518 lport = up[1]; 1519 unlabeled = B_FALSE; 1520 /* Cred cannot be null on IPv4 */ 1521 if (is_system_labeled()) 1522 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1523 TSLF_UNLABELED) != 0; 1524 shared_addr = (zoneid == ALL_ZONES); 1525 if (shared_addr) { 1526 /* 1527 * No need to handle exclusive-stack zones since 1528 * ALL_ZONES only applies to the shared stack. 1529 */ 1530 zoneid = tsol_mlp_findzone(protocol, lport); 1531 /* 1532 * If no shared MLP is found, tsol_mlp_findzone returns 1533 * ALL_ZONES. In that case, we assume it's SLP, and 1534 * search for the zone based on the packet label. 1535 * 1536 * If there is such a zone, we prefer to find a 1537 * connection in it. 
Otherwise, we look for a 1538 * MAC-exempt connection in any zone whose label 1539 * dominates the default label on the packet. 1540 */ 1541 if (zoneid == ALL_ZONES) 1542 zoneid = tsol_packet_to_zoneid(mp); 1543 else 1544 unlabeled = B_FALSE; 1545 } 1546 1547 bind_connfp = 1548 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1549 mutex_enter(&bind_connfp->connf_lock); 1550 for (connp = bind_connfp->connf_head; connp != NULL; 1551 connp = connp->conn_next) { 1552 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1553 lport) && (IPCL_ZONE_MATCH(connp, zoneid) || 1554 (unlabeled && connp->conn_mac_exempt))) 1555 break; 1556 } 1557 1558 /* 1559 * If the matching connection is SLP on a private address, then 1560 * the label on the packet must match the local zone's label. 1561 * Otherwise, it must be in the label range defined by tnrh. 1562 * This is ensured by tsol_receive_label. 1563 */ 1564 if (connp != NULL && is_system_labeled() && 1565 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1566 shared_addr, connp)) { 1567 DTRACE_PROBE3( 1568 tx__ip__log__info__classify__tcp, 1569 char *, 1570 "connp(1) could not receive mp(2)", 1571 conn_t *, connp, mblk_t *, mp); 1572 connp = NULL; 1573 } 1574 1575 if (connp != NULL) { 1576 /* Have a listener at least */ 1577 CONN_INC_REF(connp); 1578 mutex_exit(&bind_connfp->connf_lock); 1579 return (connp); 1580 } 1581 1582 mutex_exit(&bind_connfp->connf_lock); 1583 1584 IPCL_DEBUG_LVL(512, 1585 ("ipcl_classify: couldn't classify mp = %p\n", 1586 (void *)mp)); 1587 break; 1588 1589 case IPPROTO_UDP: 1590 lport = up[1]; 1591 unlabeled = B_FALSE; 1592 /* Cred cannot be null on IPv4 */ 1593 if (is_system_labeled()) 1594 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1595 TSLF_UNLABELED) != 0; 1596 shared_addr = (zoneid == ALL_ZONES); 1597 if (shared_addr) { 1598 /* 1599 * No need to handle exclusive-stack zones since 1600 * ALL_ZONES only applies to the shared stack. 
1601 */ 1602 zoneid = tsol_mlp_findzone(protocol, lport); 1603 /* 1604 * If no shared MLP is found, tsol_mlp_findzone returns 1605 * ALL_ZONES. In that case, we assume it's SLP, and 1606 * search for the zone based on the packet label. 1607 * 1608 * If there is such a zone, we prefer to find a 1609 * connection in it. Otherwise, we look for a 1610 * MAC-exempt connection in any zone whose label 1611 * dominates the default label on the packet. 1612 */ 1613 if (zoneid == ALL_ZONES) 1614 zoneid = tsol_packet_to_zoneid(mp); 1615 else 1616 unlabeled = B_FALSE; 1617 } 1618 fport = up[0]; 1619 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); 1620 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1621 mutex_enter(&connfp->connf_lock); 1622 for (connp = connfp->connf_head; connp != NULL; 1623 connp = connp->conn_next) { 1624 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1625 fport, ipha->ipha_src) && 1626 (IPCL_ZONE_MATCH(connp, zoneid) || 1627 (unlabeled && connp->conn_mac_exempt))) 1628 break; 1629 } 1630 1631 if (connp != NULL && is_system_labeled() && 1632 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1633 shared_addr, connp)) { 1634 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1635 char *, "connp(1) could not receive mp(2)", 1636 conn_t *, connp, mblk_t *, mp); 1637 connp = NULL; 1638 } 1639 1640 if (connp != NULL) { 1641 CONN_INC_REF(connp); 1642 mutex_exit(&connfp->connf_lock); 1643 return (connp); 1644 } 1645 1646 /* 1647 * We shouldn't come here for multicast/broadcast packets 1648 */ 1649 mutex_exit(&connfp->connf_lock); 1650 IPCL_DEBUG_LVL(512, 1651 ("ipcl_classify: cant find udp conn_t for ports : %x %x", 1652 lport, fport)); 1653 break; 1654 } 1655 1656 return (NULL); 1657 } 1658 1659 conn_t * 1660 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1661 ip_stack_t *ipst) 1662 { 1663 ip6_t *ip6h; 1664 connf_t *connfp, *bind_connfp; 1665 uint16_t lport; 1666 uint16_t fport; 1667 tcph_t 
*tcph; 1668 uint32_t ports; 1669 conn_t *connp; 1670 uint16_t *up; 1671 boolean_t shared_addr; 1672 boolean_t unlabeled; 1673 1674 ip6h = (ip6_t *)mp->b_rptr; 1675 1676 switch (protocol) { 1677 case IPPROTO_TCP: 1678 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 1679 up = (uint16_t *)tcph->th_lport; 1680 ports = *(uint32_t *)up; 1681 1682 connfp = 1683 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1684 ports, ipst)]; 1685 mutex_enter(&connfp->connf_lock); 1686 for (connp = connfp->connf_head; connp != NULL; 1687 connp = connp->conn_next) { 1688 if (IPCL_CONN_MATCH_V6(connp, protocol, 1689 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1690 break; 1691 } 1692 1693 if (connp != NULL) { 1694 /* 1695 * We have a fully-bound TCP connection. 1696 * 1697 * For labeled systems, there's no need to check the 1698 * label here. It's known to be good as we checked 1699 * before allowing the connection to become bound. 1700 */ 1701 CONN_INC_REF(connp); 1702 mutex_exit(&connfp->connf_lock); 1703 return (connp); 1704 } 1705 1706 mutex_exit(&connfp->connf_lock); 1707 1708 lport = up[1]; 1709 unlabeled = B_FALSE; 1710 /* Cred can be null on IPv6 */ 1711 if (is_system_labeled()) { 1712 cred_t *cr = DB_CRED(mp); 1713 1714 unlabeled = (cr != NULL && 1715 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1716 } 1717 shared_addr = (zoneid == ALL_ZONES); 1718 if (shared_addr) { 1719 /* 1720 * No need to handle exclusive-stack zones since 1721 * ALL_ZONES only applies to the shared stack. 1722 */ 1723 zoneid = tsol_mlp_findzone(protocol, lport); 1724 /* 1725 * If no shared MLP is found, tsol_mlp_findzone returns 1726 * ALL_ZONES. In that case, we assume it's SLP, and 1727 * search for the zone based on the packet label. 1728 * 1729 * If there is such a zone, we prefer to find a 1730 * connection in it. Otherwise, we look for a 1731 * MAC-exempt connection in any zone whose label 1732 * dominates the default label on the packet. 
1733 */ 1734 if (zoneid == ALL_ZONES) 1735 zoneid = tsol_packet_to_zoneid(mp); 1736 else 1737 unlabeled = B_FALSE; 1738 } 1739 1740 bind_connfp = 1741 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1742 mutex_enter(&bind_connfp->connf_lock); 1743 for (connp = bind_connfp->connf_head; connp != NULL; 1744 connp = connp->conn_next) { 1745 if (IPCL_BIND_MATCH_V6(connp, protocol, 1746 ip6h->ip6_dst, lport) && 1747 (IPCL_ZONE_MATCH(connp, zoneid) || 1748 (unlabeled && connp->conn_mac_exempt))) 1749 break; 1750 } 1751 1752 if (connp != NULL && is_system_labeled() && 1753 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1754 shared_addr, connp)) { 1755 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1756 char *, "connp(1) could not receive mp(2)", 1757 conn_t *, connp, mblk_t *, mp); 1758 connp = NULL; 1759 } 1760 1761 if (connp != NULL) { 1762 /* Have a listner at least */ 1763 CONN_INC_REF(connp); 1764 mutex_exit(&bind_connfp->connf_lock); 1765 IPCL_DEBUG_LVL(512, 1766 ("ipcl_classify_v6: found listner " 1767 "connp = %p\n", (void *)connp)); 1768 1769 return (connp); 1770 } 1771 1772 mutex_exit(&bind_connfp->connf_lock); 1773 1774 IPCL_DEBUG_LVL(512, 1775 ("ipcl_classify_v6: couldn't classify mp = %p\n", 1776 (void *)mp)); 1777 break; 1778 1779 case IPPROTO_UDP: 1780 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1781 lport = up[1]; 1782 unlabeled = B_FALSE; 1783 /* Cred can be null on IPv6 */ 1784 if (is_system_labeled()) { 1785 cred_t *cr = DB_CRED(mp); 1786 1787 unlabeled = (cr != NULL && 1788 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1789 } 1790 shared_addr = (zoneid == ALL_ZONES); 1791 if (shared_addr) { 1792 /* 1793 * No need to handle exclusive-stack zones since 1794 * ALL_ZONES only applies to the shared stack. 1795 */ 1796 zoneid = tsol_mlp_findzone(protocol, lport); 1797 /* 1798 * If no shared MLP is found, tsol_mlp_findzone returns 1799 * ALL_ZONES. In that case, we assume it's SLP, and 1800 * search for the zone based on the packet label. 
1801 * 1802 * If there is such a zone, we prefer to find a 1803 * connection in it. Otherwise, we look for a 1804 * MAC-exempt connection in any zone whose label 1805 * dominates the default label on the packet. 1806 */ 1807 if (zoneid == ALL_ZONES) 1808 zoneid = tsol_packet_to_zoneid(mp); 1809 else 1810 unlabeled = B_FALSE; 1811 } 1812 1813 fport = up[0]; 1814 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, 1815 fport)); 1816 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1817 mutex_enter(&connfp->connf_lock); 1818 for (connp = connfp->connf_head; connp != NULL; 1819 connp = connp->conn_next) { 1820 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1821 fport, ip6h->ip6_src) && 1822 (IPCL_ZONE_MATCH(connp, zoneid) || 1823 (unlabeled && connp->conn_mac_exempt))) 1824 break; 1825 } 1826 1827 if (connp != NULL && is_system_labeled() && 1828 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1829 shared_addr, connp)) { 1830 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1831 char *, "connp(1) could not receive mp(2)", 1832 conn_t *, connp, mblk_t *, mp); 1833 connp = NULL; 1834 } 1835 1836 if (connp != NULL) { 1837 CONN_INC_REF(connp); 1838 mutex_exit(&connfp->connf_lock); 1839 return (connp); 1840 } 1841 1842 /* 1843 * We shouldn't come here for multicast/broadcast packets 1844 */ 1845 mutex_exit(&connfp->connf_lock); 1846 IPCL_DEBUG_LVL(512, 1847 ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x", 1848 lport, fport)); 1849 break; 1850 } 1851 1852 return (NULL); 1853 } 1854 1855 /* 1856 * wrapper around ipcl_classify_(v4,v6) routines. 
1857 */ 1858 conn_t * 1859 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1860 { 1861 uint16_t hdr_len; 1862 ipha_t *ipha; 1863 uint8_t *nexthdrp; 1864 1865 if (MBLKL(mp) < sizeof (ipha_t)) 1866 return (NULL); 1867 1868 switch (IPH_HDR_VERSION(mp->b_rptr)) { 1869 case IPV4_VERSION: 1870 ipha = (ipha_t *)mp->b_rptr; 1871 hdr_len = IPH_HDR_LENGTH(ipha); 1872 return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, 1873 zoneid, ipst)); 1874 case IPV6_VERSION: 1875 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 1876 &hdr_len, &nexthdrp)) 1877 return (NULL); 1878 1879 return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); 1880 } 1881 1882 return (NULL); 1883 } 1884 1885 conn_t * 1886 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, 1887 uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) 1888 { 1889 connf_t *connfp; 1890 conn_t *connp; 1891 in_port_t lport; 1892 int af; 1893 boolean_t shared_addr; 1894 boolean_t unlabeled; 1895 const void *dst; 1896 1897 lport = ((uint16_t *)&ports)[1]; 1898 1899 unlabeled = B_FALSE; 1900 /* Cred can be null on IPv6 */ 1901 if (is_system_labeled()) { 1902 cred_t *cr = DB_CRED(mp); 1903 1904 unlabeled = (cr != NULL && 1905 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1906 } 1907 shared_addr = (zoneid == ALL_ZONES); 1908 if (shared_addr) { 1909 /* 1910 * No need to handle exclusive-stack zones since ALL_ZONES 1911 * only applies to the shared stack. 1912 */ 1913 zoneid = tsol_mlp_findzone(protocol, lport); 1914 /* 1915 * If no shared MLP is found, tsol_mlp_findzone returns 1916 * ALL_ZONES. In that case, we assume it's SLP, and search for 1917 * the zone based on the packet label. 1918 * 1919 * If there is such a zone, we prefer to find a connection in 1920 * it. Otherwise, we look for a MAC-exempt connection in any 1921 * zone whose label dominates the default label on the packet. 
1922 */ 1923 if (zoneid == ALL_ZONES) 1924 zoneid = tsol_packet_to_zoneid(mp); 1925 else 1926 unlabeled = B_FALSE; 1927 } 1928 1929 af = IPH_HDR_VERSION(hdr); 1930 dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : 1931 (const void *)&((ip6_t *)hdr)->ip6_dst; 1932 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1933 1934 mutex_enter(&connfp->connf_lock); 1935 for (connp = connfp->connf_head; connp != NULL; 1936 connp = connp->conn_next) { 1937 /* We don't allow v4 fallback for v6 raw socket. */ 1938 if (af == (connp->conn_af_isv6 ? IPV4_VERSION : 1939 IPV6_VERSION)) 1940 continue; 1941 if (connp->conn_fully_bound) { 1942 if (af == IPV4_VERSION) { 1943 if (!IPCL_CONN_MATCH(connp, protocol, 1944 hdr->ipha_src, hdr->ipha_dst, ports)) 1945 continue; 1946 } else { 1947 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1948 ((ip6_t *)hdr)->ip6_src, 1949 ((ip6_t *)hdr)->ip6_dst, ports)) 1950 continue; 1951 } 1952 } else { 1953 if (af == IPV4_VERSION) { 1954 if (!IPCL_BIND_MATCH(connp, protocol, 1955 hdr->ipha_dst, lport)) 1956 continue; 1957 } else { 1958 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1959 ((ip6_t *)hdr)->ip6_dst, lport)) 1960 continue; 1961 } 1962 } 1963 1964 if (IPCL_ZONE_MATCH(connp, zoneid) || 1965 (unlabeled && connp->conn_mac_exempt)) 1966 break; 1967 } 1968 /* 1969 * If the connection is fully-bound and connection-oriented (TCP or 1970 * SCTP), then we've already validated the remote system's label. 1971 * There's no need to do it again for every packet. 
1972 */ 1973 if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || 1974 !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && 1975 !tsol_receive_local(mp, dst, af, shared_addr, connp)) { 1976 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1977 char *, "connp(1) could not receive mp(2)", 1978 conn_t *, connp, mblk_t *, mp); 1979 connp = NULL; 1980 } 1981 1982 if (connp != NULL) 1983 goto found; 1984 mutex_exit(&connfp->connf_lock); 1985 1986 /* Try to look for a wildcard match. */ 1987 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1988 mutex_enter(&connfp->connf_lock); 1989 for (connp = connfp->connf_head; connp != NULL; 1990 connp = connp->conn_next) { 1991 /* We don't allow v4 fallback for v6 raw socket. */ 1992 if ((af == (connp->conn_af_isv6 ? IPV4_VERSION : 1993 IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { 1994 continue; 1995 } 1996 if (af == IPV4_VERSION) { 1997 if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) 1998 break; 1999 } else { 2000 if (IPCL_RAW_MATCH_V6(connp, protocol, 2001 ((ip6_t *)hdr)->ip6_dst)) { 2002 break; 2003 } 2004 } 2005 } 2006 2007 if (connp != NULL) 2008 goto found; 2009 2010 mutex_exit(&connfp->connf_lock); 2011 return (NULL); 2012 2013 found: 2014 ASSERT(connp != NULL); 2015 CONN_INC_REF(connp); 2016 mutex_exit(&connfp->connf_lock); 2017 return (connp); 2018 } 2019 2020 /* ARGSUSED */ 2021 static int 2022 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2023 { 2024 itc_t *itc = (itc_t *)buf; 2025 conn_t *connp = &itc->itc_conn; 2026 tcp_t *tcp = (tcp_t *)&itc[1]; 2027 2028 bzero(connp, sizeof (conn_t)); 2029 bzero(tcp, sizeof (tcp_t)); 2030 2031 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2032 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2033 tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); 2034 connp->conn_tcp = tcp; 2035 connp->conn_flags = IPCL_TCPCONN; 2036 connp->conn_ulp = IPPROTO_TCP; 2037 tcp->tcp_connp = connp; 2038 return (0); 2039 } 2040 2041 /* 
ARGSUSED */ 2042 static void 2043 tcp_conn_destructor(void *buf, void *cdrarg) 2044 { 2045 itc_t *itc = (itc_t *)buf; 2046 conn_t *connp = &itc->itc_conn; 2047 tcp_t *tcp = (tcp_t *)&itc[1]; 2048 2049 ASSERT(connp->conn_flags & IPCL_TCPCONN); 2050 ASSERT(tcp->tcp_connp == connp); 2051 ASSERT(connp->conn_tcp == tcp); 2052 tcp_timermp_free(tcp); 2053 mutex_destroy(&connp->conn_lock); 2054 cv_destroy(&connp->conn_cv); 2055 } 2056 2057 /* ARGSUSED */ 2058 static int 2059 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2060 { 2061 itc_t *itc = (itc_t *)buf; 2062 conn_t *connp = &itc->itc_conn; 2063 2064 bzero(connp, sizeof (conn_t)); 2065 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2066 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2067 connp->conn_flags = IPCL_IPCCONN; 2068 2069 return (0); 2070 } 2071 2072 /* ARGSUSED */ 2073 static void 2074 ip_conn_destructor(void *buf, void *cdrarg) 2075 { 2076 itc_t *itc = (itc_t *)buf; 2077 conn_t *connp = &itc->itc_conn; 2078 2079 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2080 ASSERT(connp->conn_priv == NULL); 2081 mutex_destroy(&connp->conn_lock); 2082 cv_destroy(&connp->conn_cv); 2083 } 2084 2085 /* ARGSUSED */ 2086 static int 2087 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2088 { 2089 itc_t *itc = (itc_t *)buf; 2090 conn_t *connp = &itc->itc_conn; 2091 udp_t *udp = (udp_t *)&itc[1]; 2092 2093 bzero(connp, sizeof (conn_t)); 2094 bzero(udp, sizeof (udp_t)); 2095 2096 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2097 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2098 connp->conn_udp = udp; 2099 connp->conn_flags = IPCL_UDPCONN; 2100 connp->conn_ulp = IPPROTO_UDP; 2101 udp->udp_connp = connp; 2102 return (0); 2103 } 2104 2105 /* ARGSUSED */ 2106 static void 2107 udp_conn_destructor(void *buf, void *cdrarg) 2108 { 2109 itc_t *itc = (itc_t *)buf; 2110 conn_t *connp = &itc->itc_conn; 2111 udp_t *udp = (udp_t *)&itc[1]; 2112 2113 ASSERT(connp->conn_flags & IPCL_UDPCONN); 
2114 ASSERT(udp->udp_connp == connp); 2115 ASSERT(connp->conn_udp == udp); 2116 mutex_destroy(&connp->conn_lock); 2117 cv_destroy(&connp->conn_cv); 2118 } 2119 2120 /* ARGSUSED */ 2121 static int 2122 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2123 { 2124 itc_t *itc = (itc_t *)buf; 2125 conn_t *connp = &itc->itc_conn; 2126 icmp_t *icmp = (icmp_t *)&itc[1]; 2127 2128 bzero(connp, sizeof (conn_t)); 2129 bzero(icmp, sizeof (icmp_t)); 2130 2131 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2132 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2133 connp->conn_icmp = icmp; 2134 connp->conn_flags = IPCL_RAWIPCONN; 2135 connp->conn_ulp = IPPROTO_ICMP; 2136 icmp->icmp_connp = connp; 2137 return (0); 2138 } 2139 2140 /* ARGSUSED */ 2141 static void 2142 rawip_conn_destructor(void *buf, void *cdrarg) 2143 { 2144 itc_t *itc = (itc_t *)buf; 2145 conn_t *connp = &itc->itc_conn; 2146 icmp_t *icmp = (icmp_t *)&itc[1]; 2147 2148 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2149 ASSERT(icmp->icmp_connp == connp); 2150 ASSERT(connp->conn_icmp == icmp); 2151 mutex_destroy(&connp->conn_lock); 2152 cv_destroy(&connp->conn_cv); 2153 } 2154 2155 /* ARGSUSED */ 2156 static int 2157 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2158 { 2159 itc_t *itc = (itc_t *)buf; 2160 conn_t *connp = &itc->itc_conn; 2161 rts_t *rts = (rts_t *)&itc[1]; 2162 2163 bzero(connp, sizeof (conn_t)); 2164 bzero(rts, sizeof (rts_t)); 2165 2166 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2167 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2168 connp->conn_rts = rts; 2169 connp->conn_flags = IPCL_RTSCONN; 2170 rts->rts_connp = connp; 2171 return (0); 2172 } 2173 2174 /* ARGSUSED */ 2175 static void 2176 rts_conn_destructor(void *buf, void *cdrarg) 2177 { 2178 itc_t *itc = (itc_t *)buf; 2179 conn_t *connp = &itc->itc_conn; 2180 rts_t *rts = (rts_t *)&itc[1]; 2181 2182 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2183 ASSERT(rts->rts_connp == connp); 2184 
ASSERT(connp->conn_rts == rts); 2185 mutex_destroy(&connp->conn_lock); 2186 cv_destroy(&connp->conn_cv); 2187 } 2188 2189 /* 2190 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2191 * in the conn_t. 2192 * 2193 * Below we list all the pointers in the conn_t as a documentation aid. 2194 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2195 * If you add any pointers to the conn_t please add an ASSERT here 2196 * and #ifdef it out if it can't be actually asserted to be NULL. 2197 * In any case, we bzero most of the conn_t at the end of the function. 2198 */ 2199 void 2200 ipcl_conn_cleanup(conn_t *connp) 2201 { 2202 ASSERT(connp->conn_ire_cache == NULL); 2203 ASSERT(connp->conn_latch == NULL); 2204 #ifdef notdef 2205 /* These are not cleared */ 2206 ASSERT(connp->conn_rq == NULL); 2207 ASSERT(connp->conn_wq == NULL); 2208 #endif 2209 ASSERT(connp->conn_cred == NULL); 2210 ASSERT(connp->conn_g_fanout == NULL); 2211 ASSERT(connp->conn_g_next == NULL); 2212 ASSERT(connp->conn_g_prev == NULL); 2213 ASSERT(connp->conn_policy == NULL); 2214 ASSERT(connp->conn_fanout == NULL); 2215 ASSERT(connp->conn_next == NULL); 2216 ASSERT(connp->conn_prev == NULL); 2217 #ifdef notdef 2218 /* 2219 * The ill and ipif pointers are not cleared before the conn_t 2220 * goes away since they do not hold a reference on the ill/ipif. 2221 * We should replace these pointers with ifindex/ipaddr_t to 2222 * make the code less complex. 
2223 */ 2224 ASSERT(connp->conn_xmit_if_ill == NULL); 2225 ASSERT(connp->conn_nofailover_ill == NULL); 2226 ASSERT(connp->conn_outgoing_ill == NULL); 2227 ASSERT(connp->conn_incoming_ill == NULL); 2228 ASSERT(connp->conn_outgoing_pill == NULL); 2229 ASSERT(connp->conn_multicast_ipif == NULL); 2230 ASSERT(connp->conn_multicast_ill == NULL); 2231 #endif 2232 ASSERT(connp->conn_oper_pending_ill == NULL); 2233 ASSERT(connp->conn_ilg == NULL); 2234 ASSERT(connp->conn_drain_next == NULL); 2235 ASSERT(connp->conn_drain_prev == NULL); 2236 #ifdef notdef 2237 /* conn_idl is not cleared when removed from idl list */ 2238 ASSERT(connp->conn_idl == NULL); 2239 #endif 2240 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2241 ASSERT(connp->conn_peercred == NULL); 2242 ASSERT(connp->conn_netstack == NULL); 2243 2244 /* Clear out the conn_t fields that are not preserved */ 2245 bzero(&connp->conn_start_clr, 2246 sizeof (conn_t) - 2247 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2248 2249 } 2250 2251 /* 2252 * All conns are inserted in a global multi-list for the benefit of 2253 * walkers. The walk is guaranteed to walk all open conns at the time 2254 * of the start of the walk exactly once. This property is needed to 2255 * achieve some cleanups during unplumb of interfaces. This is achieved 2256 * as follows. 2257 * 2258 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2259 * call the insert and delete functions below at creation and deletion 2260 * time respectively. The conn never moves or changes its position in this 2261 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2262 * won't increase due to walkers, once the conn deletion has started. Note 2263 * that we can't remove the conn from the global list and then wait for 2264 * the refcnt to drop to zero, since walkers would then see a truncated 2265 * list. 
CONN_INCIPIENT ensures that walkers don't start looking at 2266 * conns until ip_open is ready to make them globally visible. 2267 * The global round robin multi-list locks are held only to get the 2268 * next member/insertion/deletion and contention should be negligible 2269 * if the multi-list is much greater than the number of cpus. 2270 */ 2271 void 2272 ipcl_globalhash_insert(conn_t *connp) 2273 { 2274 int index; 2275 struct connf_s *connfp; 2276 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2277 2278 /* 2279 * No need for atomic here. Approximate even distribution 2280 * in the global lists is sufficient. 2281 */ 2282 ipst->ips_conn_g_index++; 2283 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2284 2285 connp->conn_g_prev = NULL; 2286 /* 2287 * Mark as INCIPIENT, so that walkers will ignore this 2288 * for now, till ip_open is ready to make it visible globally. 2289 */ 2290 connp->conn_state_flags |= CONN_INCIPIENT; 2291 2292 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2293 /* Insert at the head of the list */ 2294 mutex_enter(&connfp->connf_lock); 2295 connp->conn_g_next = connfp->connf_head; 2296 if (connp->conn_g_next != NULL) 2297 connp->conn_g_next->conn_g_prev = connp; 2298 connfp->connf_head = connp; 2299 2300 /* The fanout bucket this conn points to */ 2301 connp->conn_g_fanout = connfp; 2302 2303 mutex_exit(&connfp->connf_lock); 2304 } 2305 2306 void 2307 ipcl_globalhash_remove(conn_t *connp) 2308 { 2309 struct connf_s *connfp; 2310 2311 /* 2312 * We were never inserted in the global multi list. 2313 * IPCL_NONE variety is never inserted in the global multilist 2314 * since it is presumed to not need any cleanup and is transient. 
2315 */ 2316 if (connp->conn_g_fanout == NULL) 2317 return; 2318 2319 connfp = connp->conn_g_fanout; 2320 mutex_enter(&connfp->connf_lock); 2321 if (connp->conn_g_prev != NULL) 2322 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2323 else 2324 connfp->connf_head = connp->conn_g_next; 2325 if (connp->conn_g_next != NULL) 2326 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2327 mutex_exit(&connfp->connf_lock); 2328 2329 /* Better to stumble on a null pointer than to corrupt memory */ 2330 connp->conn_g_next = NULL; 2331 connp->conn_g_prev = NULL; 2332 connp->conn_g_fanout = NULL; 2333 } 2334 2335 /* 2336 * Walk the list of all conn_t's in the system, calling the function provided 2337 * with the specified argument for each. 2338 * Applies to both IPv4 and IPv6. 2339 * 2340 * IPCs may hold pointers to ipif/ill. To guard against stale pointers 2341 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2342 * unplumbed or removed. New conn_t's that are created while we are walking 2343 * may be missed by this walk, because they are not necessarily inserted 2344 * at the tail of the list. They are new conn_t's and thus don't have any 2345 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2346 * is created to the struct that is going away. 
2347 */ 2348 void 2349 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2350 { 2351 int i; 2352 conn_t *connp; 2353 conn_t *prev_connp; 2354 2355 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2356 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2357 prev_connp = NULL; 2358 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2359 while (connp != NULL) { 2360 mutex_enter(&connp->conn_lock); 2361 if (connp->conn_state_flags & 2362 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2363 mutex_exit(&connp->conn_lock); 2364 connp = connp->conn_g_next; 2365 continue; 2366 } 2367 CONN_INC_REF_LOCKED(connp); 2368 mutex_exit(&connp->conn_lock); 2369 mutex_exit( 2370 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2371 (*func)(connp, arg); 2372 if (prev_connp != NULL) 2373 CONN_DEC_REF(prev_connp); 2374 mutex_enter( 2375 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2376 prev_connp = connp; 2377 connp = connp->conn_g_next; 2378 } 2379 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2380 if (prev_connp != NULL) 2381 CONN_DEC_REF(prev_connp); 2382 } 2383 } 2384 2385 /* 2386 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2387 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2388 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2389 * (peer tcp in ESTABLISHED state). 2390 */ 2391 conn_t * 2392 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, 2393 ip_stack_t *ipst) 2394 { 2395 uint32_t ports; 2396 uint16_t *pports = (uint16_t *)&ports; 2397 connf_t *connfp; 2398 conn_t *tconnp; 2399 boolean_t zone_chk; 2400 2401 /* 2402 * If either the source of destination address is loopback, then 2403 * both endpoints must be in the same Zone. Otherwise, both of 2404 * the addresses are system-wide unique (tcp is in ESTABLISHED 2405 * state) and the endpoints may reside in different Zones. 
2406 */ 2407 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2408 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2409 2410 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2411 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2412 2413 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2414 ports, ipst)]; 2415 2416 mutex_enter(&connfp->connf_lock); 2417 for (tconnp = connfp->connf_head; tconnp != NULL; 2418 tconnp = tconnp->conn_next) { 2419 2420 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2421 ipha->ipha_dst, ipha->ipha_src, ports) && 2422 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2423 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2424 2425 ASSERT(tconnp != connp); 2426 CONN_INC_REF(tconnp); 2427 mutex_exit(&connfp->connf_lock); 2428 return (tconnp); 2429 } 2430 } 2431 mutex_exit(&connfp->connf_lock); 2432 return (NULL); 2433 } 2434 2435 /* 2436 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2437 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2438 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2439 * (peer tcp in ESTABLISHED state). 2440 */ 2441 conn_t * 2442 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, 2443 ip_stack_t *ipst) 2444 { 2445 uint32_t ports; 2446 uint16_t *pports = (uint16_t *)&ports; 2447 connf_t *connfp; 2448 conn_t *tconnp; 2449 boolean_t zone_chk; 2450 2451 /* 2452 * If either the source of destination address is loopback, then 2453 * both endpoints must be in the same Zone. Otherwise, both of 2454 * the addresses are system-wide unique (tcp is in ESTABLISHED 2455 * state) and the endpoints may reside in different Zones. We 2456 * don't do Zone check for link local address(es) because the 2457 * current Zone implementation treats each link local address as 2458 * being unique per system node, i.e. they belong to global Zone. 
2459 */ 2460 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2461 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2462 2463 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2464 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2465 2466 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2467 ports, ipst)]; 2468 2469 mutex_enter(&connfp->connf_lock); 2470 for (tconnp = connfp->connf_head; tconnp != NULL; 2471 tconnp = tconnp->conn_next) { 2472 2473 /* We skip tcp_bound_if check here as this is loopback tcp */ 2474 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2475 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2476 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2477 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2478 2479 ASSERT(tconnp != connp); 2480 CONN_INC_REF(tconnp); 2481 mutex_exit(&connfp->connf_lock); 2482 return (tconnp); 2483 } 2484 } 2485 mutex_exit(&connfp->connf_lock); 2486 return (NULL); 2487 } 2488 2489 /* 2490 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2491 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2492 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2493 */ 2494 conn_t * 2495 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, 2496 ip_stack_t *ipst) 2497 { 2498 uint32_t ports; 2499 uint16_t *pports; 2500 connf_t *connfp; 2501 conn_t *tconnp; 2502 2503 pports = (uint16_t *)&ports; 2504 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2505 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2506 2507 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2508 ports, ipst)]; 2509 2510 mutex_enter(&connfp->connf_lock); 2511 for (tconnp = connfp->connf_head; tconnp != NULL; 2512 tconnp = tconnp->conn_next) { 2513 2514 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2515 ipha->ipha_dst, ipha->ipha_src, ports) && 2516 tconnp->conn_tcp->tcp_state >= min_state) { 2517 2518 CONN_INC_REF(tconnp); 2519 mutex_exit(&connfp->connf_lock); 2520 return (tconnp); 2521 } 2522 } 2523 mutex_exit(&connfp->connf_lock); 2524 return (NULL); 2525 } 2526 2527 /* 2528 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2529 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2530 * Only checks for connected entries i.e. no INADDR_ANY checks. 2531 * Match on ifindex in addition to addresses. 
2532 */ 2533 conn_t * 2534 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2535 uint_t ifindex, ip_stack_t *ipst) 2536 { 2537 tcp_t *tcp; 2538 uint32_t ports; 2539 uint16_t *pports; 2540 connf_t *connfp; 2541 conn_t *tconnp; 2542 2543 pports = (uint16_t *)&ports; 2544 pports[0] = tcpha->tha_fport; 2545 pports[1] = tcpha->tha_lport; 2546 2547 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2548 ports, ipst)]; 2549 2550 mutex_enter(&connfp->connf_lock); 2551 for (tconnp = connfp->connf_head; tconnp != NULL; 2552 tconnp = tconnp->conn_next) { 2553 2554 tcp = tconnp->conn_tcp; 2555 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2556 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2557 tcp->tcp_state >= min_state && 2558 (tcp->tcp_bound_if == 0 || 2559 tcp->tcp_bound_if == ifindex)) { 2560 2561 CONN_INC_REF(tconnp); 2562 mutex_exit(&connfp->connf_lock); 2563 return (tconnp); 2564 } 2565 } 2566 mutex_exit(&connfp->connf_lock); 2567 return (NULL); 2568 } 2569 2570 /* 2571 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2572 * a listener when changing state. 2573 */ 2574 conn_t * 2575 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2576 ip_stack_t *ipst) 2577 { 2578 connf_t *bind_connfp; 2579 conn_t *connp; 2580 tcp_t *tcp; 2581 2582 /* 2583 * Avoid false matches for packets sent to an IP destination of 2584 * all zeros. 
2585 */ 2586 if (laddr == 0) 2587 return (NULL); 2588 2589 ASSERT(zoneid != ALL_ZONES); 2590 2591 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2592 mutex_enter(&bind_connfp->connf_lock); 2593 for (connp = bind_connfp->connf_head; connp != NULL; 2594 connp = connp->conn_next) { 2595 tcp = connp->conn_tcp; 2596 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2597 IPCL_ZONE_MATCH(connp, zoneid) && 2598 (tcp->tcp_listener == NULL)) { 2599 CONN_INC_REF(connp); 2600 mutex_exit(&bind_connfp->connf_lock); 2601 return (connp); 2602 } 2603 } 2604 mutex_exit(&bind_connfp->connf_lock); 2605 return (NULL); 2606 } 2607 2608 /* 2609 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2610 * a listener when changing state. 2611 */ 2612 conn_t * 2613 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2614 zoneid_t zoneid, ip_stack_t *ipst) 2615 { 2616 connf_t *bind_connfp; 2617 conn_t *connp = NULL; 2618 tcp_t *tcp; 2619 2620 /* 2621 * Avoid false matches for packets sent to an IP destination of 2622 * all zeros. 2623 */ 2624 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2625 return (NULL); 2626 2627 ASSERT(zoneid != ALL_ZONES); 2628 2629 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2630 mutex_enter(&bind_connfp->connf_lock); 2631 for (connp = bind_connfp->connf_head; connp != NULL; 2632 connp = connp->conn_next) { 2633 tcp = connp->conn_tcp; 2634 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2635 IPCL_ZONE_MATCH(connp, zoneid) && 2636 (tcp->tcp_bound_if == 0 || 2637 tcp->tcp_bound_if == ifindex) && 2638 tcp->tcp_listener == NULL) { 2639 CONN_INC_REF(connp); 2640 mutex_exit(&bind_connfp->connf_lock); 2641 return (connp); 2642 } 2643 } 2644 mutex_exit(&bind_connfp->connf_lock); 2645 return (NULL); 2646 } 2647 2648 /* 2649 * ipcl_get_next_conn 2650 * get the next entry in the conn global list 2651 * and put a reference on the next_conn. 
2652 * decrement the reference on the current conn. 2653 * 2654 * This is an iterator based walker function that also provides for 2655 * some selection by the caller. It walks through the conn_hash bucket 2656 * searching for the next valid connp in the list, and selects connections 2657 * that are neither closed nor condemned. It also REFHOLDS the conn 2658 * thus ensuring that the conn exists when the caller uses the conn. 2659 */ 2660 conn_t * 2661 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2662 { 2663 conn_t *next_connp; 2664 2665 if (connfp == NULL) 2666 return (NULL); 2667 2668 mutex_enter(&connfp->connf_lock); 2669 2670 next_connp = (connp == NULL) ? 2671 connfp->connf_head : connp->conn_g_next; 2672 2673 while (next_connp != NULL) { 2674 mutex_enter(&next_connp->conn_lock); 2675 if (!(next_connp->conn_flags & conn_flags) || 2676 (next_connp->conn_state_flags & 2677 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2678 /* 2679 * This conn has been condemned or 2680 * is closing, or the flags don't match 2681 */ 2682 mutex_exit(&next_connp->conn_lock); 2683 next_connp = next_connp->conn_g_next; 2684 continue; 2685 } 2686 CONN_INC_REF_LOCKED(next_connp); 2687 mutex_exit(&next_connp->conn_lock); 2688 break; 2689 } 2690 2691 mutex_exit(&connfp->connf_lock); 2692 2693 if (connp != NULL) 2694 CONN_DEC_REF(connp); 2695 2696 return (next_connp); 2697 } 2698 2699 #ifdef CONN_DEBUG 2700 /* 2701 * Trace of the last NBUF refhold/refrele 2702 */ 2703 int 2704 conn_trace_ref(conn_t *connp) 2705 { 2706 int last; 2707 conn_trace_t *ctb; 2708 2709 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2710 last = connp->conn_trace_last; 2711 last++; 2712 if (last == CONN_TRACE_MAX) 2713 last = 0; 2714 2715 ctb = &connp->conn_trace_buf[last]; 2716 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2717 connp->conn_trace_last = last; 2718 return (1); 2719 } 2720 2721 int 2722 conn_untrace_ref(conn_t *connp) 2723 { 2724 int last; 2725 conn_trace_t *ctb; 2726 2727 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2728 last = connp->conn_trace_last; 2729 last++; 2730 if (last == CONN_TRACE_MAX) 2731 last = 0; 2732 2733 ctb = &connp->conn_trace_buf[last]; 2734 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2735 connp->conn_trace_last = last; 2736 return (1); 2737 } 2738 #endif 2739