1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
A reference is incremented when connection 50 * is placed in a list or table, when incoming packet for the connection arrives 51 * and when connection is processed via squeue (squeue processing may be 52 * asynchronous and the reference protects the connection from being destroyed 53 * before its processing is finished). 54 * 55 * send and receive functions are currently used for TCP only. The send function 56 * determines the IP entry point for the packet once it leaves TCP to be sent to 57 * the destination address. The receive function is used by IP when the packet 58 * should be passed for TCP processing. When a new connection is created these 59 * are set to ip_output() and tcp_input() respectively. During the lifetime of 60 * the connection the send and receive functions may change depending on the 61 * changes in the connection state. For example, once the connection is bound to 62 * an address, the receive function for this connection is set to 63 * tcp_conn_request(). This allows incoming SYNs to go directly into the 64 * listener SYN processing function without going to tcp_input() first. 65 * 66 * Classifier uses several hash tables: 67 * 68 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 69 * ipcl_bind_fanout: contains all connections in BOUND state 70 * ipcl_proto_fanout: IPv4 protocol fanout 71 * ipcl_proto_fanout_v6: IPv6 protocol fanout 72 * ipcl_udp_fanout: contains all UDP connections 73 * ipcl_globalhash_fanout: contains all connections 74 * 75 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 76 * which need to view all existing connections. 77 * 78 * All tables are protected by per-bucket locks. When both per-bucket lock and 79 * connection lock need to be held, the per-bucket lock should be acquired 80 * first, followed by the connection lock. 81 * 82 * All functions doing search in one of these tables increment a reference 83 * counter on the connection found (if any). 
This reference should be dropped 84 * when the caller has finished processing the connection. 85 * 86 * 87 * INTERFACES: 88 * =========== 89 * 90 * Connection Lookup: 91 * ------------------ 92 * 93 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) 94 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) 95 * 96 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if 97 * it can't find any associated connection. If the connection is found, its 98 * reference counter is incremented. 99 * 100 * mp: mblock, containing packet header. The full header should fit 101 * into a single mblock. It should also contain at least full IP 102 * and TCP or UDP header. 103 * 104 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 105 * 106 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 107 * the packet. 108 * 109 * zoneid: The zone in which the returned connection must be; the zoneid 110 * corresponding to the ire_zoneid on the IRE located for the 111 * packet's destination address. 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packet belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 
130 * 131 * - If it's a multilevel port, then only the zone registered to receive 132 * packets on that port matches. 133 * 134 * Also, in a labeled environment, packet labels need to be checked. For fully 135 * bound TCP connections, we can assume that the packet label was checked 136 * during connection establishment, and doesn't need to be checked on each 137 * packet. For others, though, we need to check for strict equality or, for 138 * multilevel ports, membership in the range or set. This part currently does 139 * a tnrh lookup on each packet, but could be optimized to use cached results 140 * if that were necessary. (SCTP doesn't come through here, but if it did, 141 * we would apply the same rules as TCP.) 142 * 143 * An implication of the above is that fully-bound TCP sockets must always use 144 * distinct 4-tuples; they can't be discriminated by label alone. 145 * 146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets, 147 * as there's no connection set-up handshake and no shared state. 148 * 149 * Labels on looped-back packets within a single zone do not need to be 150 * checked, as all processes in the same zone have the same label. 151 * 152 * Finally, for unlabeled packets received by a labeled system, special rules 153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a 154 * socket in the zone whose label matches the default label of the sender, if 155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the 156 * receiver's label must dominate the sender's default label. 157 * 158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack); 159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, 160 * ip_stack); 161 * 162 * Lookup routine to find an exact match for {src, dst, local port, 163 * remote port} for TCP connections in ipcl_conn_fanout. The address and 164 * ports are read from the IP and TCP header respectively. 
165 * 166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol, 167 * zoneid, ip_stack); 168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex, 169 * zoneid, ip_stack); 170 * 171 * Lookup routine to find a listener with the tuple {lport, laddr, 172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 173 * parameter interface index is also compared. 174 * 175 * void ipcl_walk(func, arg, ip_stack) 176 * 177 * Apply 'func' to every connection available. The 'func' is called as 178 * (*func)(connp, arg). The walk is non-atomic so connections may be 179 * created and destroyed during the walk. The CONN_CONDEMNED and 180 * CONN_INCIPIENT flags ensure that connections which are newly created 181 * or being destroyed are not selected by the walker. 182 * 183 * Table Updates 184 * ------------- 185 * 186 * int ipcl_conn_insert(connp, protocol, src, dst, ports) 187 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex) 188 * 189 * Insert 'connp' in the ipcl_conn_fanout. 190 * Arguments : 191 * connp conn_t to be inserted 192 * protocol connection protocol 193 * src source address 194 * dst destination address 195 * ports local and remote port 196 * ifindex interface index for IPv6 connections 197 * 198 * Return value : 199 * 0 if connp was inserted 200 * EADDRINUSE if the connection with the same tuple 201 * already exists. 202 * 203 * int ipcl_bind_insert(connp, protocol, src, lport); 204 * int ipcl_bind_insert_v6(connp, protocol, src, lport); 205 * 206 * Insert 'connp' in ipcl_bind_fanout. 207 * Arguments : 208 * connp conn_t to be inserted 209 * protocol connection protocol 210 * src source address connection wants 211 * to bind to 212 * lport local port connection wants to 213 * bind to 214 * 215 * 216 * void ipcl_hash_remove(connp); 217 * 218 * Removes the 'connp' from the connection fanout table. 
219 * 220 * Connection Creation/Destruction 221 * ------------------------------- 222 * 223 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 224 * 225 * Creates a new conn based on the type flag, inserts it into 226 * globalhash table. 227 * 228 * type: This flag determines the type of conn_t which needs to be 229 * created i.e., which kmem_cache it comes from. 230 * IPCL_TCPCONN indicates a TCP connection 231 * IPCL_SCTPCONN indicates a SCTP connection 232 * IPCL_UDPCONN indicates a UDP conn_t. 233 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 234 * IPCL_RTSCONN indicates a RTS conn_t. 235 * IPCL_IPCCONN indicates all other connections. 236 * 237 * void ipcl_conn_destroy(connp) 238 * 239 * Destroys the connection state, removes it from the global 240 * connection hash table and frees its memory. 241 */ 242 243 #include <sys/types.h> 244 #include <sys/stream.h> 245 #include <sys/stropts.h> 246 #include <sys/sysmacros.h> 247 #include <sys/strsubr.h> 248 #include <sys/strsun.h> 249 #define _SUN_TPI_VERSION 2 250 #include <sys/ddi.h> 251 #include <sys/cmn_err.h> 252 #include <sys/debug.h> 253 254 #include <sys/systm.h> 255 #include <sys/param.h> 256 #include <sys/kmem.h> 257 #include <sys/isa_defs.h> 258 #include <inet/common.h> 259 #include <netinet/ip6.h> 260 #include <netinet/icmp6.h> 261 262 #include <inet/ip.h> 263 #include <inet/ip6.h> 264 #include <inet/ip_ndp.h> 265 #include <inet/ip_impl.h> 266 #include <inet/udp_impl.h> 267 #include <inet/sctp_ip.h> 268 #include <inet/sctp/sctp_impl.h> 269 #include <inet/rawip_impl.h> 270 #include <inet/rts_impl.h> 271 272 #include <sys/cpuvar.h> 273 274 #include <inet/ipclassifier.h> 275 #include <inet/tcp.h> 276 #include <inet/ipsec_impl.h> 277 278 #include <sys/tsol/tnet.h> 279 #include <sys/sockio.h> 280 281 #ifdef DEBUG 282 #define IPCL_DEBUG 283 #else 284 #undef IPCL_DEBUG 285 #endif 286 287 #ifdef IPCL_DEBUG 288 int ipcl_debug_level = 0; 289 #define IPCL_DEBUG_LVL(level, args) \ 290 if (ipcl_debug_level & 
level) { printf args; } 291 #else 292 #define IPCL_DEBUG_LVL(level, args) {; } 293 #endif 294 /* Old value for compatibility. Setable in /etc/system */ 295 uint_t tcp_conn_hash_size = 0; 296 297 /* New value. Zero means choose automatically. Setable in /etc/system */ 298 uint_t ipcl_conn_hash_size = 0; 299 uint_t ipcl_conn_hash_memfactor = 8192; 300 uint_t ipcl_conn_hash_maxsize = 82500; 301 302 /* bind/udp fanout table size */ 303 uint_t ipcl_bind_fanout_size = 512; 304 uint_t ipcl_udp_fanout_size = 16384; 305 306 /* Raw socket fanout size. Must be a power of 2. */ 307 uint_t ipcl_raw_fanout_size = 256; 308 309 /* 310 * Power of 2^N Primes useful for hashing for N of 0-28, 311 * these primes are the nearest prime <= 2^N - 2^(N-2). 312 */ 313 314 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 315 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 316 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 317 50331599, 100663291, 201326557, 0} 318 319 /* 320 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 321 * are aligned on cache lines. 
322 */ 323 typedef union itc_s { 324 conn_t itc_conn; 325 char itcu_filler[CACHE_ALIGN(conn_s)]; 326 } itc_t; 327 328 struct kmem_cache *tcp_conn_cache; 329 struct kmem_cache *ip_conn_cache; 330 struct kmem_cache *ip_helper_stream_cache; 331 extern struct kmem_cache *sctp_conn_cache; 332 extern struct kmem_cache *tcp_sack_info_cache; 333 extern struct kmem_cache *tcp_iphc_cache; 334 struct kmem_cache *udp_conn_cache; 335 struct kmem_cache *rawip_conn_cache; 336 struct kmem_cache *rts_conn_cache; 337 338 extern void tcp_timermp_free(tcp_t *); 339 extern mblk_t *tcp_timermp_alloc(int); 340 341 static int ip_conn_constructor(void *, void *, int); 342 static void ip_conn_destructor(void *, void *); 343 344 static int tcp_conn_constructor(void *, void *, int); 345 static void tcp_conn_destructor(void *, void *); 346 347 static int udp_conn_constructor(void *, void *, int); 348 static void udp_conn_destructor(void *, void *); 349 350 static int rawip_conn_constructor(void *, void *, int); 351 static void rawip_conn_destructor(void *, void *); 352 353 static int rts_conn_constructor(void *, void *, int); 354 static void rts_conn_destructor(void *, void *); 355 356 static int ip_helper_stream_constructor(void *, void *, int); 357 static void ip_helper_stream_destructor(void *, void *); 358 359 boolean_t ip_use_helper_cache = B_TRUE; 360 361 /* 362 * Hook functions to enable cluster networking 363 * On non-clustered systems these vectors must always be NULL. 
364 */ 365 extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, 366 uint8_t *, in_port_t, void *); 367 extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, 368 uint8_t *, in_port_t, void *); 369 370 #ifdef IPCL_DEBUG 371 #define INET_NTOA_BUFSIZE 18 372 373 static char * 374 inet_ntoa_r(uint32_t in, char *b) 375 { 376 unsigned char *p; 377 378 p = (unsigned char *)∈ 379 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 380 return (b); 381 } 382 #endif 383 384 /* 385 * Global (for all stack instances) init routine 386 */ 387 void 388 ipcl_g_init(void) 389 { 390 ip_conn_cache = kmem_cache_create("ip_conn_cache", 391 sizeof (conn_t), CACHE_ALIGN_SIZE, 392 ip_conn_constructor, ip_conn_destructor, 393 NULL, NULL, NULL, 0); 394 395 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 396 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 397 tcp_conn_constructor, tcp_conn_destructor, 398 NULL, NULL, NULL, 0); 399 400 udp_conn_cache = kmem_cache_create("udp_conn_cache", 401 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 402 udp_conn_constructor, udp_conn_destructor, 403 NULL, NULL, NULL, 0); 404 405 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 406 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 407 rawip_conn_constructor, rawip_conn_destructor, 408 NULL, NULL, NULL, 0); 409 410 rts_conn_cache = kmem_cache_create("rts_conn_cache", 411 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 412 rts_conn_constructor, rts_conn_destructor, 413 NULL, NULL, NULL, 0); 414 415 if (ip_use_helper_cache) { 416 ip_helper_stream_cache = kmem_cache_create 417 ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), 418 CACHE_ALIGN_SIZE, ip_helper_stream_constructor, 419 ip_helper_stream_destructor, NULL, NULL, NULL, 0); 420 } else { 421 ip_helper_stream_cache = NULL; 422 } 423 } 424 425 /* 426 * ipclassifier intialization routine, sets up hash tables. 
427 */ 428 void 429 ipcl_init(ip_stack_t *ipst) 430 { 431 int i; 432 int sizes[] = P2Ps(); 433 434 /* 435 * Calculate size of conn fanout table from /etc/system settings 436 */ 437 if (ipcl_conn_hash_size != 0) { 438 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 439 } else if (tcp_conn_hash_size != 0) { 440 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 441 } else { 442 extern pgcnt_t freemem; 443 444 ipst->ips_ipcl_conn_fanout_size = 445 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 446 447 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 448 ipst->ips_ipcl_conn_fanout_size = 449 ipcl_conn_hash_maxsize; 450 } 451 } 452 453 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 454 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 455 break; 456 } 457 } 458 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 459 /* Out of range, use the 2^16 value */ 460 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 461 } 462 463 /* Take values from /etc/system */ 464 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 465 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 466 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 467 468 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 469 470 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 471 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 472 473 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 474 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 479 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 480 481 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 482 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 483 MUTEX_DEFAULT, NULL); 484 } 485 486 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 487 sizeof (connf_t), KM_SLEEP); 488 for (i = 0; i < IPPROTO_MAX; i++) { 489 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 490 
MUTEX_DEFAULT, NULL); 491 } 492 493 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 494 sizeof (connf_t), KM_SLEEP); 495 for (i = 0; i < IPPROTO_MAX; i++) { 496 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 497 MUTEX_DEFAULT, NULL); 498 } 499 500 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 501 mutex_init(&ipst->ips_rts_clients->connf_lock, 502 NULL, MUTEX_DEFAULT, NULL); 503 504 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 505 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 506 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 507 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 508 MUTEX_DEFAULT, NULL); 509 } 510 511 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 512 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 513 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 514 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 515 MUTEX_DEFAULT, NULL); 516 } 517 518 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 519 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 520 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 521 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 522 NULL, MUTEX_DEFAULT, NULL); 523 } 524 } 525 526 void 527 ipcl_g_destroy(void) 528 { 529 kmem_cache_destroy(ip_conn_cache); 530 kmem_cache_destroy(tcp_conn_cache); 531 kmem_cache_destroy(udp_conn_cache); 532 kmem_cache_destroy(rawip_conn_cache); 533 kmem_cache_destroy(rts_conn_cache); 534 } 535 536 /* 537 * All user-level and kernel use of the stack must be gone 538 * by now. 
539 */ 540 void 541 ipcl_destroy(ip_stack_t *ipst) 542 { 543 int i; 544 545 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 546 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 547 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 548 } 549 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 550 sizeof (connf_t)); 551 ipst->ips_ipcl_conn_fanout = NULL; 552 553 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 554 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 555 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 556 } 557 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 558 sizeof (connf_t)); 559 ipst->ips_ipcl_bind_fanout = NULL; 560 561 for (i = 0; i < IPPROTO_MAX; i++) { 562 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 563 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 564 } 565 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 566 ipst->ips_ipcl_proto_fanout = NULL; 567 568 for (i = 0; i < IPPROTO_MAX; i++) { 569 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 570 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 571 } 572 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 573 IPPROTO_MAX * sizeof (connf_t)); 574 ipst->ips_ipcl_proto_fanout_v6 = NULL; 575 576 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 577 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 578 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 579 } 580 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 581 sizeof (connf_t)); 582 ipst->ips_ipcl_udp_fanout = NULL; 583 584 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 585 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 586 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 587 } 588 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 589 sizeof (connf_t)); 590 ipst->ips_ipcl_raw_fanout = NULL; 591 592 for (i 
= 0; i < CONN_G_HASH_SIZE; i++) { 593 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 594 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 595 } 596 kmem_free(ipst->ips_ipcl_globalhash_fanout, 597 sizeof (connf_t) * CONN_G_HASH_SIZE); 598 ipst->ips_ipcl_globalhash_fanout = NULL; 599 600 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 601 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 602 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 603 ipst->ips_rts_clients = NULL; 604 } 605 606 /* 607 * conn creation routine. initialize the conn, sets the reference 608 * and inserts it in the global hash table. 609 */ 610 conn_t * 611 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 612 { 613 conn_t *connp; 614 sctp_stack_t *sctps; 615 struct kmem_cache *conn_cache; 616 617 switch (type) { 618 case IPCL_SCTPCONN: 619 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 620 return (NULL); 621 sctp_conn_init(connp); 622 sctps = ns->netstack_sctp; 623 SCTP_G_Q_REFHOLD(sctps); 624 netstack_hold(ns); 625 connp->conn_netstack = ns; 626 return (connp); 627 628 case IPCL_TCPCONN: 629 conn_cache = tcp_conn_cache; 630 break; 631 632 case IPCL_UDPCONN: 633 conn_cache = udp_conn_cache; 634 break; 635 636 case IPCL_RAWIPCONN: 637 conn_cache = rawip_conn_cache; 638 break; 639 640 case IPCL_RTSCONN: 641 conn_cache = rts_conn_cache; 642 break; 643 644 case IPCL_IPCCONN: 645 conn_cache = ip_conn_cache; 646 break; 647 648 default: 649 connp = NULL; 650 ASSERT(0); 651 } 652 653 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 654 return (NULL); 655 656 connp->conn_ref = 1; 657 netstack_hold(ns); 658 connp->conn_netstack = ns; 659 ipcl_globalhash_insert(connp); 660 return (connp); 661 } 662 663 void 664 ipcl_conn_destroy(conn_t *connp) 665 { 666 mblk_t *mp; 667 netstack_t *ns = connp->conn_netstack; 668 669 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 670 ASSERT(connp->conn_ref == 0); 671 ASSERT(connp->conn_ire_cache == NULL); 672 
673 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 674 675 if (connp->conn_peercred != NULL && 676 connp->conn_peercred != connp->conn_cred) 677 crfree(connp->conn_peercred); 678 connp->conn_peercred = NULL; 679 680 if (connp->conn_cred != NULL) { 681 crfree(connp->conn_cred); 682 connp->conn_cred = NULL; 683 } 684 685 ipcl_globalhash_remove(connp); 686 687 /* FIXME: add separate tcp_conn_free()? */ 688 if (connp->conn_flags & IPCL_TCPCONN) { 689 tcp_t *tcp = connp->conn_tcp; 690 tcp_stack_t *tcps; 691 692 ASSERT(tcp != NULL); 693 tcps = tcp->tcp_tcps; 694 if (tcps != NULL) { 695 if (connp->conn_latch != NULL) { 696 IPLATCH_REFRELE(connp->conn_latch, ns); 697 connp->conn_latch = NULL; 698 } 699 if (connp->conn_policy != NULL) { 700 IPPH_REFRELE(connp->conn_policy, ns); 701 connp->conn_policy = NULL; 702 } 703 tcp->tcp_tcps = NULL; 704 TCPS_REFRELE(tcps); 705 } 706 707 tcp_free(tcp); 708 mp = tcp->tcp_timercache; 709 tcp->tcp_cred = NULL; 710 711 if (tcp->tcp_sack_info != NULL) { 712 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 713 kmem_cache_free(tcp_sack_info_cache, 714 tcp->tcp_sack_info); 715 } 716 if (tcp->tcp_iphc != NULL) { 717 if (tcp->tcp_hdr_grown) { 718 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 719 } else { 720 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 721 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 722 } 723 tcp->tcp_iphc_len = 0; 724 } 725 ASSERT(tcp->tcp_iphc_len == 0); 726 727 /* 728 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 729 * the mblk. 
730 */ 731 if (tcp->tcp_rsrv_mp != NULL) { 732 freeb(tcp->tcp_rsrv_mp); 733 tcp->tcp_rsrv_mp = NULL; 734 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 735 } 736 737 ASSERT(connp->conn_latch == NULL); 738 ASSERT(connp->conn_policy == NULL); 739 740 if (ns != NULL) { 741 ASSERT(tcp->tcp_tcps == NULL); 742 connp->conn_netstack = NULL; 743 netstack_rele(ns); 744 } 745 746 ipcl_conn_cleanup(connp); 747 connp->conn_flags = IPCL_TCPCONN; 748 bzero(tcp, sizeof (tcp_t)); 749 750 tcp->tcp_timercache = mp; 751 tcp->tcp_connp = connp; 752 kmem_cache_free(tcp_conn_cache, connp); 753 return; 754 } 755 if (connp->conn_latch != NULL) { 756 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 757 connp->conn_latch = NULL; 758 } 759 if (connp->conn_policy != NULL) { 760 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 761 connp->conn_policy = NULL; 762 } 763 if (connp->conn_ipsec_opt_mp != NULL) { 764 freemsg(connp->conn_ipsec_opt_mp); 765 connp->conn_ipsec_opt_mp = NULL; 766 } 767 768 if (connp->conn_flags & IPCL_SCTPCONN) { 769 ASSERT(ns != NULL); 770 sctp_free(connp); 771 return; 772 } 773 774 if (ns != NULL) { 775 connp->conn_netstack = NULL; 776 netstack_rele(ns); 777 } 778 779 ipcl_conn_cleanup(connp); 780 781 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 782 if (connp->conn_flags & IPCL_UDPCONN) { 783 connp->conn_flags = IPCL_UDPCONN; 784 kmem_cache_free(udp_conn_cache, connp); 785 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 786 787 connp->conn_flags = IPCL_RAWIPCONN; 788 connp->conn_ulp = IPPROTO_ICMP; 789 kmem_cache_free(rawip_conn_cache, connp); 790 } else if (connp->conn_flags & IPCL_RTSCONN) { 791 connp->conn_flags = IPCL_RTSCONN; 792 kmem_cache_free(rts_conn_cache, connp); 793 } else { 794 connp->conn_flags = IPCL_IPCCONN; 795 ASSERT(connp->conn_flags & IPCL_IPCCONN); 796 ASSERT(connp->conn_priv == NULL); 797 kmem_cache_free(ip_conn_cache, connp); 798 } 799 } 800 801 /* 802 * Running in cluster mode - deregister listener information 803 */ 804 805 static void 806 ipcl_conn_unlisten(conn_t *connp) 807 { 808 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 809 ASSERT(connp->conn_lport != 0); 810 811 if (cl_inet_unlisten != NULL) { 812 sa_family_t addr_family; 813 uint8_t *laddrp; 814 815 if (connp->conn_pkt_isv6) { 816 addr_family = AF_INET6; 817 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 818 } else { 819 addr_family = AF_INET; 820 laddrp = (uint8_t *)&connp->conn_bound_source; 821 } 822 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 823 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 824 } 825 connp->conn_flags &= ~IPCL_CL_LISTENER; 826 } 827 828 /* 829 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 830 * which table the conn belonged to). So for debugging we can see which hash 831 * table this connection was in. 
832 */ 833 #define IPCL_HASH_REMOVE(connp) { \ 834 connf_t *connfp = (connp)->conn_fanout; \ 835 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 836 if (connfp != NULL) { \ 837 IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ 838 (void *)(connp))); \ 839 mutex_enter(&connfp->connf_lock); \ 840 if ((connp)->conn_next != NULL) \ 841 (connp)->conn_next->conn_prev = \ 842 (connp)->conn_prev; \ 843 if ((connp)->conn_prev != NULL) \ 844 (connp)->conn_prev->conn_next = \ 845 (connp)->conn_next; \ 846 else \ 847 connfp->connf_head = (connp)->conn_next; \ 848 (connp)->conn_fanout = NULL; \ 849 (connp)->conn_next = NULL; \ 850 (connp)->conn_prev = NULL; \ 851 (connp)->conn_flags |= IPCL_REMOVED; \ 852 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 853 ipcl_conn_unlisten((connp)); \ 854 CONN_DEC_REF((connp)); \ 855 mutex_exit(&connfp->connf_lock); \ 856 } \ 857 } 858 859 void 860 ipcl_hash_remove(conn_t *connp) 861 { 862 IPCL_HASH_REMOVE(connp); 863 } 864 865 /* 866 * The whole purpose of this function is allow removal of 867 * a conn_t from the connected hash for timewait reclaim. 868 * This is essentially a TW reclaim fastpath where timewait 869 * collector checks under fanout lock (so no one else can 870 * get access to the conn_t) that refcnt is 2 i.e. one for 871 * TCP and one for the classifier hash list. If ref count 872 * is indeed 2, we can just remove the conn under lock and 873 * avoid cleaning up the conn under squeue. This gives us 874 * improved performance. 
875 */ 876 void 877 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 878 { 879 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 880 ASSERT(MUTEX_HELD(&connp->conn_lock)); 881 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 882 883 if ((connp)->conn_next != NULL) { 884 (connp)->conn_next->conn_prev = (connp)->conn_prev; 885 } 886 if ((connp)->conn_prev != NULL) { 887 (connp)->conn_prev->conn_next = (connp)->conn_next; 888 } else { 889 connfp->connf_head = (connp)->conn_next; 890 } 891 (connp)->conn_fanout = NULL; 892 (connp)->conn_next = NULL; 893 (connp)->conn_prev = NULL; 894 (connp)->conn_flags |= IPCL_REMOVED; 895 ASSERT((connp)->conn_ref == 2); 896 (connp)->conn_ref--; 897 } 898 899 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 900 ASSERT((connp)->conn_fanout == NULL); \ 901 ASSERT((connp)->conn_next == NULL); \ 902 ASSERT((connp)->conn_prev == NULL); \ 903 if ((connfp)->connf_head != NULL) { \ 904 (connfp)->connf_head->conn_prev = (connp); \ 905 (connp)->conn_next = (connfp)->connf_head; \ 906 } \ 907 (connp)->conn_fanout = (connfp); \ 908 (connfp)->connf_head = (connp); \ 909 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 910 IPCL_CONNECTED; \ 911 CONN_INC_REF(connp); \ 912 } 913 914 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 915 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 916 "connp %p", (void *)(connfp), (void *)(connp))); \ 917 IPCL_HASH_REMOVE((connp)); \ 918 mutex_enter(&(connfp)->connf_lock); \ 919 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 920 mutex_exit(&(connfp)->connf_lock); \ 921 } 922 923 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 924 conn_t *pconnp = NULL, *nconnp; \ 925 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 926 "connp %p", (void *)connfp, (void *)(connp))); \ 927 IPCL_HASH_REMOVE((connp)); \ 928 mutex_enter(&(connfp)->connf_lock); \ 929 nconnp = (connfp)->connf_head; \ 930 while (nconnp != NULL && \ 931 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { 
\ 932 pconnp = nconnp; \ 933 nconnp = nconnp->conn_next; \ 934 } \ 935 if (pconnp != NULL) { \ 936 pconnp->conn_next = (connp); \ 937 (connp)->conn_prev = pconnp; \ 938 } else { \ 939 (connfp)->connf_head = (connp); \ 940 } \ 941 if (nconnp != NULL) { \ 942 (connp)->conn_next = nconnp; \ 943 nconnp->conn_prev = (connp); \ 944 } \ 945 (connp)->conn_fanout = (connfp); \ 946 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 947 IPCL_BOUND; \ 948 CONN_INC_REF(connp); \ 949 mutex_exit(&(connfp)->connf_lock); \ 950 } 951 952 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 953 conn_t **list, *prev, *next; \ 954 boolean_t isv4mapped = \ 955 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 956 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 957 "connp %p", (void *)(connfp), (void *)(connp))); \ 958 IPCL_HASH_REMOVE((connp)); \ 959 mutex_enter(&(connfp)->connf_lock); \ 960 list = &(connfp)->connf_head; \ 961 prev = NULL; \ 962 while ((next = *list) != NULL) { \ 963 if (isv4mapped && \ 964 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 965 connp->conn_zoneid == next->conn_zoneid) { \ 966 (connp)->conn_next = next; \ 967 if (prev != NULL) \ 968 prev = next->conn_prev; \ 969 next->conn_prev = (connp); \ 970 break; \ 971 } \ 972 list = &next->conn_next; \ 973 prev = next; \ 974 } \ 975 (connp)->conn_prev = prev; \ 976 *list = (connp); \ 977 (connp)->conn_fanout = (connfp); \ 978 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 979 IPCL_BOUND; \ 980 CONN_INC_REF((connp)); \ 981 mutex_exit(&(connfp)->connf_lock); \ 982 } 983 984 void 985 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 986 { 987 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 988 } 989 990 void 991 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 992 { 993 connf_t *connfp; 994 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 995 996 ASSERT(connp != NULL); 997 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 998 protocol == IPPROTO_ESP); 999 1000 
connp->conn_ulp = protocol; 1001 1002 /* Insert it in the protocol hash */ 1003 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1004 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1005 } 1006 1007 void 1008 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 1009 { 1010 connf_t *connfp; 1011 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1012 1013 ASSERT(connp != NULL); 1014 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 1015 protocol == IPPROTO_ESP); 1016 1017 connp->conn_ulp = protocol; 1018 1019 /* Insert it in the Bind Hash */ 1020 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1021 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1022 } 1023 1024 /* 1025 * This function is used only for inserting SCTP raw socket now. 1026 * This may change later. 1027 * 1028 * Note that only one raw socket can be bound to a port. The param 1029 * lport is in network byte order. 1030 */ 1031 static int 1032 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1033 { 1034 connf_t *connfp; 1035 conn_t *oconnp; 1036 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1037 1038 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1039 1040 /* Check for existing raw socket already bound to the port. 
*/ 1041 mutex_enter(&connfp->connf_lock); 1042 for (oconnp = connfp->connf_head; oconnp != NULL; 1043 oconnp = oconnp->conn_next) { 1044 if (oconnp->conn_lport == lport && 1045 oconnp->conn_zoneid == connp->conn_zoneid && 1046 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1047 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1048 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1049 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1050 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1051 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1052 &connp->conn_srcv6))) { 1053 break; 1054 } 1055 } 1056 mutex_exit(&connfp->connf_lock); 1057 if (oconnp != NULL) 1058 return (EADDRNOTAVAIL); 1059 1060 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1061 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1062 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1063 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1064 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1065 } else { 1066 IPCL_HASH_INSERT_BOUND(connfp, connp); 1067 } 1068 } else { 1069 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1070 } 1071 return (0); 1072 } 1073 1074 /* 1075 * Check for a MAC exemption conflict on a labeled system. Note that for 1076 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1077 * transport layer. This check is for binding all other protocols. 1078 * 1079 * Returns true if there's a conflict. 
1080 */ 1081 static boolean_t 1082 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1083 { 1084 connf_t *connfp; 1085 conn_t *tconn; 1086 1087 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1088 mutex_enter(&connfp->connf_lock); 1089 for (tconn = connfp->connf_head; tconn != NULL; 1090 tconn = tconn->conn_next) { 1091 /* We don't allow v4 fallback for v6 raw socket */ 1092 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1093 continue; 1094 /* If neither is exempt, then there's no conflict */ 1095 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1096 continue; 1097 /* If both are bound to different specific addrs, ok */ 1098 if (connp->conn_src != INADDR_ANY && 1099 tconn->conn_src != INADDR_ANY && 1100 connp->conn_src != tconn->conn_src) 1101 continue; 1102 /* These two conflict; fail */ 1103 break; 1104 } 1105 mutex_exit(&connfp->connf_lock); 1106 return (tconn != NULL); 1107 } 1108 1109 static boolean_t 1110 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1111 { 1112 connf_t *connfp; 1113 conn_t *tconn; 1114 1115 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1116 mutex_enter(&connfp->connf_lock); 1117 for (tconn = connfp->connf_head; tconn != NULL; 1118 tconn = tconn->conn_next) { 1119 /* We don't allow v4 fallback for v6 raw socket */ 1120 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1121 continue; 1122 /* If neither is exempt, then there's no conflict */ 1123 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1124 continue; 1125 /* If both are bound to different addrs, ok */ 1126 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1127 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1128 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1129 continue; 1130 /* These two conflict; fail */ 1131 break; 1132 } 1133 mutex_exit(&connfp->connf_lock); 1134 return (tconn != NULL); 1135 } 1136 1137 /* 1138 * (v4, v6) bind hash insertion routines 1139 */ 1140 int 1141 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1142 { 1143 connf_t *connfp; 1144 #ifdef IPCL_DEBUG 1145 char buf[INET_NTOA_BUFSIZE]; 1146 #endif 1147 int ret = 0; 1148 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1149 1150 ASSERT(connp); 1151 1152 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1153 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1154 1155 connp->conn_ulp = protocol; 1156 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1157 connp->conn_lport = lport; 1158 1159 switch (protocol) { 1160 default: 1161 if (is_system_labeled() && 1162 check_exempt_conflict_v4(connp, ipst)) 1163 return (EADDRINUSE); 1164 /* FALLTHROUGH */ 1165 case IPPROTO_UDP: 1166 if (protocol == IPPROTO_UDP) { 1167 IPCL_DEBUG_LVL(64, 1168 ("ipcl_bind_insert: connp %p - udp\n", 1169 (void *)connp)); 1170 connfp = &ipst->ips_ipcl_udp_fanout[ 1171 IPCL_UDP_HASH(lport, ipst)]; 1172 } else { 1173 IPCL_DEBUG_LVL(64, 1174 ("ipcl_bind_insert: connp %p - protocol\n", 1175 (void *)connp)); 1176 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1177 } 1178 1179 if (connp->conn_rem != INADDR_ANY) { 1180 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1181 } else if (connp->conn_src != INADDR_ANY) { 1182 IPCL_HASH_INSERT_BOUND(connfp, connp); 1183 } else { 1184 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1185 } 1186 break; 1187 1188 case IPPROTO_TCP: 1189 1190 /* Insert it in the Bind Hash */ 1191 ASSERT(connp->conn_zoneid != ALL_ZONES); 1192 connfp = &ipst->ips_ipcl_bind_fanout[ 1193 IPCL_BIND_HASH(lport, ipst)]; 1194 if (connp->conn_src != INADDR_ANY) { 1195 IPCL_HASH_INSERT_BOUND(connfp, connp); 1196 } else { 1197 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1198 } 1199 if (cl_inet_listen != NULL) { 1200 ASSERT(!connp->conn_pkt_isv6); 1201 connp->conn_flags |= IPCL_CL_LISTENER; 1202 (*cl_inet_listen)( 1203 connp->conn_netstack->netstack_stackid, 1204 IPPROTO_TCP, AF_INET, 1205 (uint8_t *)&connp->conn_bound_source, lport, NULL); 1206 } 1207 break; 1208 1209 case IPPROTO_SCTP: 
1210 ret = ipcl_sctp_hash_insert(connp, lport); 1211 break; 1212 } 1213 1214 return (ret); 1215 } 1216 1217 int 1218 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1219 uint16_t lport) 1220 { 1221 connf_t *connfp; 1222 int ret = 0; 1223 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1224 1225 ASSERT(connp); 1226 1227 connp->conn_ulp = protocol; 1228 connp->conn_srcv6 = *src; 1229 connp->conn_lport = lport; 1230 1231 switch (protocol) { 1232 default: 1233 if (is_system_labeled() && 1234 check_exempt_conflict_v6(connp, ipst)) 1235 return (EADDRINUSE); 1236 /* FALLTHROUGH */ 1237 case IPPROTO_UDP: 1238 if (protocol == IPPROTO_UDP) { 1239 IPCL_DEBUG_LVL(128, 1240 ("ipcl_bind_insert_v6: connp %p - udp\n", 1241 (void *)connp)); 1242 connfp = &ipst->ips_ipcl_udp_fanout[ 1243 IPCL_UDP_HASH(lport, ipst)]; 1244 } else { 1245 IPCL_DEBUG_LVL(128, 1246 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1247 (void *)connp)); 1248 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1249 } 1250 1251 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1252 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1253 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1254 IPCL_HASH_INSERT_BOUND(connfp, connp); 1255 } else { 1256 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1257 } 1258 break; 1259 1260 case IPPROTO_TCP: 1261 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1262 1263 /* Insert it in the Bind Hash */ 1264 ASSERT(connp->conn_zoneid != ALL_ZONES); 1265 connfp = &ipst->ips_ipcl_bind_fanout[ 1266 IPCL_BIND_HASH(lport, ipst)]; 1267 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1268 IPCL_HASH_INSERT_BOUND(connfp, connp); 1269 } else { 1270 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1271 } 1272 if (cl_inet_listen != NULL) { 1273 sa_family_t addr_family; 1274 uint8_t *laddrp; 1275 1276 if (connp->conn_pkt_isv6) { 1277 addr_family = AF_INET6; 1278 laddrp = 1279 (uint8_t *)&connp->conn_bound_source_v6; 1280 } else { 1281 addr_family = AF_INET; 1282 laddrp = (uint8_t *)&connp->conn_bound_source; 1283 } 1284 connp->conn_flags |= IPCL_CL_LISTENER; 1285 (*cl_inet_listen)( 1286 connp->conn_netstack->netstack_stackid, 1287 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1288 } 1289 break; 1290 1291 case IPPROTO_SCTP: 1292 ret = ipcl_sctp_hash_insert(connp, lport); 1293 break; 1294 } 1295 1296 return (ret); 1297 } 1298 1299 /* 1300 * ipcl_conn_hash insertion routines. 1301 */ 1302 int 1303 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1304 ipaddr_t rem, uint32_t ports) 1305 { 1306 connf_t *connfp; 1307 uint16_t *up; 1308 conn_t *tconnp; 1309 #ifdef IPCL_DEBUG 1310 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1311 #endif 1312 in_port_t lport; 1313 int ret = 0; 1314 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1315 1316 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1317 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1318 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1319 ports, protocol)); 1320 1321 switch (protocol) { 1322 case IPPROTO_TCP: 1323 if (!(connp->conn_flags & IPCL_EAGER)) { 1324 /* 1325 * for a eager connection, i.e connections which 1326 * have just been created, the initialization is 1327 * already done in ip at conn_creation time, so 1328 * we can skip the checks here. 
1329 */ 1330 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1331 } 1332 connfp = &ipst->ips_ipcl_conn_fanout[ 1333 IPCL_CONN_HASH(connp->conn_rem, 1334 connp->conn_ports, ipst)]; 1335 mutex_enter(&connfp->connf_lock); 1336 for (tconnp = connfp->connf_head; tconnp != NULL; 1337 tconnp = tconnp->conn_next) { 1338 if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1339 connp->conn_rem, connp->conn_src, 1340 connp->conn_ports)) { 1341 1342 /* Already have a conn. bail out */ 1343 mutex_exit(&connfp->connf_lock); 1344 return (EADDRINUSE); 1345 } 1346 } 1347 if (connp->conn_fanout != NULL) { 1348 /* 1349 * Probably a XTI/TLI application trying to do a 1350 * rebind. Let it happen. 1351 */ 1352 mutex_exit(&connfp->connf_lock); 1353 IPCL_HASH_REMOVE(connp); 1354 mutex_enter(&connfp->connf_lock); 1355 } 1356 1357 ASSERT(connp->conn_recv != NULL); 1358 1359 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1360 mutex_exit(&connfp->connf_lock); 1361 break; 1362 1363 case IPPROTO_SCTP: 1364 /* 1365 * The raw socket may have already been bound, remove it 1366 * from the hash first. 1367 */ 1368 IPCL_HASH_REMOVE(connp); 1369 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1370 ret = ipcl_sctp_hash_insert(connp, lport); 1371 break; 1372 1373 default: 1374 /* 1375 * Check for conflicts among MAC exempt bindings. For 1376 * transports with port numbers, this is done by the upper 1377 * level per-transport binding logic. For all others, it's 1378 * done here. 
1379 */ 1380 if (is_system_labeled() && 1381 check_exempt_conflict_v4(connp, ipst)) 1382 return (EADDRINUSE); 1383 /* FALLTHROUGH */ 1384 1385 case IPPROTO_UDP: 1386 up = (uint16_t *)&ports; 1387 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1388 if (protocol == IPPROTO_UDP) { 1389 connfp = &ipst->ips_ipcl_udp_fanout[ 1390 IPCL_UDP_HASH(up[1], ipst)]; 1391 } else { 1392 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1393 } 1394 1395 if (connp->conn_rem != INADDR_ANY) { 1396 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1397 } else if (connp->conn_src != INADDR_ANY) { 1398 IPCL_HASH_INSERT_BOUND(connfp, connp); 1399 } else { 1400 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1401 } 1402 break; 1403 } 1404 1405 return (ret); 1406 } 1407 1408 int 1409 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1410 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1411 { 1412 connf_t *connfp; 1413 uint16_t *up; 1414 conn_t *tconnp; 1415 in_port_t lport; 1416 int ret = 0; 1417 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1418 1419 switch (protocol) { 1420 case IPPROTO_TCP: 1421 /* Just need to insert a conn struct */ 1422 if (!(connp->conn_flags & IPCL_EAGER)) { 1423 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1424 } 1425 connfp = &ipst->ips_ipcl_conn_fanout[ 1426 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1427 ipst)]; 1428 mutex_enter(&connfp->connf_lock); 1429 for (tconnp = connfp->connf_head; tconnp != NULL; 1430 tconnp = tconnp->conn_next) { 1431 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1432 connp->conn_remv6, connp->conn_srcv6, 1433 connp->conn_ports) && 1434 (tconnp->conn_tcp->tcp_bound_if == 0 || 1435 tconnp->conn_tcp->tcp_bound_if == ifindex)) { 1436 /* Already have a conn. bail out */ 1437 mutex_exit(&connfp->connf_lock); 1438 return (EADDRINUSE); 1439 } 1440 } 1441 if (connp->conn_fanout != NULL) { 1442 /* 1443 * Probably a XTI/TLI application trying to do a 1444 * rebind. Let it happen. 
1445 */ 1446 mutex_exit(&connfp->connf_lock); 1447 IPCL_HASH_REMOVE(connp); 1448 mutex_enter(&connfp->connf_lock); 1449 } 1450 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1451 mutex_exit(&connfp->connf_lock); 1452 break; 1453 1454 case IPPROTO_SCTP: 1455 IPCL_HASH_REMOVE(connp); 1456 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1457 ret = ipcl_sctp_hash_insert(connp, lport); 1458 break; 1459 1460 default: 1461 if (is_system_labeled() && 1462 check_exempt_conflict_v6(connp, ipst)) 1463 return (EADDRINUSE); 1464 /* FALLTHROUGH */ 1465 case IPPROTO_UDP: 1466 up = (uint16_t *)&ports; 1467 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1468 if (protocol == IPPROTO_UDP) { 1469 connfp = &ipst->ips_ipcl_udp_fanout[ 1470 IPCL_UDP_HASH(up[1], ipst)]; 1471 } else { 1472 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1473 } 1474 1475 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1476 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1477 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1478 IPCL_HASH_INSERT_BOUND(connfp, connp); 1479 } else { 1480 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1481 } 1482 break; 1483 } 1484 1485 return (ret); 1486 } 1487 1488 /* 1489 * v4 packet classifying function. looks up the fanout table to 1490 * find the conn, the packet belongs to. returns the conn with 1491 * the reference held, null otherwise. 1492 * 1493 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1494 * Lookup" comment block are applied. Labels are also checked as described 1495 * above. If the packet is from the inside (looped back), and is from the same 1496 * zone, then label checks are omitted. 
1497 */ 1498 conn_t * 1499 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1500 ip_stack_t *ipst) 1501 { 1502 ipha_t *ipha; 1503 connf_t *connfp, *bind_connfp; 1504 uint16_t lport; 1505 uint16_t fport; 1506 uint32_t ports; 1507 conn_t *connp; 1508 uint16_t *up; 1509 boolean_t shared_addr; 1510 boolean_t unlabeled; 1511 1512 ipha = (ipha_t *)mp->b_rptr; 1513 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1514 1515 switch (protocol) { 1516 case IPPROTO_TCP: 1517 ports = *(uint32_t *)up; 1518 connfp = 1519 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1520 ports, ipst)]; 1521 mutex_enter(&connfp->connf_lock); 1522 for (connp = connfp->connf_head; connp != NULL; 1523 connp = connp->conn_next) { 1524 if (IPCL_CONN_MATCH(connp, protocol, 1525 ipha->ipha_src, ipha->ipha_dst, ports)) 1526 break; 1527 } 1528 1529 if (connp != NULL) { 1530 /* 1531 * We have a fully-bound TCP connection. 1532 * 1533 * For labeled systems, there's no need to check the 1534 * label here. It's known to be good as we checked 1535 * before allowing the connection to become bound. 1536 */ 1537 CONN_INC_REF(connp); 1538 mutex_exit(&connfp->connf_lock); 1539 return (connp); 1540 } 1541 1542 mutex_exit(&connfp->connf_lock); 1543 1544 lport = up[1]; 1545 unlabeled = B_FALSE; 1546 /* Cred cannot be null on IPv4 */ 1547 if (is_system_labeled()) 1548 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1549 TSLF_UNLABELED) != 0; 1550 shared_addr = (zoneid == ALL_ZONES); 1551 if (shared_addr) { 1552 /* 1553 * No need to handle exclusive-stack zones since 1554 * ALL_ZONES only applies to the shared stack. 1555 */ 1556 zoneid = tsol_mlp_findzone(protocol, lport); 1557 /* 1558 * If no shared MLP is found, tsol_mlp_findzone returns 1559 * ALL_ZONES. In that case, we assume it's SLP, and 1560 * search for the zone based on the packet label. 1561 * 1562 * If there is such a zone, we prefer to find a 1563 * connection in it. 
Otherwise, we look for a 1564 * MAC-exempt connection in any zone whose label 1565 * dominates the default label on the packet. 1566 */ 1567 if (zoneid == ALL_ZONES) 1568 zoneid = tsol_packet_to_zoneid(mp); 1569 else 1570 unlabeled = B_FALSE; 1571 } 1572 1573 bind_connfp = 1574 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1575 mutex_enter(&bind_connfp->connf_lock); 1576 for (connp = bind_connfp->connf_head; connp != NULL; 1577 connp = connp->conn_next) { 1578 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1579 lport) && (IPCL_ZONE_MATCH(connp, zoneid) || 1580 (unlabeled && connp->conn_mac_exempt))) 1581 break; 1582 } 1583 1584 /* 1585 * If the matching connection is SLP on a private address, then 1586 * the label on the packet must match the local zone's label. 1587 * Otherwise, it must be in the label range defined by tnrh. 1588 * This is ensured by tsol_receive_label. 1589 */ 1590 if (connp != NULL && is_system_labeled() && 1591 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1592 shared_addr, connp)) { 1593 DTRACE_PROBE3( 1594 tx__ip__log__info__classify__tcp, 1595 char *, 1596 "connp(1) could not receive mp(2)", 1597 conn_t *, connp, mblk_t *, mp); 1598 connp = NULL; 1599 } 1600 1601 if (connp != NULL) { 1602 /* Have a listener at least */ 1603 CONN_INC_REF(connp); 1604 mutex_exit(&bind_connfp->connf_lock); 1605 return (connp); 1606 } 1607 1608 mutex_exit(&bind_connfp->connf_lock); 1609 1610 IPCL_DEBUG_LVL(512, 1611 ("ipcl_classify: couldn't classify mp = %p\n", 1612 (void *)mp)); 1613 break; 1614 1615 case IPPROTO_UDP: 1616 lport = up[1]; 1617 unlabeled = B_FALSE; 1618 /* Cred cannot be null on IPv4 */ 1619 if (is_system_labeled()) 1620 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1621 TSLF_UNLABELED) != 0; 1622 shared_addr = (zoneid == ALL_ZONES); 1623 if (shared_addr) { 1624 /* 1625 * No need to handle exclusive-stack zones since 1626 * ALL_ZONES only applies to the shared stack. 
1627 */ 1628 zoneid = tsol_mlp_findzone(protocol, lport); 1629 /* 1630 * If no shared MLP is found, tsol_mlp_findzone returns 1631 * ALL_ZONES. In that case, we assume it's SLP, and 1632 * search for the zone based on the packet label. 1633 * 1634 * If there is such a zone, we prefer to find a 1635 * connection in it. Otherwise, we look for a 1636 * MAC-exempt connection in any zone whose label 1637 * dominates the default label on the packet. 1638 */ 1639 if (zoneid == ALL_ZONES) 1640 zoneid = tsol_packet_to_zoneid(mp); 1641 else 1642 unlabeled = B_FALSE; 1643 } 1644 fport = up[0]; 1645 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); 1646 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1647 mutex_enter(&connfp->connf_lock); 1648 for (connp = connfp->connf_head; connp != NULL; 1649 connp = connp->conn_next) { 1650 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1651 fport, ipha->ipha_src) && 1652 (IPCL_ZONE_MATCH(connp, zoneid) || 1653 (unlabeled && connp->conn_mac_exempt))) 1654 break; 1655 } 1656 1657 if (connp != NULL && is_system_labeled() && 1658 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1659 shared_addr, connp)) { 1660 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1661 char *, "connp(1) could not receive mp(2)", 1662 conn_t *, connp, mblk_t *, mp); 1663 connp = NULL; 1664 } 1665 1666 if (connp != NULL) { 1667 CONN_INC_REF(connp); 1668 mutex_exit(&connfp->connf_lock); 1669 return (connp); 1670 } 1671 1672 /* 1673 * We shouldn't come here for multicast/broadcast packets 1674 */ 1675 mutex_exit(&connfp->connf_lock); 1676 IPCL_DEBUG_LVL(512, 1677 ("ipcl_classify: cant find udp conn_t for ports : %x %x", 1678 lport, fport)); 1679 break; 1680 } 1681 1682 return (NULL); 1683 } 1684 1685 conn_t * 1686 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, 1687 ip_stack_t *ipst) 1688 { 1689 ip6_t *ip6h; 1690 connf_t *connfp, *bind_connfp; 1691 uint16_t lport; 1692 uint16_t fport; 1693 tcph_t 
*tcph; 1694 uint32_t ports; 1695 conn_t *connp; 1696 uint16_t *up; 1697 boolean_t shared_addr; 1698 boolean_t unlabeled; 1699 1700 ip6h = (ip6_t *)mp->b_rptr; 1701 1702 switch (protocol) { 1703 case IPPROTO_TCP: 1704 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 1705 up = (uint16_t *)tcph->th_lport; 1706 ports = *(uint32_t *)up; 1707 1708 connfp = 1709 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1710 ports, ipst)]; 1711 mutex_enter(&connfp->connf_lock); 1712 for (connp = connfp->connf_head; connp != NULL; 1713 connp = connp->conn_next) { 1714 if (IPCL_CONN_MATCH_V6(connp, protocol, 1715 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1716 break; 1717 } 1718 1719 if (connp != NULL) { 1720 /* 1721 * We have a fully-bound TCP connection. 1722 * 1723 * For labeled systems, there's no need to check the 1724 * label here. It's known to be good as we checked 1725 * before allowing the connection to become bound. 1726 */ 1727 CONN_INC_REF(connp); 1728 mutex_exit(&connfp->connf_lock); 1729 return (connp); 1730 } 1731 1732 mutex_exit(&connfp->connf_lock); 1733 1734 lport = up[1]; 1735 unlabeled = B_FALSE; 1736 /* Cred can be null on IPv6 */ 1737 if (is_system_labeled()) { 1738 cred_t *cr = DB_CRED(mp); 1739 1740 unlabeled = (cr != NULL && 1741 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1742 } 1743 shared_addr = (zoneid == ALL_ZONES); 1744 if (shared_addr) { 1745 /* 1746 * No need to handle exclusive-stack zones since 1747 * ALL_ZONES only applies to the shared stack. 1748 */ 1749 zoneid = tsol_mlp_findzone(protocol, lport); 1750 /* 1751 * If no shared MLP is found, tsol_mlp_findzone returns 1752 * ALL_ZONES. In that case, we assume it's SLP, and 1753 * search for the zone based on the packet label. 1754 * 1755 * If there is such a zone, we prefer to find a 1756 * connection in it. Otherwise, we look for a 1757 * MAC-exempt connection in any zone whose label 1758 * dominates the default label on the packet. 
1759 */ 1760 if (zoneid == ALL_ZONES) 1761 zoneid = tsol_packet_to_zoneid(mp); 1762 else 1763 unlabeled = B_FALSE; 1764 } 1765 1766 bind_connfp = 1767 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1768 mutex_enter(&bind_connfp->connf_lock); 1769 for (connp = bind_connfp->connf_head; connp != NULL; 1770 connp = connp->conn_next) { 1771 if (IPCL_BIND_MATCH_V6(connp, protocol, 1772 ip6h->ip6_dst, lport) && 1773 (IPCL_ZONE_MATCH(connp, zoneid) || 1774 (unlabeled && connp->conn_mac_exempt))) 1775 break; 1776 } 1777 1778 if (connp != NULL && is_system_labeled() && 1779 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1780 shared_addr, connp)) { 1781 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1782 char *, "connp(1) could not receive mp(2)", 1783 conn_t *, connp, mblk_t *, mp); 1784 connp = NULL; 1785 } 1786 1787 if (connp != NULL) { 1788 /* Have a listner at least */ 1789 CONN_INC_REF(connp); 1790 mutex_exit(&bind_connfp->connf_lock); 1791 IPCL_DEBUG_LVL(512, 1792 ("ipcl_classify_v6: found listner " 1793 "connp = %p\n", (void *)connp)); 1794 1795 return (connp); 1796 } 1797 1798 mutex_exit(&bind_connfp->connf_lock); 1799 1800 IPCL_DEBUG_LVL(512, 1801 ("ipcl_classify_v6: couldn't classify mp = %p\n", 1802 (void *)mp)); 1803 break; 1804 1805 case IPPROTO_UDP: 1806 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1807 lport = up[1]; 1808 unlabeled = B_FALSE; 1809 /* Cred can be null on IPv6 */ 1810 if (is_system_labeled()) { 1811 cred_t *cr = DB_CRED(mp); 1812 1813 unlabeled = (cr != NULL && 1814 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1815 } 1816 shared_addr = (zoneid == ALL_ZONES); 1817 if (shared_addr) { 1818 /* 1819 * No need to handle exclusive-stack zones since 1820 * ALL_ZONES only applies to the shared stack. 1821 */ 1822 zoneid = tsol_mlp_findzone(protocol, lport); 1823 /* 1824 * If no shared MLP is found, tsol_mlp_findzone returns 1825 * ALL_ZONES. In that case, we assume it's SLP, and 1826 * search for the zone based on the packet label. 
1827 * 1828 * If there is such a zone, we prefer to find a 1829 * connection in it. Otherwise, we look for a 1830 * MAC-exempt connection in any zone whose label 1831 * dominates the default label on the packet. 1832 */ 1833 if (zoneid == ALL_ZONES) 1834 zoneid = tsol_packet_to_zoneid(mp); 1835 else 1836 unlabeled = B_FALSE; 1837 } 1838 1839 fport = up[0]; 1840 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, 1841 fport)); 1842 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1843 mutex_enter(&connfp->connf_lock); 1844 for (connp = connfp->connf_head; connp != NULL; 1845 connp = connp->conn_next) { 1846 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1847 fport, ip6h->ip6_src) && 1848 (IPCL_ZONE_MATCH(connp, zoneid) || 1849 (unlabeled && connp->conn_mac_exempt))) 1850 break; 1851 } 1852 1853 if (connp != NULL && is_system_labeled() && 1854 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1855 shared_addr, connp)) { 1856 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1857 char *, "connp(1) could not receive mp(2)", 1858 conn_t *, connp, mblk_t *, mp); 1859 connp = NULL; 1860 } 1861 1862 if (connp != NULL) { 1863 CONN_INC_REF(connp); 1864 mutex_exit(&connfp->connf_lock); 1865 return (connp); 1866 } 1867 1868 /* 1869 * We shouldn't come here for multicast/broadcast packets 1870 */ 1871 mutex_exit(&connfp->connf_lock); 1872 IPCL_DEBUG_LVL(512, 1873 ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x", 1874 lport, fport)); 1875 break; 1876 } 1877 1878 return (NULL); 1879 } 1880 1881 /* 1882 * wrapper around ipcl_classify_(v4,v6) routines. 
1883 */ 1884 conn_t * 1885 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1886 { 1887 uint16_t hdr_len; 1888 ipha_t *ipha; 1889 uint8_t *nexthdrp; 1890 1891 if (MBLKL(mp) < sizeof (ipha_t)) 1892 return (NULL); 1893 1894 switch (IPH_HDR_VERSION(mp->b_rptr)) { 1895 case IPV4_VERSION: 1896 ipha = (ipha_t *)mp->b_rptr; 1897 hdr_len = IPH_HDR_LENGTH(ipha); 1898 return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, 1899 zoneid, ipst)); 1900 case IPV6_VERSION: 1901 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 1902 &hdr_len, &nexthdrp)) 1903 return (NULL); 1904 1905 return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); 1906 } 1907 1908 return (NULL); 1909 } 1910 1911 conn_t * 1912 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, 1913 uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) 1914 { 1915 connf_t *connfp; 1916 conn_t *connp; 1917 in_port_t lport; 1918 int af; 1919 boolean_t shared_addr; 1920 boolean_t unlabeled; 1921 const void *dst; 1922 1923 lport = ((uint16_t *)&ports)[1]; 1924 1925 unlabeled = B_FALSE; 1926 /* Cred can be null on IPv6 */ 1927 if (is_system_labeled()) { 1928 cred_t *cr = DB_CRED(mp); 1929 1930 unlabeled = (cr != NULL && 1931 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1932 } 1933 shared_addr = (zoneid == ALL_ZONES); 1934 if (shared_addr) { 1935 /* 1936 * No need to handle exclusive-stack zones since ALL_ZONES 1937 * only applies to the shared stack. 1938 */ 1939 zoneid = tsol_mlp_findzone(protocol, lport); 1940 /* 1941 * If no shared MLP is found, tsol_mlp_findzone returns 1942 * ALL_ZONES. In that case, we assume it's SLP, and search for 1943 * the zone based on the packet label. 1944 * 1945 * If there is such a zone, we prefer to find a connection in 1946 * it. Otherwise, we look for a MAC-exempt connection in any 1947 * zone whose label dominates the default label on the packet. 
1948 */ 1949 if (zoneid == ALL_ZONES) 1950 zoneid = tsol_packet_to_zoneid(mp); 1951 else 1952 unlabeled = B_FALSE; 1953 } 1954 1955 af = IPH_HDR_VERSION(hdr); 1956 dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : 1957 (const void *)&((ip6_t *)hdr)->ip6_dst; 1958 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1959 1960 mutex_enter(&connfp->connf_lock); 1961 for (connp = connfp->connf_head; connp != NULL; 1962 connp = connp->conn_next) { 1963 /* We don't allow v4 fallback for v6 raw socket. */ 1964 if (af == (connp->conn_af_isv6 ? IPV4_VERSION : 1965 IPV6_VERSION)) 1966 continue; 1967 if (connp->conn_fully_bound) { 1968 if (af == IPV4_VERSION) { 1969 if (!IPCL_CONN_MATCH(connp, protocol, 1970 hdr->ipha_src, hdr->ipha_dst, ports)) 1971 continue; 1972 } else { 1973 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1974 ((ip6_t *)hdr)->ip6_src, 1975 ((ip6_t *)hdr)->ip6_dst, ports)) 1976 continue; 1977 } 1978 } else { 1979 if (af == IPV4_VERSION) { 1980 if (!IPCL_BIND_MATCH(connp, protocol, 1981 hdr->ipha_dst, lport)) 1982 continue; 1983 } else { 1984 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1985 ((ip6_t *)hdr)->ip6_dst, lport)) 1986 continue; 1987 } 1988 } 1989 1990 if (IPCL_ZONE_MATCH(connp, zoneid) || 1991 (unlabeled && connp->conn_mac_exempt)) 1992 break; 1993 } 1994 /* 1995 * If the connection is fully-bound and connection-oriented (TCP or 1996 * SCTP), then we've already validated the remote system's label. 1997 * There's no need to do it again for every packet. 
1998 */ 1999 if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || 2000 !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && 2001 !tsol_receive_local(mp, dst, af, shared_addr, connp)) { 2002 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 2003 char *, "connp(1) could not receive mp(2)", 2004 conn_t *, connp, mblk_t *, mp); 2005 connp = NULL; 2006 } 2007 2008 if (connp != NULL) 2009 goto found; 2010 mutex_exit(&connfp->connf_lock); 2011 2012 /* Try to look for a wildcard match. */ 2013 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 2014 mutex_enter(&connfp->connf_lock); 2015 for (connp = connfp->connf_head; connp != NULL; 2016 connp = connp->conn_next) { 2017 /* We don't allow v4 fallback for v6 raw socket. */ 2018 if ((af == (connp->conn_af_isv6 ? IPV4_VERSION : 2019 IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { 2020 continue; 2021 } 2022 if (af == IPV4_VERSION) { 2023 if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) 2024 break; 2025 } else { 2026 if (IPCL_RAW_MATCH_V6(connp, protocol, 2027 ((ip6_t *)hdr)->ip6_dst)) { 2028 break; 2029 } 2030 } 2031 } 2032 2033 if (connp != NULL) 2034 goto found; 2035 2036 mutex_exit(&connfp->connf_lock); 2037 return (NULL); 2038 2039 found: 2040 ASSERT(connp != NULL); 2041 CONN_INC_REF(connp); 2042 mutex_exit(&connfp->connf_lock); 2043 return (connp); 2044 } 2045 2046 /* ARGSUSED */ 2047 static int 2048 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2049 { 2050 itc_t *itc = (itc_t *)buf; 2051 conn_t *connp = &itc->itc_conn; 2052 tcp_t *tcp = (tcp_t *)&itc[1]; 2053 2054 bzero(connp, sizeof (conn_t)); 2055 bzero(tcp, sizeof (tcp_t)); 2056 2057 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2058 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2059 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 2060 tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); 2061 connp->conn_tcp = tcp; 2062 connp->conn_flags = IPCL_TCPCONN; 2063 connp->conn_ulp = IPPROTO_TCP; 2064 
tcp->tcp_connp = connp; 2065 return (0); 2066 } 2067 2068 /* ARGSUSED */ 2069 static void 2070 tcp_conn_destructor(void *buf, void *cdrarg) 2071 { 2072 itc_t *itc = (itc_t *)buf; 2073 conn_t *connp = &itc->itc_conn; 2074 tcp_t *tcp = (tcp_t *)&itc[1]; 2075 2076 ASSERT(connp->conn_flags & IPCL_TCPCONN); 2077 ASSERT(tcp->tcp_connp == connp); 2078 ASSERT(connp->conn_tcp == tcp); 2079 tcp_timermp_free(tcp); 2080 mutex_destroy(&connp->conn_lock); 2081 cv_destroy(&connp->conn_cv); 2082 cv_destroy(&connp->conn_sq_cv); 2083 } 2084 2085 /* ARGSUSED */ 2086 static int 2087 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2088 { 2089 itc_t *itc = (itc_t *)buf; 2090 conn_t *connp = &itc->itc_conn; 2091 2092 bzero(connp, sizeof (conn_t)); 2093 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2094 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2095 connp->conn_flags = IPCL_IPCCONN; 2096 2097 return (0); 2098 } 2099 2100 /* ARGSUSED */ 2101 static void 2102 ip_conn_destructor(void *buf, void *cdrarg) 2103 { 2104 itc_t *itc = (itc_t *)buf; 2105 conn_t *connp = &itc->itc_conn; 2106 2107 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2108 ASSERT(connp->conn_priv == NULL); 2109 mutex_destroy(&connp->conn_lock); 2110 cv_destroy(&connp->conn_cv); 2111 } 2112 2113 /* ARGSUSED */ 2114 static int 2115 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2116 { 2117 itc_t *itc = (itc_t *)buf; 2118 conn_t *connp = &itc->itc_conn; 2119 udp_t *udp = (udp_t *)&itc[1]; 2120 2121 bzero(connp, sizeof (conn_t)); 2122 bzero(udp, sizeof (udp_t)); 2123 2124 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2125 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2126 connp->conn_udp = udp; 2127 connp->conn_flags = IPCL_UDPCONN; 2128 connp->conn_ulp = IPPROTO_UDP; 2129 udp->udp_connp = connp; 2130 return (0); 2131 } 2132 2133 /* ARGSUSED */ 2134 static void 2135 udp_conn_destructor(void *buf, void *cdrarg) 2136 { 2137 itc_t *itc = (itc_t *)buf; 2138 conn_t *connp = 
&itc->itc_conn; 2139 udp_t *udp = (udp_t *)&itc[1]; 2140 2141 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2142 ASSERT(udp->udp_connp == connp); 2143 ASSERT(connp->conn_udp == udp); 2144 mutex_destroy(&connp->conn_lock); 2145 cv_destroy(&connp->conn_cv); 2146 } 2147 2148 /* ARGSUSED */ 2149 static int 2150 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2151 { 2152 itc_t *itc = (itc_t *)buf; 2153 conn_t *connp = &itc->itc_conn; 2154 icmp_t *icmp = (icmp_t *)&itc[1]; 2155 2156 bzero(connp, sizeof (conn_t)); 2157 bzero(icmp, sizeof (icmp_t)); 2158 2159 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2160 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2161 connp->conn_icmp = icmp; 2162 connp->conn_flags = IPCL_RAWIPCONN; 2163 connp->conn_ulp = IPPROTO_ICMP; 2164 icmp->icmp_connp = connp; 2165 return (0); 2166 } 2167 2168 /* ARGSUSED */ 2169 static void 2170 rawip_conn_destructor(void *buf, void *cdrarg) 2171 { 2172 itc_t *itc = (itc_t *)buf; 2173 conn_t *connp = &itc->itc_conn; 2174 icmp_t *icmp = (icmp_t *)&itc[1]; 2175 2176 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2177 ASSERT(icmp->icmp_connp == connp); 2178 ASSERT(connp->conn_icmp == icmp); 2179 mutex_destroy(&connp->conn_lock); 2180 cv_destroy(&connp->conn_cv); 2181 } 2182 2183 /* ARGSUSED */ 2184 static int 2185 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2186 { 2187 itc_t *itc = (itc_t *)buf; 2188 conn_t *connp = &itc->itc_conn; 2189 rts_t *rts = (rts_t *)&itc[1]; 2190 2191 bzero(connp, sizeof (conn_t)); 2192 bzero(rts, sizeof (rts_t)); 2193 2194 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2195 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2196 connp->conn_rts = rts; 2197 connp->conn_flags = IPCL_RTSCONN; 2198 rts->rts_connp = connp; 2199 return (0); 2200 } 2201 2202 /* ARGSUSED */ 2203 static void 2204 rts_conn_destructor(void *buf, void *cdrarg) 2205 { 2206 itc_t *itc = (itc_t *)buf; 2207 conn_t *connp = &itc->itc_conn; 2208 rts_t *rts = (rts_t 
*)&itc[1]; 2209 2210 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2211 ASSERT(rts->rts_connp == connp); 2212 ASSERT(connp->conn_rts == rts); 2213 mutex_destroy(&connp->conn_lock); 2214 cv_destroy(&connp->conn_cv); 2215 } 2216 2217 /* ARGSUSED */ 2218 int 2219 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) 2220 { 2221 int error; 2222 netstack_t *ns; 2223 int ret; 2224 tcp_stack_t *tcps; 2225 ip_helper_stream_info_t *ip_helper_str; 2226 ip_stack_t *ipst; 2227 2228 ns = netstack_find_by_cred(kcred); 2229 ASSERT(ns != NULL); 2230 tcps = ns->netstack_tcp; 2231 ipst = ns->netstack_ip; 2232 ASSERT(tcps != NULL); 2233 ip_helper_str = (ip_helper_stream_info_t *)buf; 2234 2235 error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, 2236 &ip_helper_str->ip_helper_stream_handle, ipst->ips_ldi_ident); 2237 if (error != 0) { 2238 goto done; 2239 } 2240 error = ldi_ioctl(ip_helper_str->ip_helper_stream_handle, 2241 SIOCSQPTR, (intptr_t)buf, FKIOCTL, kcred, &ret); 2242 if (error != 0) { 2243 (void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0, 2244 kcred); 2245 } 2246 done: 2247 netstack_rele(ipst->ips_netstack); 2248 return (error); 2249 } 2250 2251 /* ARGSUSED */ 2252 static void 2253 ip_helper_stream_destructor(void *buf, void *cdrarg) 2254 { 2255 ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; 2256 2257 ip_helper_str->ip_helper_stream_rq->q_ptr = 2258 ip_helper_str->ip_helper_stream_wq->q_ptr = 2259 ip_helper_str->ip_helper_stream_minfo; 2260 (void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0, kcred); 2261 } 2262 2263 2264 /* 2265 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2266 * in the conn_t. 
2267 */ 2268 void 2269 ipcl_conn_cleanup(conn_t *connp) 2270 { 2271 ASSERT(connp->conn_ire_cache == NULL); 2272 ASSERT(connp->conn_latch == NULL); 2273 #ifdef notdef 2274 ASSERT(connp->conn_rq == NULL); 2275 ASSERT(connp->conn_wq == NULL); 2276 #endif 2277 ASSERT(connp->conn_cred == NULL); 2278 ASSERT(connp->conn_g_fanout == NULL); 2279 ASSERT(connp->conn_g_next == NULL); 2280 ASSERT(connp->conn_g_prev == NULL); 2281 ASSERT(connp->conn_policy == NULL); 2282 ASSERT(connp->conn_fanout == NULL); 2283 ASSERT(connp->conn_next == NULL); 2284 ASSERT(connp->conn_prev == NULL); 2285 #ifdef notdef 2286 /* 2287 * The ill and ipif pointers are not cleared before the conn_t 2288 * goes away since they do not hold a reference on the ill/ipif. 2289 * We should replace these pointers with ifindex/ipaddr_t to 2290 * make the code less complex. 2291 */ 2292 ASSERT(connp->conn_xmit_if_ill == NULL); 2293 ASSERT(connp->conn_nofailover_ill == NULL); 2294 ASSERT(connp->conn_outgoing_ill == NULL); 2295 ASSERT(connp->conn_incoming_ill == NULL); 2296 ASSERT(connp->conn_outgoing_pill == NULL); 2297 ASSERT(connp->conn_multicast_ipif == NULL); 2298 ASSERT(connp->conn_multicast_ill == NULL); 2299 #endif 2300 ASSERT(connp->conn_oper_pending_ill == NULL); 2301 ASSERT(connp->conn_ilg == NULL); 2302 ASSERT(connp->conn_drain_next == NULL); 2303 ASSERT(connp->conn_drain_prev == NULL); 2304 #ifdef notdef 2305 /* conn_idl is not cleared when removed from idl list */ 2306 ASSERT(connp->conn_idl == NULL); 2307 #endif 2308 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2309 ASSERT(connp->conn_peercred == NULL); 2310 ASSERT(connp->conn_netstack == NULL); 2311 2312 ASSERT(connp->conn_helper_info == NULL); 2313 /* Clear out the conn_t fields that are not preserved */ 2314 bzero(&connp->conn_start_clr, 2315 sizeof (conn_t) - 2316 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2317 } 2318 2319 /* 2320 * All conns are inserted in a global multi-list for the benefit of 2321 * walkers. 
The walk is guaranteed to walk all open conns at the time 2322 * of the start of the walk exactly once. This property is needed to 2323 * achieve some cleanups during unplumb of interfaces. This is achieved 2324 * as follows. 2325 * 2326 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2327 * call the insert and delete functions below at creation and deletion 2328 * time respectively. The conn never moves or changes its position in this 2329 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2330 * won't increase due to walkers, once the conn deletion has started. Note 2331 * that we can't remove the conn from the global list and then wait for 2332 * the refcnt to drop to zero, since walkers would then see a truncated 2333 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2334 * conns until ip_open is ready to make them globally visible. 2335 * The global round robin multi-list locks are held only to get the 2336 * next member/insertion/deletion and contention should be negligible 2337 * if the multi-list is much greater than the number of cpus. 2338 */ 2339 void 2340 ipcl_globalhash_insert(conn_t *connp) 2341 { 2342 int index; 2343 struct connf_s *connfp; 2344 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2345 2346 /* 2347 * No need for atomic here. Approximate even distribution 2348 * in the global lists is sufficient. 2349 */ 2350 ipst->ips_conn_g_index++; 2351 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2352 2353 connp->conn_g_prev = NULL; 2354 /* 2355 * Mark as INCIPIENT, so that walkers will ignore this 2356 * for now, till ip_open is ready to make it visible globally. 
2357 */ 2358 connp->conn_state_flags |= CONN_INCIPIENT; 2359 2360 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2361 /* Insert at the head of the list */ 2362 mutex_enter(&connfp->connf_lock); 2363 connp->conn_g_next = connfp->connf_head; 2364 if (connp->conn_g_next != NULL) 2365 connp->conn_g_next->conn_g_prev = connp; 2366 connfp->connf_head = connp; 2367 2368 /* The fanout bucket this conn points to */ 2369 connp->conn_g_fanout = connfp; 2370 2371 mutex_exit(&connfp->connf_lock); 2372 } 2373 2374 void 2375 ipcl_globalhash_remove(conn_t *connp) 2376 { 2377 struct connf_s *connfp; 2378 2379 /* 2380 * We were never inserted in the global multi list. 2381 * IPCL_NONE variety is never inserted in the global multilist 2382 * since it is presumed to not need any cleanup and is transient. 2383 */ 2384 if (connp->conn_g_fanout == NULL) 2385 return; 2386 2387 connfp = connp->conn_g_fanout; 2388 mutex_enter(&connfp->connf_lock); 2389 if (connp->conn_g_prev != NULL) 2390 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2391 else 2392 connfp->connf_head = connp->conn_g_next; 2393 if (connp->conn_g_next != NULL) 2394 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2395 mutex_exit(&connfp->connf_lock); 2396 2397 /* Better to stumble on a null pointer than to corrupt memory */ 2398 connp->conn_g_next = NULL; 2399 connp->conn_g_prev = NULL; 2400 connp->conn_g_fanout = NULL; 2401 } 2402 2403 /* 2404 * Walk the list of all conn_t's in the system, calling the function provided 2405 * with the specified argument for each. 2406 * Applies to both IPv4 and IPv6. 2407 * 2408 * IPCs may hold pointers to ipif/ill. To guard against stale pointers 2409 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2410 * unplumbed or removed. New conn_t's that are created while we are walking 2411 * may be missed by this walk, because they are not necessarily inserted 2412 * at the tail of the list. 
They are new conn_t's and thus don't have any 2413 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2414 * is created to the struct that is going away. 2415 */ 2416 void 2417 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2418 { 2419 int i; 2420 conn_t *connp; 2421 conn_t *prev_connp; 2422 2423 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2424 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2425 prev_connp = NULL; 2426 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2427 while (connp != NULL) { 2428 mutex_enter(&connp->conn_lock); 2429 if (connp->conn_state_flags & 2430 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2431 mutex_exit(&connp->conn_lock); 2432 connp = connp->conn_g_next; 2433 continue; 2434 } 2435 CONN_INC_REF_LOCKED(connp); 2436 mutex_exit(&connp->conn_lock); 2437 mutex_exit( 2438 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2439 (*func)(connp, arg); 2440 if (prev_connp != NULL) 2441 CONN_DEC_REF(prev_connp); 2442 mutex_enter( 2443 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2444 prev_connp = connp; 2445 connp = connp->conn_g_next; 2446 } 2447 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2448 if (prev_connp != NULL) 2449 CONN_DEC_REF(prev_connp); 2450 } 2451 } 2452 2453 /* 2454 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2455 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2456 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2457 * (peer tcp in ESTABLISHED state). 2458 */ 2459 conn_t * 2460 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, 2461 ip_stack_t *ipst) 2462 { 2463 uint32_t ports; 2464 uint16_t *pports = (uint16_t *)&ports; 2465 connf_t *connfp; 2466 conn_t *tconnp; 2467 boolean_t zone_chk; 2468 2469 /* 2470 * If either the source of destination address is loopback, then 2471 * both endpoints must be in the same Zone. 
Otherwise, both of 2472 * the addresses are system-wide unique (tcp is in ESTABLISHED 2473 * state) and the endpoints may reside in different Zones. 2474 */ 2475 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2476 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2477 2478 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2479 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2480 2481 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2482 ports, ipst)]; 2483 2484 mutex_enter(&connfp->connf_lock); 2485 for (tconnp = connfp->connf_head; tconnp != NULL; 2486 tconnp = tconnp->conn_next) { 2487 2488 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2489 ipha->ipha_dst, ipha->ipha_src, ports) && 2490 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2491 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2492 2493 ASSERT(tconnp != connp); 2494 CONN_INC_REF(tconnp); 2495 mutex_exit(&connfp->connf_lock); 2496 return (tconnp); 2497 } 2498 } 2499 mutex_exit(&connfp->connf_lock); 2500 return (NULL); 2501 } 2502 2503 /* 2504 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2505 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2506 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2507 * (peer tcp in ESTABLISHED state). 2508 */ 2509 conn_t * 2510 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, 2511 ip_stack_t *ipst) 2512 { 2513 uint32_t ports; 2514 uint16_t *pports = (uint16_t *)&ports; 2515 connf_t *connfp; 2516 conn_t *tconnp; 2517 boolean_t zone_chk; 2518 2519 /* 2520 * If either the source of destination address is loopback, then 2521 * both endpoints must be in the same Zone. Otherwise, both of 2522 * the addresses are system-wide unique (tcp is in ESTABLISHED 2523 * state) and the endpoints may reside in different Zones. 
We 2524 * don't do Zone check for link local address(es) because the 2525 * current Zone implementation treats each link local address as 2526 * being unique per system node, i.e. they belong to global Zone. 2527 */ 2528 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2529 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2530 2531 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2532 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2533 2534 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2535 ports, ipst)]; 2536 2537 mutex_enter(&connfp->connf_lock); 2538 for (tconnp = connfp->connf_head; tconnp != NULL; 2539 tconnp = tconnp->conn_next) { 2540 2541 /* We skip tcp_bound_if check here as this is loopback tcp */ 2542 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2543 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2544 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2545 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2546 2547 ASSERT(tconnp != connp); 2548 CONN_INC_REF(tconnp); 2549 mutex_exit(&connfp->connf_lock); 2550 return (tconnp); 2551 } 2552 } 2553 mutex_exit(&connfp->connf_lock); 2554 return (NULL); 2555 } 2556 2557 /* 2558 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2559 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2560 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2561 */ 2562 conn_t * 2563 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, 2564 ip_stack_t *ipst) 2565 { 2566 uint32_t ports; 2567 uint16_t *pports; 2568 connf_t *connfp; 2569 conn_t *tconnp; 2570 2571 pports = (uint16_t *)&ports; 2572 bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); 2573 bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); 2574 2575 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2576 ports, ipst)]; 2577 2578 mutex_enter(&connfp->connf_lock); 2579 for (tconnp = connfp->connf_head; tconnp != NULL; 2580 tconnp = tconnp->conn_next) { 2581 2582 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2583 ipha->ipha_dst, ipha->ipha_src, ports) && 2584 tconnp->conn_tcp->tcp_state >= min_state) { 2585 2586 CONN_INC_REF(tconnp); 2587 mutex_exit(&connfp->connf_lock); 2588 return (tconnp); 2589 } 2590 } 2591 mutex_exit(&connfp->connf_lock); 2592 return (NULL); 2593 } 2594 2595 /* 2596 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2597 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2598 * Only checks for connected entries i.e. no INADDR_ANY checks. 2599 * Match on ifindex in addition to addresses. 
2600 */ 2601 conn_t * 2602 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2603 uint_t ifindex, ip_stack_t *ipst) 2604 { 2605 tcp_t *tcp; 2606 uint32_t ports; 2607 uint16_t *pports; 2608 connf_t *connfp; 2609 conn_t *tconnp; 2610 2611 pports = (uint16_t *)&ports; 2612 pports[0] = tcpha->tha_fport; 2613 pports[1] = tcpha->tha_lport; 2614 2615 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2616 ports, ipst)]; 2617 2618 mutex_enter(&connfp->connf_lock); 2619 for (tconnp = connfp->connf_head; tconnp != NULL; 2620 tconnp = tconnp->conn_next) { 2621 2622 tcp = tconnp->conn_tcp; 2623 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2624 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2625 tcp->tcp_state >= min_state && 2626 (tcp->tcp_bound_if == 0 || 2627 tcp->tcp_bound_if == ifindex)) { 2628 2629 CONN_INC_REF(tconnp); 2630 mutex_exit(&connfp->connf_lock); 2631 return (tconnp); 2632 } 2633 } 2634 mutex_exit(&connfp->connf_lock); 2635 return (NULL); 2636 } 2637 2638 /* 2639 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2640 * a listener when changing state. 2641 */ 2642 conn_t * 2643 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2644 ip_stack_t *ipst) 2645 { 2646 connf_t *bind_connfp; 2647 conn_t *connp; 2648 tcp_t *tcp; 2649 2650 /* 2651 * Avoid false matches for packets sent to an IP destination of 2652 * all zeros. 
2653 */ 2654 if (laddr == 0) 2655 return (NULL); 2656 2657 ASSERT(zoneid != ALL_ZONES); 2658 2659 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2660 mutex_enter(&bind_connfp->connf_lock); 2661 for (connp = bind_connfp->connf_head; connp != NULL; 2662 connp = connp->conn_next) { 2663 tcp = connp->conn_tcp; 2664 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2665 IPCL_ZONE_MATCH(connp, zoneid) && 2666 (tcp->tcp_listener == NULL)) { 2667 CONN_INC_REF(connp); 2668 mutex_exit(&bind_connfp->connf_lock); 2669 return (connp); 2670 } 2671 } 2672 mutex_exit(&bind_connfp->connf_lock); 2673 return (NULL); 2674 } 2675 2676 /* 2677 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2678 * a listener when changing state. 2679 */ 2680 conn_t * 2681 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2682 zoneid_t zoneid, ip_stack_t *ipst) 2683 { 2684 connf_t *bind_connfp; 2685 conn_t *connp = NULL; 2686 tcp_t *tcp; 2687 2688 /* 2689 * Avoid false matches for packets sent to an IP destination of 2690 * all zeros. 2691 */ 2692 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2693 return (NULL); 2694 2695 ASSERT(zoneid != ALL_ZONES); 2696 2697 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2698 mutex_enter(&bind_connfp->connf_lock); 2699 for (connp = bind_connfp->connf_head; connp != NULL; 2700 connp = connp->conn_next) { 2701 tcp = connp->conn_tcp; 2702 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2703 IPCL_ZONE_MATCH(connp, zoneid) && 2704 (tcp->tcp_bound_if == 0 || 2705 tcp->tcp_bound_if == ifindex) && 2706 tcp->tcp_listener == NULL) { 2707 CONN_INC_REF(connp); 2708 mutex_exit(&bind_connfp->connf_lock); 2709 return (connp); 2710 } 2711 } 2712 mutex_exit(&bind_connfp->connf_lock); 2713 return (NULL); 2714 } 2715 2716 /* 2717 * ipcl_get_next_conn 2718 * get the next entry in the conn global list 2719 * and put a reference on the next_conn. 
2720 * decrement the reference on the current conn. 2721 * 2722 * This is an iterator based walker function that also provides for 2723 * some selection by the caller. It walks through the conn_hash bucket 2724 * searching for the next valid connp in the list, and selects connections 2725 * that are neither closed nor condemned. It also REFHOLDS the conn 2726 * thus ensuring that the conn exists when the caller uses the conn. 2727 */ 2728 conn_t * 2729 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2730 { 2731 conn_t *next_connp; 2732 2733 if (connfp == NULL) 2734 return (NULL); 2735 2736 mutex_enter(&connfp->connf_lock); 2737 2738 next_connp = (connp == NULL) ? 2739 connfp->connf_head : connp->conn_g_next; 2740 2741 while (next_connp != NULL) { 2742 mutex_enter(&next_connp->conn_lock); 2743 if (!(next_connp->conn_flags & conn_flags) || 2744 (next_connp->conn_state_flags & 2745 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2746 /* 2747 * This conn has been condemned or 2748 * is closing, or the flags don't match 2749 */ 2750 mutex_exit(&next_connp->conn_lock); 2751 next_connp = next_connp->conn_g_next; 2752 continue; 2753 } 2754 CONN_INC_REF_LOCKED(next_connp); 2755 mutex_exit(&next_connp->conn_lock); 2756 break; 2757 } 2758 2759 mutex_exit(&connfp->connf_lock); 2760 2761 if (connp != NULL) 2762 CONN_DEC_REF(connp); 2763 2764 return (next_connp); 2765 } 2766 2767 #ifdef CONN_DEBUG 2768 /* 2769 * Trace of the last NBUF refhold/refrele 2770 */ 2771 int 2772 conn_trace_ref(conn_t *connp) 2773 { 2774 int last; 2775 conn_trace_t *ctb; 2776 2777 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2778 last = connp->conn_trace_last; 2779 last++; 2780 if (last == CONN_TRACE_MAX) 2781 last = 0; 2782 2783 ctb = &connp->conn_trace_buf[last]; 2784 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2785 connp->conn_trace_last = last; 2786 return (1); 2787 } 2788 2789 int 2790 conn_untrace_ref(conn_t *connp) 2791 { 2792 int last; 2793 conn_trace_t *ctb; 2794 2795 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2796 last = connp->conn_trace_last; 2797 last++; 2798 if (last == CONN_TRACE_MAX) 2799 last = 0; 2800 2801 ctb = &connp->conn_trace_buf[last]; 2802 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2803 connp->conn_trace_last = last; 2804 return (1); 2805 } 2806 #endif 2807