1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * IP PACKET CLASSIFIER 28 * 29 * The IP packet classifier provides mapping between IP packets and persistent 30 * connection state for connection-oriented protocols. It also provides 31 * interface for managing connection states. 32 * 33 * The connection state is kept in conn_t data structure and contains, among 34 * other things: 35 * 36 * o local/remote address and ports 37 * o Transport protocol 38 * o squeue for the connection (for TCP only) 39 * o reference counter 40 * o Connection state 41 * o hash table linkage 42 * o interface/ire information 43 * o credentials 44 * o ipsec policy 45 * o send and receive functions. 46 * o mutex lock. 47 * 48 * Connections use a reference counting scheme. They are freed when the 49 * reference counter drops to zero. 
A reference is incremented when connection 50 * is placed in a list or table, when incoming packet for the connection arrives 51 * and when connection is processed via squeue (squeue processing may be 52 * asynchronous and the reference protects the connection from being destroyed 53 * before its processing is finished). 54 * 55 * send and receive functions are currently used for TCP only. The send function 56 * determines the IP entry point for the packet once it leaves TCP to be sent to 57 * the destination address. The receive function is used by IP when the packet 58 * should be passed for TCP processing. When a new connection is created these 59 * are set to ip_output() and tcp_input() respectively. During the lifetime of 60 * the connection the send and receive functions may change depending on the 61 * changes in the connection state. For example, Once the connection is bound to 62 * an addresse, the receive function for this connection is set to 63 * tcp_conn_request(). This allows incoming SYNs to go directly into the 64 * listener SYN processing function without going to tcp_input() first. 65 * 66 * Classifier uses several hash tables: 67 * 68 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 69 * ipcl_bind_fanout: contains all connections in BOUND state 70 * ipcl_proto_fanout: IPv4 protocol fanout 71 * ipcl_proto_fanout_v6: IPv6 protocol fanout 72 * ipcl_udp_fanout: contains all UDP connections 73 * ipcl_globalhash_fanout: contains all connections 74 * 75 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 76 * which need to view all existing connections. 77 * 78 * All tables are protected by per-bucket locks. When both per-bucket lock and 79 * connection lock need to be held, the per-bucket lock should be acquired 80 * first, followed by the connection lock. 81 * 82 * All functions doing search in one of these tables increment a reference 83 * counter on the connection found (if any). 
This reference should be dropped 84 * when the caller has finished processing the connection. 85 * 86 * 87 * INTERFACES: 88 * =========== 89 * 90 * Connection Lookup: 91 * ------------------ 92 * 93 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) 94 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) 95 * 96 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if 97 * it can't find any associated connection. If the connection is found, its 98 * reference counter is incremented. 99 * 100 * mp: mblock, containing packet header. The full header should fit 101 * into a single mblock. It should also contain at least full IP 102 * and TCP or UDP header. 103 * 104 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 105 * 106 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 107 * the packet. 108 * 109 * zoneid: The zone in which the returned connection must be; the zoneid 110 * corresponding to the ire_zoneid on the IRE located for the 111 * packet's destination address. 112 * 113 * For TCP connections, the lookup order is as follows: 114 * 5-tuple {src, dst, protocol, local port, remote port} 115 * lookup in ipcl_conn_fanout table. 116 * 3-tuple {dst, remote port, protocol} lookup in 117 * ipcl_bind_fanout table. 118 * 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 120 * remote port} lookup is done on ipcl_udp_fanout. Note that, 121 * these interfaces do not handle cases where a packets belongs 122 * to multiple UDP clients, which is handled in IP itself. 123 * 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 125 * determine which actual zone gets the segment. This is used only in a 126 * labeled environment. The matching rules are: 127 * 128 * - If it's not a multilevel port, then the label on the packet selects 129 * the zone. Unlabeled packets are delivered to the global zone. 
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked. For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet. For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set. This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary. (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
219 * 220 * Connection Creation/Destruction 221 * ------------------------------- 222 * 223 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 224 * 225 * Creates a new conn based on the type flag, inserts it into 226 * globalhash table. 227 * 228 * type: This flag determines the type of conn_t which needs to be 229 * created i.e., which kmem_cache it comes from. 230 * IPCL_TCPCONN indicates a TCP connection 231 * IPCL_SCTPCONN indicates a SCTP connection 232 * IPCL_UDPCONN indicates a UDP conn_t. 233 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 234 * IPCL_RTSCONN indicates a RTS conn_t. 235 * IPCL_IPCCONN indicates all other connections. 236 * 237 * void ipcl_conn_destroy(connp) 238 * 239 * Destroys the connection state, removes it from the global 240 * connection hash table and frees its memory. 241 */ 242 243 #include <sys/types.h> 244 #include <sys/stream.h> 245 #include <sys/stropts.h> 246 #include <sys/sysmacros.h> 247 #include <sys/strsubr.h> 248 #include <sys/strsun.h> 249 #define _SUN_TPI_VERSION 2 250 #include <sys/ddi.h> 251 #include <sys/cmn_err.h> 252 #include <sys/debug.h> 253 254 #include <sys/systm.h> 255 #include <sys/param.h> 256 #include <sys/kmem.h> 257 #include <sys/isa_defs.h> 258 #include <inet/common.h> 259 #include <netinet/ip6.h> 260 #include <netinet/icmp6.h> 261 262 #include <inet/ip.h> 263 #include <inet/ip6.h> 264 #include <inet/ip_ndp.h> 265 #include <inet/ip_impl.h> 266 #include <inet/udp_impl.h> 267 #include <inet/sctp_ip.h> 268 #include <inet/sctp/sctp_impl.h> 269 #include <inet/rawip_impl.h> 270 #include <inet/rts_impl.h> 271 272 #include <sys/cpuvar.h> 273 274 #include <inet/ipclassifier.h> 275 #include <inet/tcp.h> 276 #include <inet/ipsec_impl.h> 277 278 #include <sys/tsol/tnet.h> 279 #include <sys/sockio.h> 280 281 #ifdef DEBUG 282 #define IPCL_DEBUG 283 #else 284 #undef IPCL_DEBUG 285 #endif 286 287 #ifdef IPCL_DEBUG 288 int ipcl_debug_level = 0; 289 #define IPCL_DEBUG_LVL(level, args) \ 290 if (ipcl_debug_level & 
level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif

/* Old value for compatibility. Settable in /etc/system. */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. Settable in /etc/system. */
uint_t ipcl_conn_hash_size = 0;
/* Bytes of physical memory per conn-fanout bucket when auto-sizing. */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound on the auto-sized conn fanout table. */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size. Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
322 */ 323 typedef union itc_s { 324 conn_t itc_conn; 325 char itcu_filler[CACHE_ALIGN(conn_s)]; 326 } itc_t; 327 328 struct kmem_cache *tcp_conn_cache; 329 struct kmem_cache *ip_conn_cache; 330 struct kmem_cache *ip_helper_stream_cache; 331 extern struct kmem_cache *sctp_conn_cache; 332 extern struct kmem_cache *tcp_sack_info_cache; 333 extern struct kmem_cache *tcp_iphc_cache; 334 struct kmem_cache *udp_conn_cache; 335 struct kmem_cache *rawip_conn_cache; 336 struct kmem_cache *rts_conn_cache; 337 338 extern void tcp_timermp_free(tcp_t *); 339 extern mblk_t *tcp_timermp_alloc(int); 340 341 static int ip_conn_constructor(void *, void *, int); 342 static void ip_conn_destructor(void *, void *); 343 344 static int tcp_conn_constructor(void *, void *, int); 345 static void tcp_conn_destructor(void *, void *); 346 347 static int udp_conn_constructor(void *, void *, int); 348 static void udp_conn_destructor(void *, void *); 349 350 static int rawip_conn_constructor(void *, void *, int); 351 static void rawip_conn_destructor(void *, void *); 352 353 static int rts_conn_constructor(void *, void *, int); 354 static void rts_conn_destructor(void *, void *); 355 356 static int ip_helper_stream_constructor(void *, void *, int); 357 static void ip_helper_stream_destructor(void *, void *); 358 359 boolean_t ip_use_helper_cache = B_TRUE; 360 361 #ifdef IPCL_DEBUG 362 #define INET_NTOA_BUFSIZE 18 363 364 static char * 365 inet_ntoa_r(uint32_t in, char *b) 366 { 367 unsigned char *p; 368 369 p = (unsigned char *)∈ 370 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 371 return (b); 372 } 373 #endif 374 375 /* 376 * Global (for all stack instances) init routine 377 */ 378 void 379 ipcl_g_init(void) 380 { 381 ip_conn_cache = kmem_cache_create("ip_conn_cache", 382 sizeof (conn_t), CACHE_ALIGN_SIZE, 383 ip_conn_constructor, ip_conn_destructor, 384 NULL, NULL, NULL, 0); 385 386 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 387 sizeof (itc_t) + sizeof (tcp_t), 
CACHE_ALIGN_SIZE, 388 tcp_conn_constructor, tcp_conn_destructor, 389 NULL, NULL, NULL, 0); 390 391 udp_conn_cache = kmem_cache_create("udp_conn_cache", 392 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 393 udp_conn_constructor, udp_conn_destructor, 394 NULL, NULL, NULL, 0); 395 396 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 397 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 398 rawip_conn_constructor, rawip_conn_destructor, 399 NULL, NULL, NULL, 0); 400 401 rts_conn_cache = kmem_cache_create("rts_conn_cache", 402 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 403 rts_conn_constructor, rts_conn_destructor, 404 NULL, NULL, NULL, 0); 405 406 if (ip_use_helper_cache) { 407 ip_helper_stream_cache = kmem_cache_create 408 ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), 409 CACHE_ALIGN_SIZE, ip_helper_stream_constructor, 410 ip_helper_stream_destructor, NULL, NULL, NULL, 0); 411 } else { 412 ip_helper_stream_cache = NULL; 413 } 414 } 415 416 /* 417 * ipclassifier intialization routine, sets up hash tables. 
 */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		/* Honor the legacy tunable if the new one is unset. */
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* Auto-size from available physical memory, capped. */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the requested size up to the nearest prime in P2Ps();
	 * the scan starts at index 9 (prime 383), the minimum size used.
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	/* Allocate each fanout table and init its per-bucket locks. */
	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Single bucket for routing-socket (RTS) clients. */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

/* Tear down the global (all-stacks) conn kmem caches set up in ipcl_g_init. */
void
ipcl_g_destroy(void)
{
	kmem_cache_destroy(ip_conn_cache);
	kmem_cache_destroy(tcp_conn_cache);
	kmem_cache_destroy(udp_conn_cache);
	kmem_cache_destroy(rawip_conn_cache);
	kmem_cache_destroy(rts_conn_cache);
}

/*
 * All user-level and kernel use of the stack must be gone
 * by now.
530 */ 531 void 532 ipcl_destroy(ip_stack_t *ipst) 533 { 534 int i; 535 536 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 537 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 538 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 539 } 540 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 541 sizeof (connf_t)); 542 ipst->ips_ipcl_conn_fanout = NULL; 543 544 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 545 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 546 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 547 } 548 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 549 sizeof (connf_t)); 550 ipst->ips_ipcl_bind_fanout = NULL; 551 552 for (i = 0; i < IPPROTO_MAX; i++) { 553 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 554 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 555 } 556 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 557 ipst->ips_ipcl_proto_fanout = NULL; 558 559 for (i = 0; i < IPPROTO_MAX; i++) { 560 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 561 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 562 } 563 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 564 IPPROTO_MAX * sizeof (connf_t)); 565 ipst->ips_ipcl_proto_fanout_v6 = NULL; 566 567 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 568 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 569 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 570 } 571 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 572 sizeof (connf_t)); 573 ipst->ips_ipcl_udp_fanout = NULL; 574 575 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 576 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 577 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 578 } 579 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 580 sizeof (connf_t)); 581 ipst->ips_ipcl_raw_fanout = NULL; 582 583 for (i 
= 0; i < CONN_G_HASH_SIZE; i++) { 584 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 585 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 586 } 587 kmem_free(ipst->ips_ipcl_globalhash_fanout, 588 sizeof (connf_t) * CONN_G_HASH_SIZE); 589 ipst->ips_ipcl_globalhash_fanout = NULL; 590 591 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 592 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 593 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 594 ipst->ips_rts_clients = NULL; 595 } 596 597 /* 598 * conn creation routine. initialize the conn, sets the reference 599 * and inserts it in the global hash table. 600 */ 601 conn_t * 602 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 603 { 604 conn_t *connp; 605 sctp_stack_t *sctps; 606 struct kmem_cache *conn_cache; 607 608 switch (type) { 609 case IPCL_SCTPCONN: 610 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 611 return (NULL); 612 sctp_conn_init(connp); 613 sctps = ns->netstack_sctp; 614 SCTP_G_Q_REFHOLD(sctps); 615 netstack_hold(ns); 616 connp->conn_netstack = ns; 617 return (connp); 618 619 case IPCL_TCPCONN: 620 conn_cache = tcp_conn_cache; 621 break; 622 623 case IPCL_UDPCONN: 624 conn_cache = udp_conn_cache; 625 break; 626 627 case IPCL_RAWIPCONN: 628 conn_cache = rawip_conn_cache; 629 break; 630 631 case IPCL_RTSCONN: 632 conn_cache = rts_conn_cache; 633 break; 634 635 case IPCL_IPCCONN: 636 conn_cache = ip_conn_cache; 637 break; 638 639 default: 640 connp = NULL; 641 ASSERT(0); 642 } 643 644 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 645 return (NULL); 646 647 connp->conn_ref = 1; 648 netstack_hold(ns); 649 connp->conn_netstack = ns; 650 ipcl_globalhash_insert(connp); 651 return (connp); 652 } 653 654 void 655 ipcl_conn_destroy(conn_t *connp) 656 { 657 mblk_t *mp; 658 netstack_t *ns = connp->conn_netstack; 659 660 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 661 ASSERT(connp->conn_ref == 0); 662 ASSERT(connp->conn_ire_cache == NULL); 663 

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	/* Drop peer credential only when distinct from conn_cred. */
	if (connp->conn_peercred != NULL &&
	    connp->conn_peercred != connp->conn_cred)
		crfree(connp->conn_peercred);
	connp->conn_peercred = NULL;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		if (tcps != NULL) {
			/* Release IPsec state while the stack ref is held. */
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		/*
		 * NOTE(review): tcp fields are still read after tcp_free(),
		 * which implies tcp_free() releases internal resources but
		 * not the tcp_t itself (it lives in the conn cache object)
		 * -- confirm against tcp_free()'s contract.
		 */
		tcp_free(tcp);
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		if (tcp->tcp_iphc != NULL) {
			/* Grown headers were kmem_alloc'd, not cached. */
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		/*
		 * Reset the conn and tcp_t for cache reuse, preserving the
		 * timer mblk and the tcp->conn back-pointer.
		 */
		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}
	/* Non-TCP path: release IPsec state and any cached option mblk. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}

	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {

		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}

/*
 * Running in cluster mode - deregister listener information
 */

static void
ipcl_conn_unlisten(conn_t *connp)
{
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
	ASSERT(connp->conn_lport != 0);

	if (cl_inet_unlisten != NULL) {
		sa_family_t	addr_family;
		uint8_t		*laddrp;

		if (connp->conn_pkt_isv6) {
			addr_family = AF_INET6;
			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
		} else {
			addr_family = AF_INET;
			laddrp = (uint8_t *)&connp->conn_bound_source;
		}
		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
		    connp->conn_lport);
	}
	connp->conn_flags &= ~IPCL_CL_LISTENER;
}

/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/* Remove 'connp' from whatever fanout table it is currently hashed in. */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}

/*
 * The whole purpose of this function is allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/* Drop the hash list's reference; the caller holds the last one. */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}

/* Insert at the head of 'connfp'; caller already holds connf_lock. */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp)	{	\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp)	{		\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a bound conn after all fully-specified entries but before any
 * wildcard (INADDR_ANY) entries, so lookups prefer the most specific match.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp)	{			\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard-bound conn; a v4-mapped wildcard is placed ahead of
 * any unspecified-address entry in the same zone.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp)	{		\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/* Exported wrapper around IPCL_HASH_INSERT_WILDCARD. */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/* Insert 'connp' into the IPv4 protocol fanout for 'protocol'. */
void
ipcl_proto_insert(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

connp->conn_ulp = protocol; 992 993 /* Insert it in the protocol hash */ 994 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 995 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 996 } 997 998 void 999 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 1000 { 1001 connf_t *connfp; 1002 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1003 1004 ASSERT(connp != NULL); 1005 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 1006 protocol == IPPROTO_ESP); 1007 1008 connp->conn_ulp = protocol; 1009 1010 /* Insert it in the Bind Hash */ 1011 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1012 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1013 } 1014 1015 /* 1016 * This function is used only for inserting SCTP raw socket now. 1017 * This may change later. 1018 * 1019 * Note that only one raw socket can be bound to a port. The param 1020 * lport is in network byte order. 1021 */ 1022 static int 1023 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1024 { 1025 connf_t *connfp; 1026 conn_t *oconnp; 1027 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1028 1029 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1030 1031 /* Check for existing raw socket already bound to the port. 
*/ 1032 mutex_enter(&connfp->connf_lock); 1033 for (oconnp = connfp->connf_head; oconnp != NULL; 1034 oconnp = oconnp->conn_next) { 1035 if (oconnp->conn_lport == lport && 1036 oconnp->conn_zoneid == connp->conn_zoneid && 1037 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1038 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1039 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1040 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1041 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1042 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1043 &connp->conn_srcv6))) { 1044 break; 1045 } 1046 } 1047 mutex_exit(&connfp->connf_lock); 1048 if (oconnp != NULL) 1049 return (EADDRNOTAVAIL); 1050 1051 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1052 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1053 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1054 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1055 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1056 } else { 1057 IPCL_HASH_INSERT_BOUND(connfp, connp); 1058 } 1059 } else { 1060 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1061 } 1062 return (0); 1063 } 1064 1065 /* 1066 * Check for a MAC exemption conflict on a labeled system. Note that for 1067 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1068 * transport layer. This check is for binding all other protocols. 1069 * 1070 * Returns true if there's a conflict. 
1071 */ 1072 static boolean_t 1073 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1074 { 1075 connf_t *connfp; 1076 conn_t *tconn; 1077 1078 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1079 mutex_enter(&connfp->connf_lock); 1080 for (tconn = connfp->connf_head; tconn != NULL; 1081 tconn = tconn->conn_next) { 1082 /* We don't allow v4 fallback for v6 raw socket */ 1083 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1084 continue; 1085 /* If neither is exempt, then there's no conflict */ 1086 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1087 continue; 1088 /* If both are bound to different specific addrs, ok */ 1089 if (connp->conn_src != INADDR_ANY && 1090 tconn->conn_src != INADDR_ANY && 1091 connp->conn_src != tconn->conn_src) 1092 continue; 1093 /* These two conflict; fail */ 1094 break; 1095 } 1096 mutex_exit(&connfp->connf_lock); 1097 return (tconn != NULL); 1098 } 1099 1100 static boolean_t 1101 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1102 { 1103 connf_t *connfp; 1104 conn_t *tconn; 1105 1106 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1107 mutex_enter(&connfp->connf_lock); 1108 for (tconn = connfp->connf_head; tconn != NULL; 1109 tconn = tconn->conn_next) { 1110 /* We don't allow v4 fallback for v6 raw socket */ 1111 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1112 continue; 1113 /* If neither is exempt, then there's no conflict */ 1114 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1115 continue; 1116 /* If both are bound to different addrs, ok */ 1117 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1118 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1119 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1120 continue; 1121 /* These two conflict; fail */ 1122 break; 1123 } 1124 mutex_exit(&connfp->connf_lock); 1125 return (tconn != NULL); 1126 } 1127 1128 /* 1129 * (v4, v6) bind hash insertion routines 1130 */ 1131 int 1132 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1133 { 1134 connf_t *connfp; 1135 #ifdef IPCL_DEBUG 1136 char buf[INET_NTOA_BUFSIZE]; 1137 #endif 1138 int ret = 0; 1139 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1140 1141 ASSERT(connp); 1142 1143 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1144 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1145 1146 connp->conn_ulp = protocol; 1147 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1148 connp->conn_lport = lport; 1149 1150 switch (protocol) { 1151 default: 1152 if (is_system_labeled() && 1153 check_exempt_conflict_v4(connp, ipst)) 1154 return (EADDRINUSE); 1155 /* FALLTHROUGH */ 1156 case IPPROTO_UDP: 1157 if (protocol == IPPROTO_UDP) { 1158 IPCL_DEBUG_LVL(64, 1159 ("ipcl_bind_insert: connp %p - udp\n", 1160 (void *)connp)); 1161 connfp = &ipst->ips_ipcl_udp_fanout[ 1162 IPCL_UDP_HASH(lport, ipst)]; 1163 } else { 1164 IPCL_DEBUG_LVL(64, 1165 ("ipcl_bind_insert: connp %p - protocol\n", 1166 (void *)connp)); 1167 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1168 } 1169 1170 if (connp->conn_rem != INADDR_ANY) { 1171 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1172 } else if (connp->conn_src != INADDR_ANY) { 1173 IPCL_HASH_INSERT_BOUND(connfp, connp); 1174 } else { 1175 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1176 } 1177 break; 1178 1179 case IPPROTO_TCP: 1180 1181 /* Insert it in the Bind Hash */ 1182 ASSERT(connp->conn_zoneid != ALL_ZONES); 1183 connfp = &ipst->ips_ipcl_bind_fanout[ 1184 IPCL_BIND_HASH(lport, ipst)]; 1185 if (connp->conn_src != INADDR_ANY) { 1186 IPCL_HASH_INSERT_BOUND(connfp, connp); 1187 } else { 1188 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1189 } 1190 if (cl_inet_listen != NULL) { 1191 ASSERT(!connp->conn_pkt_isv6); 1192 connp->conn_flags |= IPCL_CL_LISTENER; 1193 (*cl_inet_listen)(IPPROTO_TCP, AF_INET, 1194 (uint8_t *)&connp->conn_bound_source, lport); 1195 } 1196 break; 1197 1198 case IPPROTO_SCTP: 1199 ret = ipcl_sctp_hash_insert(connp, lport); 1200 
break; 1201 } 1202 1203 return (ret); 1204 } 1205 1206 int 1207 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1208 uint16_t lport) 1209 { 1210 connf_t *connfp; 1211 int ret = 0; 1212 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1213 1214 ASSERT(connp); 1215 1216 connp->conn_ulp = protocol; 1217 connp->conn_srcv6 = *src; 1218 connp->conn_lport = lport; 1219 1220 switch (protocol) { 1221 default: 1222 if (is_system_labeled() && 1223 check_exempt_conflict_v6(connp, ipst)) 1224 return (EADDRINUSE); 1225 /* FALLTHROUGH */ 1226 case IPPROTO_UDP: 1227 if (protocol == IPPROTO_UDP) { 1228 IPCL_DEBUG_LVL(128, 1229 ("ipcl_bind_insert_v6: connp %p - udp\n", 1230 (void *)connp)); 1231 connfp = &ipst->ips_ipcl_udp_fanout[ 1232 IPCL_UDP_HASH(lport, ipst)]; 1233 } else { 1234 IPCL_DEBUG_LVL(128, 1235 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1236 (void *)connp)); 1237 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1238 } 1239 1240 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1241 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1242 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1243 IPCL_HASH_INSERT_BOUND(connfp, connp); 1244 } else { 1245 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1246 } 1247 break; 1248 1249 case IPPROTO_TCP: 1250 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1251 1252 /* Insert it in the Bind Hash */ 1253 ASSERT(connp->conn_zoneid != ALL_ZONES); 1254 connfp = &ipst->ips_ipcl_bind_fanout[ 1255 IPCL_BIND_HASH(lport, ipst)]; 1256 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1257 IPCL_HASH_INSERT_BOUND(connfp, connp); 1258 } else { 1259 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1260 } 1261 if (cl_inet_listen != NULL) { 1262 sa_family_t addr_family; 1263 uint8_t *laddrp; 1264 1265 if (connp->conn_pkt_isv6) { 1266 addr_family = AF_INET6; 1267 laddrp = 1268 (uint8_t *)&connp->conn_bound_source_v6; 1269 } else { 1270 addr_family = AF_INET; 1271 laddrp = (uint8_t *)&connp->conn_bound_source; 1272 } 1273 connp->conn_flags |= IPCL_CL_LISTENER; 1274 (*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp, 1275 lport); 1276 } 1277 break; 1278 1279 case IPPROTO_SCTP: 1280 ret = ipcl_sctp_hash_insert(connp, lport); 1281 break; 1282 } 1283 1284 return (ret); 1285 } 1286 1287 /* 1288 * ipcl_conn_hash insertion routines. 1289 */ 1290 int 1291 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1292 ipaddr_t rem, uint32_t ports) 1293 { 1294 connf_t *connfp; 1295 uint16_t *up; 1296 conn_t *tconnp; 1297 #ifdef IPCL_DEBUG 1298 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1299 #endif 1300 in_port_t lport; 1301 int ret = 0; 1302 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1303 1304 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1305 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1306 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1307 ports, protocol)); 1308 1309 switch (protocol) { 1310 case IPPROTO_TCP: 1311 if (!(connp->conn_flags & IPCL_EAGER)) { 1312 /* 1313 * for a eager connection, i.e connections which 1314 * have just been created, the initialization is 1315 * already done in ip at conn_creation time, so 1316 * we can skip the checks here. 
1317 */ 1318 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1319 } 1320 connfp = &ipst->ips_ipcl_conn_fanout[ 1321 IPCL_CONN_HASH(connp->conn_rem, 1322 connp->conn_ports, ipst)]; 1323 mutex_enter(&connfp->connf_lock); 1324 for (tconnp = connfp->connf_head; tconnp != NULL; 1325 tconnp = tconnp->conn_next) { 1326 if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1327 connp->conn_rem, connp->conn_src, 1328 connp->conn_ports)) { 1329 1330 /* Already have a conn. bail out */ 1331 mutex_exit(&connfp->connf_lock); 1332 return (EADDRINUSE); 1333 } 1334 } 1335 if (connp->conn_fanout != NULL) { 1336 /* 1337 * Probably a XTI/TLI application trying to do a 1338 * rebind. Let it happen. 1339 */ 1340 mutex_exit(&connfp->connf_lock); 1341 IPCL_HASH_REMOVE(connp); 1342 mutex_enter(&connfp->connf_lock); 1343 } 1344 1345 ASSERT(connp->conn_recv != NULL); 1346 1347 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1348 mutex_exit(&connfp->connf_lock); 1349 break; 1350 1351 case IPPROTO_SCTP: 1352 /* 1353 * The raw socket may have already been bound, remove it 1354 * from the hash first. 1355 */ 1356 IPCL_HASH_REMOVE(connp); 1357 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1358 ret = ipcl_sctp_hash_insert(connp, lport); 1359 break; 1360 1361 default: 1362 /* 1363 * Check for conflicts among MAC exempt bindings. For 1364 * transports with port numbers, this is done by the upper 1365 * level per-transport binding logic. For all others, it's 1366 * done here. 
1367 */ 1368 if (is_system_labeled() && 1369 check_exempt_conflict_v4(connp, ipst)) 1370 return (EADDRINUSE); 1371 /* FALLTHROUGH */ 1372 1373 case IPPROTO_UDP: 1374 up = (uint16_t *)&ports; 1375 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1376 if (protocol == IPPROTO_UDP) { 1377 connfp = &ipst->ips_ipcl_udp_fanout[ 1378 IPCL_UDP_HASH(up[1], ipst)]; 1379 } else { 1380 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1381 } 1382 1383 if (connp->conn_rem != INADDR_ANY) { 1384 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1385 } else if (connp->conn_src != INADDR_ANY) { 1386 IPCL_HASH_INSERT_BOUND(connfp, connp); 1387 } else { 1388 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1389 } 1390 break; 1391 } 1392 1393 return (ret); 1394 } 1395 1396 int 1397 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1398 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1399 { 1400 connf_t *connfp; 1401 uint16_t *up; 1402 conn_t *tconnp; 1403 in_port_t lport; 1404 int ret = 0; 1405 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1406 1407 switch (protocol) { 1408 case IPPROTO_TCP: 1409 /* Just need to insert a conn struct */ 1410 if (!(connp->conn_flags & IPCL_EAGER)) { 1411 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1412 } 1413 connfp = &ipst->ips_ipcl_conn_fanout[ 1414 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1415 ipst)]; 1416 mutex_enter(&connfp->connf_lock); 1417 for (tconnp = connfp->connf_head; tconnp != NULL; 1418 tconnp = tconnp->conn_next) { 1419 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1420 connp->conn_remv6, connp->conn_srcv6, 1421 connp->conn_ports) && 1422 (tconnp->conn_tcp->tcp_bound_if == 0 || 1423 tconnp->conn_tcp->tcp_bound_if == ifindex)) { 1424 /* Already have a conn. bail out */ 1425 mutex_exit(&connfp->connf_lock); 1426 return (EADDRINUSE); 1427 } 1428 } 1429 if (connp->conn_fanout != NULL) { 1430 /* 1431 * Probably a XTI/TLI application trying to do a 1432 * rebind. Let it happen. 
1433 */ 1434 mutex_exit(&connfp->connf_lock); 1435 IPCL_HASH_REMOVE(connp); 1436 mutex_enter(&connfp->connf_lock); 1437 } 1438 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1439 mutex_exit(&connfp->connf_lock); 1440 break; 1441 1442 case IPPROTO_SCTP: 1443 IPCL_HASH_REMOVE(connp); 1444 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1445 ret = ipcl_sctp_hash_insert(connp, lport); 1446 break; 1447 1448 default: 1449 if (is_system_labeled() && 1450 check_exempt_conflict_v6(connp, ipst)) 1451 return (EADDRINUSE); 1452 /* FALLTHROUGH */ 1453 case IPPROTO_UDP: 1454 up = (uint16_t *)&ports; 1455 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1456 if (protocol == IPPROTO_UDP) { 1457 connfp = &ipst->ips_ipcl_udp_fanout[ 1458 IPCL_UDP_HASH(up[1], ipst)]; 1459 } else { 1460 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1461 } 1462 1463 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1464 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1465 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1466 IPCL_HASH_INSERT_BOUND(connfp, connp); 1467 } else { 1468 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1469 } 1470 break; 1471 } 1472 1473 return (ret); 1474 } 1475 1476 /* 1477 * v4 packet classifying function. looks up the fanout table to 1478 * find the conn, the packet belongs to. returns the conn with 1479 * the reference held, null otherwise. 1480 * 1481 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1482 * Lookup" comment block are applied. Labels are also checked as described 1483 * above. If the packet is from the inside (looped back), and is from the same 1484 * zone, then label checks are omitted. 
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* Ports follow the IP header at the same offset for TCP and UDP. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		/* First try the fully-bound (connected) TCP fanout. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		/* No connected match; fall back to the bind (listener) hash. */
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(
			    tx__ip__log__info__classify__tcp,
			    char *,
			    "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Same label validation as the TCP case above. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * IPv6 analogue of ipcl_classify_v4(): map an IPv6 packet to its conn_t.
 * Returns the conn with a reference held, or NULL if no match.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t	*tcph;
	uint32_t	ports;
	conn_t	*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		/* First try the fully-bound (connected) TCP fanout. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		/* No connected match; fall back to the bind (listener) hash. */
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/* Label validation; see the v4 TCP case for the rationale. */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
1871 */ 1872 conn_t * 1873 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) 1874 { 1875 uint16_t hdr_len; 1876 ipha_t *ipha; 1877 uint8_t *nexthdrp; 1878 1879 if (MBLKL(mp) < sizeof (ipha_t)) 1880 return (NULL); 1881 1882 switch (IPH_HDR_VERSION(mp->b_rptr)) { 1883 case IPV4_VERSION: 1884 ipha = (ipha_t *)mp->b_rptr; 1885 hdr_len = IPH_HDR_LENGTH(ipha); 1886 return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, 1887 zoneid, ipst)); 1888 case IPV6_VERSION: 1889 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 1890 &hdr_len, &nexthdrp)) 1891 return (NULL); 1892 1893 return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); 1894 } 1895 1896 return (NULL); 1897 } 1898 1899 conn_t * 1900 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, 1901 uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) 1902 { 1903 connf_t *connfp; 1904 conn_t *connp; 1905 in_port_t lport; 1906 int af; 1907 boolean_t shared_addr; 1908 boolean_t unlabeled; 1909 const void *dst; 1910 1911 lport = ((uint16_t *)&ports)[1]; 1912 1913 unlabeled = B_FALSE; 1914 /* Cred can be null on IPv6 */ 1915 if (is_system_labeled()) { 1916 cred_t *cr = DB_CRED(mp); 1917 1918 unlabeled = (cr != NULL && 1919 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1920 } 1921 shared_addr = (zoneid == ALL_ZONES); 1922 if (shared_addr) { 1923 /* 1924 * No need to handle exclusive-stack zones since ALL_ZONES 1925 * only applies to the shared stack. 1926 */ 1927 zoneid = tsol_mlp_findzone(protocol, lport); 1928 /* 1929 * If no shared MLP is found, tsol_mlp_findzone returns 1930 * ALL_ZONES. In that case, we assume it's SLP, and search for 1931 * the zone based on the packet label. 1932 * 1933 * If there is such a zone, we prefer to find a connection in 1934 * it. Otherwise, we look for a MAC-exempt connection in any 1935 * zone whose label dominates the default label on the packet. 
1936 */ 1937 if (zoneid == ALL_ZONES) 1938 zoneid = tsol_packet_to_zoneid(mp); 1939 else 1940 unlabeled = B_FALSE; 1941 } 1942 1943 af = IPH_HDR_VERSION(hdr); 1944 dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : 1945 (const void *)&((ip6_t *)hdr)->ip6_dst; 1946 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1947 1948 mutex_enter(&connfp->connf_lock); 1949 for (connp = connfp->connf_head; connp != NULL; 1950 connp = connp->conn_next) { 1951 /* We don't allow v4 fallback for v6 raw socket. */ 1952 if (af == (connp->conn_af_isv6 ? IPV4_VERSION : 1953 IPV6_VERSION)) 1954 continue; 1955 if (connp->conn_fully_bound) { 1956 if (af == IPV4_VERSION) { 1957 if (!IPCL_CONN_MATCH(connp, protocol, 1958 hdr->ipha_src, hdr->ipha_dst, ports)) 1959 continue; 1960 } else { 1961 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1962 ((ip6_t *)hdr)->ip6_src, 1963 ((ip6_t *)hdr)->ip6_dst, ports)) 1964 continue; 1965 } 1966 } else { 1967 if (af == IPV4_VERSION) { 1968 if (!IPCL_BIND_MATCH(connp, protocol, 1969 hdr->ipha_dst, lport)) 1970 continue; 1971 } else { 1972 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1973 ((ip6_t *)hdr)->ip6_dst, lport)) 1974 continue; 1975 } 1976 } 1977 1978 if (IPCL_ZONE_MATCH(connp, zoneid) || 1979 (unlabeled && connp->conn_mac_exempt)) 1980 break; 1981 } 1982 /* 1983 * If the connection is fully-bound and connection-oriented (TCP or 1984 * SCTP), then we've already validated the remote system's label. 1985 * There's no need to do it again for every packet. 
1986 */ 1987 if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || 1988 !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && 1989 !tsol_receive_local(mp, dst, af, shared_addr, connp)) { 1990 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1991 char *, "connp(1) could not receive mp(2)", 1992 conn_t *, connp, mblk_t *, mp); 1993 connp = NULL; 1994 } 1995 1996 if (connp != NULL) 1997 goto found; 1998 mutex_exit(&connfp->connf_lock); 1999 2000 /* Try to look for a wildcard match. */ 2001 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 2002 mutex_enter(&connfp->connf_lock); 2003 for (connp = connfp->connf_head; connp != NULL; 2004 connp = connp->conn_next) { 2005 /* We don't allow v4 fallback for v6 raw socket. */ 2006 if ((af == (connp->conn_af_isv6 ? IPV4_VERSION : 2007 IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { 2008 continue; 2009 } 2010 if (af == IPV4_VERSION) { 2011 if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) 2012 break; 2013 } else { 2014 if (IPCL_RAW_MATCH_V6(connp, protocol, 2015 ((ip6_t *)hdr)->ip6_dst)) { 2016 break; 2017 } 2018 } 2019 } 2020 2021 if (connp != NULL) 2022 goto found; 2023 2024 mutex_exit(&connfp->connf_lock); 2025 return (NULL); 2026 2027 found: 2028 ASSERT(connp != NULL); 2029 CONN_INC_REF(connp); 2030 mutex_exit(&connfp->connf_lock); 2031 return (connp); 2032 } 2033 2034 /* ARGSUSED */ 2035 static int 2036 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2037 { 2038 itc_t *itc = (itc_t *)buf; 2039 conn_t *connp = &itc->itc_conn; 2040 tcp_t *tcp = (tcp_t *)&itc[1]; 2041 2042 bzero(connp, sizeof (conn_t)); 2043 bzero(tcp, sizeof (tcp_t)); 2044 2045 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2046 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2047 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 2048 tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); 2049 connp->conn_tcp = tcp; 2050 connp->conn_flags = IPCL_TCPCONN; 2051 connp->conn_ulp = IPPROTO_TCP; 2052 
tcp->tcp_connp = connp; 2053 return (0); 2054 } 2055 2056 /* ARGSUSED */ 2057 static void 2058 tcp_conn_destructor(void *buf, void *cdrarg) 2059 { 2060 itc_t *itc = (itc_t *)buf; 2061 conn_t *connp = &itc->itc_conn; 2062 tcp_t *tcp = (tcp_t *)&itc[1]; 2063 2064 ASSERT(connp->conn_flags & IPCL_TCPCONN); 2065 ASSERT(tcp->tcp_connp == connp); 2066 ASSERT(connp->conn_tcp == tcp); 2067 tcp_timermp_free(tcp); 2068 mutex_destroy(&connp->conn_lock); 2069 cv_destroy(&connp->conn_cv); 2070 cv_destroy(&connp->conn_sq_cv); 2071 } 2072 2073 /* ARGSUSED */ 2074 static int 2075 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2076 { 2077 itc_t *itc = (itc_t *)buf; 2078 conn_t *connp = &itc->itc_conn; 2079 2080 bzero(connp, sizeof (conn_t)); 2081 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2082 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2083 connp->conn_flags = IPCL_IPCCONN; 2084 2085 return (0); 2086 } 2087 2088 /* ARGSUSED */ 2089 static void 2090 ip_conn_destructor(void *buf, void *cdrarg) 2091 { 2092 itc_t *itc = (itc_t *)buf; 2093 conn_t *connp = &itc->itc_conn; 2094 2095 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2096 ASSERT(connp->conn_priv == NULL); 2097 mutex_destroy(&connp->conn_lock); 2098 cv_destroy(&connp->conn_cv); 2099 } 2100 2101 /* ARGSUSED */ 2102 static int 2103 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2104 { 2105 itc_t *itc = (itc_t *)buf; 2106 conn_t *connp = &itc->itc_conn; 2107 udp_t *udp = (udp_t *)&itc[1]; 2108 2109 bzero(connp, sizeof (conn_t)); 2110 bzero(udp, sizeof (udp_t)); 2111 2112 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2113 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2114 connp->conn_udp = udp; 2115 connp->conn_flags = IPCL_UDPCONN; 2116 connp->conn_ulp = IPPROTO_UDP; 2117 udp->udp_connp = connp; 2118 return (0); 2119 } 2120 2121 /* ARGSUSED */ 2122 static void 2123 udp_conn_destructor(void *buf, void *cdrarg) 2124 { 2125 itc_t *itc = (itc_t *)buf; 2126 conn_t *connp = 
	    &itc->itc_conn;
	udp_t *udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor for raw IP (ICMP) conn_t's; the icmp_t follows
 * the conn_t in the cache buffer.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	icmp_t *icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_ulp = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	return (0);
}

/* kmem cache destructor paired with rawip_conn_constructor(). */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	icmp_t *icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor for routing-socket (RTS) conn_t's; the rts_t
 * follows the conn_t in the cache buffer.  Note that, unlike the
 * TCP/UDP/ICMP constructors above, no conn_ulp value is assigned here.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	rts_t *rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	return (0);
}

/* kmem cache destructor paired with rts_conn_constructor(). */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	rts_t *rts = (rts_t
	    *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor for IP helper streams: open the IP helper
 * device through LDI and pass this buffer as the stream's queue pointer
 * via the SIOCSQPTR ioctl.  Returns 0 on success, or the LDI error; the
 * handle is closed again if the ioctl fails.
 */
/* ARGSUSED */
int
ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
{
	int error;
	netstack_t *ns;
	int ret;
	tcp_stack_t *tcps;
	ip_helper_stream_info_t *ip_helper_str;
	ip_stack_t *ipst;

	/* Hold the caller's netstack; released at 'done' below. */
	ns = netstack_find_by_cred(kcred);
	ASSERT(ns != NULL);
	tcps = ns->netstack_tcp;
	ipst = ns->netstack_ip;
	ASSERT(tcps != NULL);
	ip_helper_str = (ip_helper_stream_info_t *)buf;

	error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
	    &ip_helper_str->ip_helper_stream_handle, ipst->ips_ldi_ident);
	if (error != 0) {
		goto done;
	}
	/* Associate this buffer with the helper stream's queues. */
	error = ldi_ioctl(ip_helper_str->ip_helper_stream_handle,
	    SIOCSQPTR, (intptr_t)buf, FKIOCTL, kcred, &ret);
	if (error != 0) {
		(void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0,
		    kcred);
	}
done:
	netstack_rele(ipst->ips_netstack);
	return (error);
}

/*
 * kmem cache destructor for IP helper streams: point the queues' q_ptr
 * back at the stream minfo, then close the LDI handle opened by the
 * constructor.
 */
/* ARGSUSED */
static void
ip_helper_stream_destructor(void *buf, void *cdrarg)
{
	ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;

	ip_helper_str->ip_helper_stream_rq->q_ptr =
	    ip_helper_str->ip_helper_stream_wq->q_ptr =
	    ip_helper_str->ip_helper_stream_minfo;
	(void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0, kcred);
}


/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ASSERT(connp->conn_ire_cache == NULL);
	ASSERT(connp->conn_latch == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
#ifdef notdef
	/*
	 * The ill and ipif pointers are not cleared before the conn_t
	 * goes away since they do not hold a reference on the ill/ipif.
	 * We should replace these pointers with ifindex/ipaddr_t to
	 * make the code less complex.
	 */
	ASSERT(connp->conn_xmit_if_ill == NULL);
	ASSERT(connp->conn_nofailover_ill == NULL);
	ASSERT(connp->conn_outgoing_ill == NULL);
	ASSERT(connp->conn_incoming_ill == NULL);
	ASSERT(connp->conn_outgoing_pill == NULL);
	ASSERT(connp->conn_multicast_ipif == NULL);
	ASSERT(connp->conn_multicast_ill == NULL);
#endif
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
	ASSERT(connp->conn_peercred == NULL);
	ASSERT(connp->conn_netstack == NULL);

	ASSERT(connp->conn_helper_info == NULL);
	/*
	 * Clear out the conn_t fields that are not preserved:
	 * everything from conn_start_clr to the end of the structure
	 * is zeroed here.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers.
 * The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round robin multi-list locks are held only to get the
 * next member/insertion/deletion and contention should be negligible
 * if the multi-list is much greater than the number of cpus.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int index;
	struct connf_s *connfp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.  The mask below assumes
	 * CONN_G_HASH_SIZE is a power of two.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}

/*
 * Unlink the conn from the global multi-list bucket it was inserted
 * into by ipcl_globalhash_insert(); a no-op if it was never inserted.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s *connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}

/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * IPCs may hold pointers to ipif/ill. To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list.
 * They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int i;
	conn_t *connp;
	conn_t *prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so the conn stays valid while
			 * func runs with the bucket lock dropped; the
			 * previous conn's reference is only dropped after
			 * func returns, keeping its conn_g_next usable.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.
	 * Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reversed lookup: the peer's foreign port is our local port. */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 * We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed lookup: the peer's foreign port is our local port. */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip tcp_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	/* Reversed lookup: the peer's foreign port is our local port. */
	pports = (uint16_t *)&ports;
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Only conns at or beyond min_state are considered. */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t *tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	/* Build the reversed port pair from the peer's TCP header. */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * tcp_bound_if == 0 means the conn is not bound to a
		 * particular interface, so it matches any ifindex.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tcp->tcp_bound_if == 0 ||
		    tcp->tcp_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Returns with a reference held; the
 * caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * NOTE(review): tcp_listener == NULL appears to select the
		 * listening endpoint itself rather than an eager conn —
		 * confirm against tcp.c.
		 */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Returns with a reference held; the
 * caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t *bind_connfp;
	conn_t *connp = NULL;
	tcp_t *tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* tcp_bound_if == 0: not bound to any one interface. */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_bound_if == 0 ||
		    tcp->tcp_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t *next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start from the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the reference the caller held on the conn just walked past. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	/* conn_trace_buf is a ring; wrap after CONN_TRACE_MAX entries. */
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/*
 * Record a stack trace for a refrele into the same ring buffer used by
 * conn_trace_ref() above.
 */
int
conn_untrace_ref(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	/* conn_trace_buf is a ring; wrap after CONN_TRACE_MAX entries. */
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif