1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 const char ipclassifier_version[] = "@(#)ipclassifier.c %I% %E% SMI"; 29 30 /* 31 * IP PACKET CLASSIFIER 32 * 33 * The IP packet classifier provides mapping between IP packets and persistent 34 * connection state for connection-oriented protocols. It also provides 35 * interface for managing connection states. 36 * 37 * The connection state is kept in conn_t data structure and contains, among 38 * other things: 39 * 40 * o local/remote address and ports 41 * o Transport protocol 42 * o squeue for the connection (for TCP only) 43 * o reference counter 44 * o Connection state 45 * o hash table linkage 46 * o interface/ire information 47 * o credentials 48 * o ipsec policy 49 * o send and receive functions. 50 * o mutex lock. 51 * 52 * Connections use a reference counting scheme. They are freed when the 53 * reference counter drops to zero. 
A reference is incremented when connection 54 * is placed in a list or table, when incoming packet for the connection arrives 55 * and when connection is processed via squeue (squeue processing may be 56 * asynchronous and the reference protects the connection from being destroyed 57 * before its processing is finished). 58 * 59 * send and receive functions are currently used for TCP only. The send function 60 * determines the IP entry point for the packet once it leaves TCP to be sent to 61 * the destination address. The receive function is used by IP when the packet 62 * should be passed for TCP processing. When a new connection is created these 63 * are set to ip_output() and tcp_input() respectively. During the lifetime of 64 * the connection the send and receive functions may change depending on the 65 * changes in the connection state. For example, Once the connection is bound to 66 * an addresse, the receive function for this connection is set to 67 * tcp_conn_request(). This allows incoming SYNs to go directly into the 68 * listener SYN processing function without going to tcp_input() first. 69 * 70 * Classifier uses several hash tables: 71 * 72 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 73 * ipcl_bind_fanout: contains all connections in BOUND state 74 * ipcl_proto_fanout: IPv4 protocol fanout 75 * ipcl_proto_fanout_v6: IPv6 protocol fanout 76 * ipcl_udp_fanout: contains all UDP connections 77 * ipcl_globalhash_fanout: contains all connections 78 * 79 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 80 * which need to view all existing connections. 81 * 82 * All tables are protected by per-bucket locks. When both per-bucket lock and 83 * connection lock need to be held, the per-bucket lock should be acquired 84 * first, followed by the connection lock. 85 * 86 * All functions doing search in one of these tables increment a reference 87 * counter on the connection found (if any). 
This reference should be dropped 88 * when the caller has finished processing the connection. 89 * 90 * 91 * INTERFACES: 92 * =========== 93 * 94 * Connection Lookup: 95 * ------------------ 96 * 97 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) 98 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) 99 * 100 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if 101 * it can't find any associated connection. If the connection is found, its 102 * reference counter is incremented. 103 * 104 * mp: mblock, containing packet header. The full header should fit 105 * into a single mblock. It should also contain at least full IP 106 * and TCP or UDP header. 107 * 108 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 109 * 110 * hdr_len: The size of IP header. It is used to find TCP or UDP header in 111 * the packet. 112 * 113 * zoneid: The zone in which the returned connection must be; the zoneid 114 * corresponding to the ire_zoneid on the IRE located for the 115 * packet's destination address. 116 * 117 * For TCP connections, the lookup order is as follows: 118 * 5-tuple {src, dst, protocol, local port, remote port} 119 * lookup in ipcl_conn_fanout table. 120 * 3-tuple {dst, remote port, protocol} lookup in 121 * ipcl_bind_fanout table. 122 * 123 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 124 * remote port} lookup is done on ipcl_udp_fanout. Note that, 125 * these interfaces do not handle cases where a packets belongs 126 * to multiple UDP clients, which is handled in IP itself. 127 * 128 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 129 * determine which actual zone gets the segment. This is used only in a 130 * labeled environment. The matching rules are: 131 * 132 * - If it's not a multilevel port, then the label on the packet selects 133 * the zone. Unlabeled packets are delivered to the global zone. 
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.  For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet.  For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set.  This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary.  (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout.  The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *		zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *		zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table.  For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available.  The 'func' is called as
 *	(*func)(connp, arg).  The walk is non-atomic so connections may be
 *	created and destroyed during the walk.  The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
223 * 224 * Connection Creation/Destruction 225 * ------------------------------- 226 * 227 * conn_t *ipcl_conn_create(type, sleep, netstack_t *) 228 * 229 * Creates a new conn based on the type flag, inserts it into 230 * globalhash table. 231 * 232 * type: This flag determines the type of conn_t which needs to be 233 * created i.e., which kmem_cache it comes from. 234 * IPCL_TCPCONN indicates a TCP connection 235 * IPCL_SCTPCONN indicates a SCTP connection 236 * IPCL_UDPCONN indicates a UDP conn_t. 237 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 238 * IPCL_RTSCONN indicates a RTS conn_t. 239 * IPCL_IPCCONN indicates all other connections. 240 * 241 * void ipcl_conn_destroy(connp) 242 * 243 * Destroys the connection state, removes it from the global 244 * connection hash table and frees its memory. 245 */ 246 247 #include <sys/types.h> 248 #include <sys/stream.h> 249 #include <sys/stropts.h> 250 #include <sys/sysmacros.h> 251 #include <sys/strsubr.h> 252 #include <sys/strsun.h> 253 #define _SUN_TPI_VERSION 2 254 #include <sys/ddi.h> 255 #include <sys/cmn_err.h> 256 #include <sys/debug.h> 257 258 #include <sys/systm.h> 259 #include <sys/param.h> 260 #include <sys/kmem.h> 261 #include <sys/isa_defs.h> 262 #include <inet/common.h> 263 #include <netinet/ip6.h> 264 #include <netinet/icmp6.h> 265 266 #include <inet/ip.h> 267 #include <inet/ip6.h> 268 #include <inet/tcp.h> 269 #include <inet/ip_ndp.h> 270 #include <inet/udp_impl.h> 271 #include <inet/sctp_ip.h> 272 #include <inet/sctp/sctp_impl.h> 273 #include <inet/rawip_impl.h> 274 #include <inet/rts_impl.h> 275 276 #include <sys/cpuvar.h> 277 278 #include <inet/ipclassifier.h> 279 #include <inet/ipsec_impl.h> 280 281 #include <sys/tsol/tnet.h> 282 283 #ifdef DEBUG 284 #define IPCL_DEBUG 285 #else 286 #undef IPCL_DEBUG 287 #endif 288 289 #ifdef IPCL_DEBUG 290 int ipcl_debug_level = 0; 291 #define IPCL_DEBUG_LVL(level, args) \ 292 if (ipcl_debug_level & level) { printf args; } 293 #else 294 #define 
IPCL_DEBUG_LVL(level, args) {; } 295 #endif 296 /* Old value for compatibility. Setable in /etc/system */ 297 uint_t tcp_conn_hash_size = 0; 298 299 /* New value. Zero means choose automatically. Setable in /etc/system */ 300 uint_t ipcl_conn_hash_size = 0; 301 uint_t ipcl_conn_hash_memfactor = 8192; 302 uint_t ipcl_conn_hash_maxsize = 82500; 303 304 /* bind/udp fanout table size */ 305 uint_t ipcl_bind_fanout_size = 512; 306 uint_t ipcl_udp_fanout_size = 16384; 307 308 /* Raw socket fanout size. Must be a power of 2. */ 309 uint_t ipcl_raw_fanout_size = 256; 310 311 /* 312 * Power of 2^N Primes useful for hashing for N of 0-28, 313 * these primes are the nearest prime <= 2^N - 2^(N-2). 314 */ 315 316 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 317 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 318 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 319 50331599, 100663291, 201326557, 0} 320 321 /* 322 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 323 * are aligned on cache lines. 
324 */ 325 typedef union itc_s { 326 conn_t itc_conn; 327 char itcu_filler[CACHE_ALIGN(conn_s)]; 328 } itc_t; 329 330 struct kmem_cache *tcp_conn_cache; 331 struct kmem_cache *ip_conn_cache; 332 extern struct kmem_cache *sctp_conn_cache; 333 extern struct kmem_cache *tcp_sack_info_cache; 334 extern struct kmem_cache *tcp_iphc_cache; 335 struct kmem_cache *udp_conn_cache; 336 struct kmem_cache *rawip_conn_cache; 337 struct kmem_cache *rts_conn_cache; 338 339 extern void tcp_timermp_free(tcp_t *); 340 extern mblk_t *tcp_timermp_alloc(int); 341 342 static int ip_conn_constructor(void *, void *, int); 343 static void ip_conn_destructor(void *, void *); 344 345 static int tcp_conn_constructor(void *, void *, int); 346 static void tcp_conn_destructor(void *, void *); 347 348 static int udp_conn_constructor(void *, void *, int); 349 static void udp_conn_destructor(void *, void *); 350 351 static int rawip_conn_constructor(void *, void *, int); 352 static void rawip_conn_destructor(void *, void *); 353 354 static int rts_conn_constructor(void *, void *, int); 355 static void rts_conn_destructor(void *, void *); 356 357 #ifdef IPCL_DEBUG 358 #define INET_NTOA_BUFSIZE 18 359 360 static char * 361 inet_ntoa_r(uint32_t in, char *b) 362 { 363 unsigned char *p; 364 365 p = (unsigned char *)∈ 366 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 367 return (b); 368 } 369 #endif 370 371 /* 372 * Global (for all stack instances) init routine 373 */ 374 void 375 ipcl_g_init(void) 376 { 377 ip_conn_cache = kmem_cache_create("ip_conn_cache", 378 sizeof (conn_t), CACHE_ALIGN_SIZE, 379 ip_conn_constructor, ip_conn_destructor, 380 NULL, NULL, NULL, 0); 381 382 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 383 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 384 tcp_conn_constructor, tcp_conn_destructor, 385 NULL, NULL, NULL, 0); 386 387 udp_conn_cache = kmem_cache_create("udp_conn_cache", 388 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 389 udp_conn_constructor, 
udp_conn_destructor, 390 NULL, NULL, NULL, 0); 391 392 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 393 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 394 rawip_conn_constructor, rawip_conn_destructor, 395 NULL, NULL, NULL, 0); 396 397 rts_conn_cache = kmem_cache_create("rts_conn_cache", 398 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 399 rts_conn_constructor, rts_conn_destructor, 400 NULL, NULL, NULL, 0); 401 } 402 403 /* 404 * ipclassifier intialization routine, sets up hash tables. 405 */ 406 void 407 ipcl_init(ip_stack_t *ipst) 408 { 409 int i; 410 int sizes[] = P2Ps(); 411 412 /* 413 * Calculate size of conn fanout table from /etc/system settings 414 */ 415 if (ipcl_conn_hash_size != 0) { 416 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 417 } else if (tcp_conn_hash_size != 0) { 418 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 419 } else { 420 extern pgcnt_t freemem; 421 422 ipst->ips_ipcl_conn_fanout_size = 423 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 424 425 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 426 ipst->ips_ipcl_conn_fanout_size = 427 ipcl_conn_hash_maxsize; 428 } 429 } 430 431 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 432 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 433 break; 434 } 435 } 436 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 437 /* Out of range, use the 2^16 value */ 438 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 439 } 440 441 /* Take values from /etc/system */ 442 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 443 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 444 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 445 446 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 447 448 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 449 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 450 451 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 452 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 
453 MUTEX_DEFAULT, NULL); 454 } 455 456 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 457 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 458 459 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 460 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 461 MUTEX_DEFAULT, NULL); 462 } 463 464 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 465 sizeof (connf_t), KM_SLEEP); 466 for (i = 0; i < IPPROTO_MAX; i++) { 467 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 468 MUTEX_DEFAULT, NULL); 469 } 470 471 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 472 sizeof (connf_t), KM_SLEEP); 473 for (i = 0; i < IPPROTO_MAX; i++) { 474 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 479 mutex_init(&ipst->ips_rts_clients->connf_lock, 480 NULL, MUTEX_DEFAULT, NULL); 481 482 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 483 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 484 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 485 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 486 MUTEX_DEFAULT, NULL); 487 } 488 489 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 490 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 491 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 492 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 493 MUTEX_DEFAULT, NULL); 494 } 495 496 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 497 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 498 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 499 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 500 NULL, MUTEX_DEFAULT, NULL); 501 } 502 } 503 504 void 505 ipcl_g_destroy(void) 506 { 507 kmem_cache_destroy(ip_conn_cache); 508 kmem_cache_destroy(tcp_conn_cache); 509 kmem_cache_destroy(udp_conn_cache); 510 kmem_cache_destroy(rawip_conn_cache); 511 kmem_cache_destroy(rts_conn_cache); 512 } 513 514 /* 
515 * All user-level and kernel use of the stack must be gone 516 * by now. 517 */ 518 void 519 ipcl_destroy(ip_stack_t *ipst) 520 { 521 int i; 522 523 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 524 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 525 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 526 } 527 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 528 sizeof (connf_t)); 529 ipst->ips_ipcl_conn_fanout = NULL; 530 531 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 532 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 533 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 534 } 535 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 536 sizeof (connf_t)); 537 ipst->ips_ipcl_bind_fanout = NULL; 538 539 for (i = 0; i < IPPROTO_MAX; i++) { 540 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 541 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 542 } 543 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 544 ipst->ips_ipcl_proto_fanout = NULL; 545 546 for (i = 0; i < IPPROTO_MAX; i++) { 547 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 548 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 549 } 550 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 551 IPPROTO_MAX * sizeof (connf_t)); 552 ipst->ips_ipcl_proto_fanout_v6 = NULL; 553 554 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 555 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 556 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 557 } 558 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 559 sizeof (connf_t)); 560 ipst->ips_ipcl_udp_fanout = NULL; 561 562 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 563 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 564 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 565 } 566 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 
567 sizeof (connf_t)); 568 ipst->ips_ipcl_raw_fanout = NULL; 569 570 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 571 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 572 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 573 } 574 kmem_free(ipst->ips_ipcl_globalhash_fanout, 575 sizeof (connf_t) * CONN_G_HASH_SIZE); 576 ipst->ips_ipcl_globalhash_fanout = NULL; 577 578 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 579 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 580 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 581 ipst->ips_rts_clients = NULL; 582 } 583 584 /* 585 * conn creation routine. initialize the conn, sets the reference 586 * and inserts it in the global hash table. 587 */ 588 conn_t * 589 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 590 { 591 conn_t *connp; 592 sctp_stack_t *sctps; 593 struct kmem_cache *conn_cache; 594 595 switch (type) { 596 case IPCL_SCTPCONN: 597 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 598 return (NULL); 599 sctp_conn_init(connp); 600 sctps = ns->netstack_sctp; 601 SCTP_G_Q_REFHOLD(sctps); 602 netstack_hold(ns); 603 connp->conn_netstack = ns; 604 return (connp); 605 606 case IPCL_TCPCONN: 607 conn_cache = tcp_conn_cache; 608 break; 609 610 case IPCL_UDPCONN: 611 conn_cache = udp_conn_cache; 612 break; 613 614 case IPCL_RAWIPCONN: 615 conn_cache = rawip_conn_cache; 616 break; 617 618 case IPCL_RTSCONN: 619 conn_cache = rts_conn_cache; 620 break; 621 622 case IPCL_IPCCONN: 623 conn_cache = ip_conn_cache; 624 break; 625 626 default: 627 connp = NULL; 628 ASSERT(0); 629 } 630 631 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 632 return (NULL); 633 634 connp->conn_ref = 1; 635 netstack_hold(ns); 636 connp->conn_netstack = ns; 637 ipcl_globalhash_insert(connp); 638 return (connp); 639 } 640 641 void 642 ipcl_conn_destroy(conn_t *connp) 643 { 644 mblk_t *mp; 645 netstack_t *ns = connp->conn_netstack; 646 647 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 648 
ASSERT(connp->conn_ref == 0); 649 ASSERT(connp->conn_ire_cache == NULL); 650 651 if (connp->conn_peercred != NULL && 652 connp->conn_peercred != connp->conn_cred) 653 crfree(connp->conn_peercred); 654 connp->conn_peercred = NULL; 655 656 if (connp->conn_cred != NULL) { 657 crfree(connp->conn_cred); 658 connp->conn_cred = NULL; 659 } 660 661 ipcl_globalhash_remove(connp); 662 663 /* FIXME: add separate tcp_conn_free()? */ 664 if (connp->conn_flags & IPCL_TCPCONN) { 665 tcp_t *tcp = connp->conn_tcp; 666 tcp_stack_t *tcps; 667 668 ASSERT(tcp != NULL); 669 tcps = tcp->tcp_tcps; 670 if (tcps != NULL) { 671 if (connp->conn_latch != NULL) { 672 IPLATCH_REFRELE(connp->conn_latch, ns); 673 connp->conn_latch = NULL; 674 } 675 if (connp->conn_policy != NULL) { 676 IPPH_REFRELE(connp->conn_policy, ns); 677 connp->conn_policy = NULL; 678 } 679 tcp->tcp_tcps = NULL; 680 TCPS_REFRELE(tcps); 681 } 682 683 tcp_free(tcp); 684 mp = tcp->tcp_timercache; 685 tcp->tcp_cred = NULL; 686 687 if (tcp->tcp_sack_info != NULL) { 688 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 689 kmem_cache_free(tcp_sack_info_cache, 690 tcp->tcp_sack_info); 691 } 692 if (tcp->tcp_iphc != NULL) { 693 if (tcp->tcp_hdr_grown) { 694 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 695 } else { 696 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 697 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 698 } 699 tcp->tcp_iphc_len = 0; 700 } 701 ASSERT(tcp->tcp_iphc_len == 0); 702 703 ASSERT(connp->conn_latch == NULL); 704 ASSERT(connp->conn_policy == NULL); 705 706 if (ns != NULL) { 707 ASSERT(tcp->tcp_tcps == NULL); 708 connp->conn_netstack = NULL; 709 netstack_rele(ns); 710 } 711 712 ipcl_conn_cleanup(connp); 713 connp->conn_flags = IPCL_TCPCONN; 714 bzero(tcp, sizeof (tcp_t)); 715 716 tcp->tcp_timercache = mp; 717 tcp->tcp_connp = connp; 718 kmem_cache_free(tcp_conn_cache, connp); 719 return; 720 } 721 if (connp->conn_latch != NULL) { 722 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 723 connp->conn_latch = 
NULL; 724 } 725 if (connp->conn_policy != NULL) { 726 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 727 connp->conn_policy = NULL; 728 } 729 if (connp->conn_ipsec_opt_mp != NULL) { 730 freemsg(connp->conn_ipsec_opt_mp); 731 connp->conn_ipsec_opt_mp = NULL; 732 } 733 734 if (connp->conn_flags & IPCL_SCTPCONN) { 735 ASSERT(ns != NULL); 736 sctp_free(connp); 737 return; 738 } 739 740 if (ns != NULL) { 741 connp->conn_netstack = NULL; 742 netstack_rele(ns); 743 } 744 ipcl_conn_cleanup(connp); 745 746 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */ 747 if (connp->conn_flags & IPCL_UDPCONN) { 748 connp->conn_flags = IPCL_UDPCONN; 749 kmem_cache_free(udp_conn_cache, connp); 750 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 751 connp->conn_flags = IPCL_RAWIPCONN; 752 connp->conn_ulp = IPPROTO_ICMP; 753 kmem_cache_free(rawip_conn_cache, connp); 754 } else if (connp->conn_flags & IPCL_RTSCONN) { 755 connp->conn_flags = IPCL_RTSCONN; 756 kmem_cache_free(rts_conn_cache, connp); 757 } else { 758 connp->conn_flags = IPCL_IPCCONN; 759 ASSERT(connp->conn_flags & IPCL_IPCCONN); 760 ASSERT(connp->conn_priv == NULL); 761 kmem_cache_free(ip_conn_cache, connp); 762 } 763 } 764 765 /* 766 * Running in cluster mode - deregister listener information 767 */ 768 769 static void 770 ipcl_conn_unlisten(conn_t *connp) 771 { 772 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 773 ASSERT(connp->conn_lport != 0); 774 775 if (cl_inet_unlisten != NULL) { 776 sa_family_t addr_family; 777 uint8_t *laddrp; 778 779 if (connp->conn_pkt_isv6) { 780 addr_family = AF_INET6; 781 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 782 } else { 783 addr_family = AF_INET; 784 laddrp = (uint8_t *)&connp->conn_bound_source; 785 } 786 (*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp, 787 connp->conn_lport); 788 } 789 connp->conn_flags &= ~IPCL_CL_LISTENER; 790 } 791 792 /* 793 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 794 * which table the conn 
belonged to). So for debugging we can see which hash 795 * table this connection was in. 796 */ 797 #define IPCL_HASH_REMOVE(connp) { \ 798 connf_t *connfp = (connp)->conn_fanout; \ 799 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 800 if (connfp != NULL) { \ 801 IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ 802 (void *)(connp))); \ 803 mutex_enter(&connfp->connf_lock); \ 804 if ((connp)->conn_next != NULL) \ 805 (connp)->conn_next->conn_prev = \ 806 (connp)->conn_prev; \ 807 if ((connp)->conn_prev != NULL) \ 808 (connp)->conn_prev->conn_next = \ 809 (connp)->conn_next; \ 810 else \ 811 connfp->connf_head = (connp)->conn_next; \ 812 (connp)->conn_fanout = NULL; \ 813 (connp)->conn_next = NULL; \ 814 (connp)->conn_prev = NULL; \ 815 (connp)->conn_flags |= IPCL_REMOVED; \ 816 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 817 ipcl_conn_unlisten((connp)); \ 818 CONN_DEC_REF((connp)); \ 819 mutex_exit(&connfp->connf_lock); \ 820 } \ 821 } 822 823 void 824 ipcl_hash_remove(conn_t *connp) 825 { 826 IPCL_HASH_REMOVE(connp); 827 } 828 829 /* 830 * The whole purpose of this function is allow removal of 831 * a conn_t from the connected hash for timewait reclaim. 832 * This is essentially a TW reclaim fastpath where timewait 833 * collector checks under fanout lock (so no one else can 834 * get access to the conn_t) that refcnt is 2 i.e. one for 835 * TCP and one for the classifier hash list. If ref count 836 * is indeed 2, we can just remove the conn under lock and 837 * avoid cleaning up the conn under squeue. This gives us 838 * improved performance. 
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	/* Unlink connp from the fanout bucket's doubly-linked list. */
	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		/* connp was at the head of the bucket. */
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * Drop the reference the fanout table held.  The caller is expected
	 * to hold the only other reference (hence the == 2 assertion).
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}

/*
 * Insert connp at the head of the connected-state fanout bucket.
 * Caller must hold (connfp)->connf_lock; a reference is taken on
 * behalf of the table via CONN_INC_REF.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

/*
 * Remove connp from whatever fanout it is currently on, then insert it
 * into (connfp) as a CONNECTED entry, taking the bucket lock itself.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert connp as a BOUND entry.  Bound (specific-address) entries are
 * kept ahead of any v4 wildcard entries in the bucket, so the insertion
 * point is just before the first _IPCL_V4_MATCH_ANY entry.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert connp as a wildcard BOUND entry.  A v4-mapped wildcard is
 * placed ahead of any v6 unspecified-address entry in the same zone so
 * that v4 wildcard matches are preferred; otherwise connp goes at the
 * tail of the bucket.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD for callers that
 * cannot use the macro directly.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * Insert connp into the IPv4 protocol fanout for the given protocol.
 * MAC-exempt conns are only allowed for AH and ESP.
 */
void
ipcl_proto_insert(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * IPv6 counterpart of ipcl_proto_insert(): insert connp into the v6
 * protocol fanout.
 */
void
ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the Bind Hash */
	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}

/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port.  The param
 * lport is in network byte order.
 *
 * Returns 0 on success, EADDRNOTAVAIL if another raw socket in the
 * same zone/address-family already overlaps this binding.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict if same port, zone and family, and either
		 * side is wildcard (unspecified / v4-mapped-any) or the
		 * two source addresses are identical.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/* Pick the hash flavor matching how fully the conn is specified. */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}

/*
 * Check for a MAC exemption conflict on a labeled system.  Note that for
 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
 * transport layer.  This check is for binding all other protocols.
 *
 * Returns true if there's a conflict.
1044 */ 1045 static boolean_t 1046 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1047 { 1048 connf_t *connfp; 1049 conn_t *tconn; 1050 1051 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1052 mutex_enter(&connfp->connf_lock); 1053 for (tconn = connfp->connf_head; tconn != NULL; 1054 tconn = tconn->conn_next) { 1055 /* We don't allow v4 fallback for v6 raw socket */ 1056 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1057 continue; 1058 /* If neither is exempt, then there's no conflict */ 1059 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1060 continue; 1061 /* If both are bound to different specific addrs, ok */ 1062 if (connp->conn_src != INADDR_ANY && 1063 tconn->conn_src != INADDR_ANY && 1064 connp->conn_src != tconn->conn_src) 1065 continue; 1066 /* These two conflict; fail */ 1067 break; 1068 } 1069 mutex_exit(&connfp->connf_lock); 1070 return (tconn != NULL); 1071 } 1072 1073 static boolean_t 1074 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1075 { 1076 connf_t *connfp; 1077 conn_t *tconn; 1078 1079 connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; 1080 mutex_enter(&connfp->connf_lock); 1081 for (tconn = connfp->connf_head; tconn != NULL; 1082 tconn = tconn->conn_next) { 1083 /* We don't allow v4 fallback for v6 raw socket */ 1084 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 1085 continue; 1086 /* If neither is exempt, then there's no conflict */ 1087 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 1088 continue; 1089 /* If both are bound to different addrs, ok */ 1090 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 1091 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 1092 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 1093 continue; 1094 /* These two conflict; fail */ 1095 break; 1096 } 1097 mutex_exit(&connfp->connf_lock); 1098 return (tconn != NULL); 1099 } 1100 1101 /* 1102 * (v4, v6) bind hash insertion routines 1103 */ 1104 int 1105 ipcl_bind_insert(conn_t *connp, uint8_t 
protocol, ipaddr_t src, uint16_t lport) 1106 { 1107 connf_t *connfp; 1108 #ifdef IPCL_DEBUG 1109 char buf[INET_NTOA_BUFSIZE]; 1110 #endif 1111 int ret = 0; 1112 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1113 1114 ASSERT(connp); 1115 1116 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 1117 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 1118 1119 connp->conn_ulp = protocol; 1120 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 1121 connp->conn_lport = lport; 1122 1123 switch (protocol) { 1124 default: 1125 if (is_system_labeled() && 1126 check_exempt_conflict_v4(connp, ipst)) 1127 return (EADDRINUSE); 1128 /* FALLTHROUGH */ 1129 case IPPROTO_UDP: 1130 if (protocol == IPPROTO_UDP) { 1131 IPCL_DEBUG_LVL(64, 1132 ("ipcl_bind_insert: connp %p - udp\n", 1133 (void *)connp)); 1134 connfp = &ipst->ips_ipcl_udp_fanout[ 1135 IPCL_UDP_HASH(lport, ipst)]; 1136 } else { 1137 IPCL_DEBUG_LVL(64, 1138 ("ipcl_bind_insert: connp %p - protocol\n", 1139 (void *)connp)); 1140 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1141 } 1142 1143 if (connp->conn_rem != INADDR_ANY) { 1144 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1145 } else if (connp->conn_src != INADDR_ANY) { 1146 IPCL_HASH_INSERT_BOUND(connfp, connp); 1147 } else { 1148 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1149 } 1150 break; 1151 1152 case IPPROTO_TCP: 1153 1154 /* Insert it in the Bind Hash */ 1155 ASSERT(connp->conn_zoneid != ALL_ZONES); 1156 connfp = &ipst->ips_ipcl_bind_fanout[ 1157 IPCL_BIND_HASH(lport, ipst)]; 1158 if (connp->conn_src != INADDR_ANY) { 1159 IPCL_HASH_INSERT_BOUND(connfp, connp); 1160 } else { 1161 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1162 } 1163 if (cl_inet_listen != NULL) { 1164 ASSERT(!connp->conn_pkt_isv6); 1165 connp->conn_flags |= IPCL_CL_LISTENER; 1166 (*cl_inet_listen)(IPPROTO_TCP, AF_INET, 1167 (uint8_t *)&connp->conn_bound_source, lport); 1168 } 1169 break; 1170 1171 case IPPROTO_SCTP: 1172 ret = ipcl_sctp_hash_insert(connp, lport); 1173 
break; 1174 } 1175 1176 return (ret); 1177 } 1178 1179 int 1180 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1181 uint16_t lport) 1182 { 1183 connf_t *connfp; 1184 int ret = 0; 1185 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1186 1187 ASSERT(connp); 1188 1189 connp->conn_ulp = protocol; 1190 connp->conn_srcv6 = *src; 1191 connp->conn_lport = lport; 1192 1193 switch (protocol) { 1194 default: 1195 if (is_system_labeled() && 1196 check_exempt_conflict_v6(connp, ipst)) 1197 return (EADDRINUSE); 1198 /* FALLTHROUGH */ 1199 case IPPROTO_UDP: 1200 if (protocol == IPPROTO_UDP) { 1201 IPCL_DEBUG_LVL(128, 1202 ("ipcl_bind_insert_v6: connp %p - udp\n", 1203 (void *)connp)); 1204 connfp = &ipst->ips_ipcl_udp_fanout[ 1205 IPCL_UDP_HASH(lport, ipst)]; 1206 } else { 1207 IPCL_DEBUG_LVL(128, 1208 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1209 (void *)connp)); 1210 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1211 } 1212 1213 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1214 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1215 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1216 IPCL_HASH_INSERT_BOUND(connfp, connp); 1217 } else { 1218 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1219 } 1220 break; 1221 1222 case IPPROTO_TCP: 1223 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? 
*/ 1224 1225 /* Insert it in the Bind Hash */ 1226 ASSERT(connp->conn_zoneid != ALL_ZONES); 1227 connfp = &ipst->ips_ipcl_bind_fanout[ 1228 IPCL_BIND_HASH(lport, ipst)]; 1229 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1230 IPCL_HASH_INSERT_BOUND(connfp, connp); 1231 } else { 1232 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1233 } 1234 if (cl_inet_listen != NULL) { 1235 sa_family_t addr_family; 1236 uint8_t *laddrp; 1237 1238 if (connp->conn_pkt_isv6) { 1239 addr_family = AF_INET6; 1240 laddrp = 1241 (uint8_t *)&connp->conn_bound_source_v6; 1242 } else { 1243 addr_family = AF_INET; 1244 laddrp = (uint8_t *)&connp->conn_bound_source; 1245 } 1246 connp->conn_flags |= IPCL_CL_LISTENER; 1247 (*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp, 1248 lport); 1249 } 1250 break; 1251 1252 case IPPROTO_SCTP: 1253 ret = ipcl_sctp_hash_insert(connp, lport); 1254 break; 1255 } 1256 1257 return (ret); 1258 } 1259 1260 /* 1261 * ipcl_conn_hash insertion routines. 1262 */ 1263 int 1264 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1265 ipaddr_t rem, uint32_t ports) 1266 { 1267 connf_t *connfp; 1268 uint16_t *up; 1269 conn_t *tconnp; 1270 #ifdef IPCL_DEBUG 1271 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1272 #endif 1273 in_port_t lport; 1274 int ret = 0; 1275 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1276 1277 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1278 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1279 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1280 ports, protocol)); 1281 1282 switch (protocol) { 1283 case IPPROTO_TCP: 1284 if (!(connp->conn_flags & IPCL_EAGER)) { 1285 /* 1286 * for a eager connection, i.e connections which 1287 * have just been created, the initialization is 1288 * already done in ip at conn_creation time, so 1289 * we can skip the checks here. 
1290 */ 1291 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1292 } 1293 connfp = &ipst->ips_ipcl_conn_fanout[ 1294 IPCL_CONN_HASH(connp->conn_rem, 1295 connp->conn_ports, ipst)]; 1296 mutex_enter(&connfp->connf_lock); 1297 for (tconnp = connfp->connf_head; tconnp != NULL; 1298 tconnp = tconnp->conn_next) { 1299 if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1300 connp->conn_rem, connp->conn_src, 1301 connp->conn_ports)) { 1302 1303 /* Already have a conn. bail out */ 1304 mutex_exit(&connfp->connf_lock); 1305 return (EADDRINUSE); 1306 } 1307 } 1308 if (connp->conn_fanout != NULL) { 1309 /* 1310 * Probably a XTI/TLI application trying to do a 1311 * rebind. Let it happen. 1312 */ 1313 mutex_exit(&connfp->connf_lock); 1314 IPCL_HASH_REMOVE(connp); 1315 mutex_enter(&connfp->connf_lock); 1316 } 1317 1318 ASSERT(connp->conn_recv != NULL); 1319 1320 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1321 mutex_exit(&connfp->connf_lock); 1322 break; 1323 1324 case IPPROTO_SCTP: 1325 /* 1326 * The raw socket may have already been bound, remove it 1327 * from the hash first. 1328 */ 1329 IPCL_HASH_REMOVE(connp); 1330 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1331 ret = ipcl_sctp_hash_insert(connp, lport); 1332 break; 1333 1334 default: 1335 /* 1336 * Check for conflicts among MAC exempt bindings. For 1337 * transports with port numbers, this is done by the upper 1338 * level per-transport binding logic. For all others, it's 1339 * done here. 
1340 */ 1341 if (is_system_labeled() && 1342 check_exempt_conflict_v4(connp, ipst)) 1343 return (EADDRINUSE); 1344 /* FALLTHROUGH */ 1345 1346 case IPPROTO_UDP: 1347 up = (uint16_t *)&ports; 1348 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1349 if (protocol == IPPROTO_UDP) { 1350 connfp = &ipst->ips_ipcl_udp_fanout[ 1351 IPCL_UDP_HASH(up[1], ipst)]; 1352 } else { 1353 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 1354 } 1355 1356 if (connp->conn_rem != INADDR_ANY) { 1357 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1358 } else if (connp->conn_src != INADDR_ANY) { 1359 IPCL_HASH_INSERT_BOUND(connfp, connp); 1360 } else { 1361 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1362 } 1363 break; 1364 } 1365 1366 return (ret); 1367 } 1368 1369 int 1370 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1371 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1372 { 1373 connf_t *connfp; 1374 uint16_t *up; 1375 conn_t *tconnp; 1376 in_port_t lport; 1377 int ret = 0; 1378 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1379 1380 switch (protocol) { 1381 case IPPROTO_TCP: 1382 /* Just need to insert a conn struct */ 1383 if (!(connp->conn_flags & IPCL_EAGER)) { 1384 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1385 } 1386 connfp = &ipst->ips_ipcl_conn_fanout[ 1387 IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, 1388 ipst)]; 1389 mutex_enter(&connfp->connf_lock); 1390 for (tconnp = connfp->connf_head; tconnp != NULL; 1391 tconnp = tconnp->conn_next) { 1392 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1393 connp->conn_remv6, connp->conn_srcv6, 1394 connp->conn_ports) && 1395 (tconnp->conn_tcp->tcp_bound_if == 0 || 1396 tconnp->conn_tcp->tcp_bound_if == ifindex)) { 1397 /* Already have a conn. bail out */ 1398 mutex_exit(&connfp->connf_lock); 1399 return (EADDRINUSE); 1400 } 1401 } 1402 if (connp->conn_fanout != NULL) { 1403 /* 1404 * Probably a XTI/TLI application trying to do a 1405 * rebind. Let it happen. 
1406 */ 1407 mutex_exit(&connfp->connf_lock); 1408 IPCL_HASH_REMOVE(connp); 1409 mutex_enter(&connfp->connf_lock); 1410 } 1411 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1412 mutex_exit(&connfp->connf_lock); 1413 break; 1414 1415 case IPPROTO_SCTP: 1416 IPCL_HASH_REMOVE(connp); 1417 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1418 ret = ipcl_sctp_hash_insert(connp, lport); 1419 break; 1420 1421 default: 1422 if (is_system_labeled() && 1423 check_exempt_conflict_v6(connp, ipst)) 1424 return (EADDRINUSE); 1425 /* FALLTHROUGH */ 1426 case IPPROTO_UDP: 1427 up = (uint16_t *)&ports; 1428 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1429 if (protocol == IPPROTO_UDP) { 1430 connfp = &ipst->ips_ipcl_udp_fanout[ 1431 IPCL_UDP_HASH(up[1], ipst)]; 1432 } else { 1433 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1434 } 1435 1436 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1437 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1438 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1439 IPCL_HASH_INSERT_BOUND(connfp, connp); 1440 } else { 1441 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1442 } 1443 break; 1444 } 1445 1446 return (ret); 1447 } 1448 1449 /* 1450 * v4 packet classifying function. looks up the fanout table to 1451 * find the conn, the packet belongs to. returns the conn with 1452 * the reference held, null otherwise. 1453 * 1454 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1455 * Lookup" comment block are applied. Labels are also checked as described 1456 * above. If the packet is from the inside (looped back), and is from the same 1457 * zone, then label checks are omitted. 
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	/* up points at the {src port, dst port} pair in the transport hdr. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		ports = *(uint32_t *)up;
		/* First try a fully-bound (connected) match. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected match; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(
			    tx__ip__log__info__classify__tcp,
			    char *,
			    "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * v6 packet classifying function; IPv6 counterpart of ipcl_classify_v4().
 * Returns the matching conn with a reference held, NULL otherwise.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		/* First try a fully-bound (connected) match. */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No connected match; fall back to the bind (listener) hash. */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}

/*
 * wrapper around ipcl_classify_(v4,v6) routines.
 */
conn_t *
ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
{
	uint16_t hdr_len;
	ipha_t *ipha;
	uint8_t *nexthdrp;

	/* Need at least a v4 header's worth of data to read the version. */
	if (MBLKL(mp) < sizeof (ipha_t))
		return (NULL);

	switch (IPH_HDR_VERSION(mp->b_rptr)) {
	case IPV4_VERSION:
		ipha = (ipha_t *)mp->b_rptr;
		hdr_len = IPH_HDR_LENGTH(ipha);
		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
		    zoneid, ipst));
	case IPV6_VERSION:
		/* Walk any extension headers to find the upper-layer proto. */
		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
		    &hdr_len, &nexthdrp))
			return (NULL);

		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
	}

	return (NULL);
}

/*
 * Classify a raw (non-TCP/UDP) packet.  Two passes over the raw fanout:
 * first the bucket for lport looking for bound or fully-bound matches,
 * then the port-0 bucket for wildcard matches.  Returns the conn with a
 * reference held, NULL otherwise.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Take a reference before dropping the bucket lock. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}

/*
 * kmem cache constructor for TCP conns: the itc_t buffer holds the
 * conn_t immediately followed by the tcp_t, which are cross-linked here.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t 	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	/* NOTE(review): KM_NOSLEEP alloc; consumers presumably tolerate NULL */
	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_ulp = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	return (0);
}

/*
ARGSUSED */ 2029 static void 2030 tcp_conn_destructor(void *buf, void *cdrarg) 2031 { 2032 itc_t *itc = (itc_t *)buf; 2033 conn_t *connp = &itc->itc_conn; 2034 tcp_t *tcp = (tcp_t *)&itc[1]; 2035 2036 ASSERT(connp->conn_flags & IPCL_TCPCONN); 2037 ASSERT(tcp->tcp_connp == connp); 2038 ASSERT(connp->conn_tcp == tcp); 2039 tcp_timermp_free(tcp); 2040 mutex_destroy(&connp->conn_lock); 2041 cv_destroy(&connp->conn_cv); 2042 } 2043 2044 /* ARGSUSED */ 2045 static int 2046 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2047 { 2048 itc_t *itc = (itc_t *)buf; 2049 conn_t *connp = &itc->itc_conn; 2050 2051 bzero(connp, sizeof (conn_t)); 2052 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2053 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2054 connp->conn_flags = IPCL_IPCCONN; 2055 2056 return (0); 2057 } 2058 2059 /* ARGSUSED */ 2060 static void 2061 ip_conn_destructor(void *buf, void *cdrarg) 2062 { 2063 itc_t *itc = (itc_t *)buf; 2064 conn_t *connp = &itc->itc_conn; 2065 2066 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2067 ASSERT(connp->conn_priv == NULL); 2068 mutex_destroy(&connp->conn_lock); 2069 cv_destroy(&connp->conn_cv); 2070 } 2071 2072 /* ARGSUSED */ 2073 static int 2074 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2075 { 2076 itc_t *itc = (itc_t *)buf; 2077 conn_t *connp = &itc->itc_conn; 2078 udp_t *udp = (udp_t *)&itc[1]; 2079 2080 bzero(connp, sizeof (conn_t)); 2081 bzero(udp, sizeof (udp_t)); 2082 2083 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2084 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2085 connp->conn_udp = udp; 2086 connp->conn_flags = IPCL_UDPCONN; 2087 connp->conn_ulp = IPPROTO_UDP; 2088 udp->udp_connp = connp; 2089 return (0); 2090 } 2091 2092 /* ARGSUSED */ 2093 static void 2094 udp_conn_destructor(void *buf, void *cdrarg) 2095 { 2096 itc_t *itc = (itc_t *)buf; 2097 conn_t *connp = &itc->itc_conn; 2098 udp_t *udp = (udp_t *)&itc[1]; 2099 2100 ASSERT(connp->conn_flags & IPCL_UDPCONN); 
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor for raw-IP conn_t's: the conn_t is immediately
 * followed by the icmp_t.  conn_ulp defaults to IPPROTO_ICMP here;
 * NOTE(review): presumably reset for raw sockets of other protocols at
 * open/bind time elsewhere -- confirm.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_ulp = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	return (0);
}

/* kmem cache destructor for raw-IP conn_t's. */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * kmem cache constructor for routing-socket conn_t's: the conn_t is
 * immediately followed by the rts_t in the cache object.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	return (0);
}

/* kmem cache destructor for routing-socket conn_t's. */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
}

/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.  All cached/linked state (IRE cache, IPsec latch,
 * fanout linkage, credentials, ...) must already have been released by
 * the caller; the asserts below enforce that.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ASSERT(connp->conn_ire_cache == NULL);
	ASSERT(connp->conn_latch == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
#ifdef notdef
	/*
	 * The ill and ipif pointers are not cleared before the conn_t
	 * goes away since they do not hold a reference on the ill/ipif.
	 * We should replace these pointers with ifindex/ipaddr_t to
	 * make the code less complex.
	 */
	ASSERT(connp->conn_xmit_if_ill == NULL);
	ASSERT(connp->conn_nofailover_ill == NULL);
	ASSERT(connp->conn_outgoing_ill == NULL);
	ASSERT(connp->conn_incoming_ill == NULL);
	ASSERT(connp->conn_outgoing_pill == NULL);
	ASSERT(connp->conn_multicast_ipif == NULL);
	ASSERT(connp->conn_multicast_ill == NULL);
#endif
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
	ASSERT(connp->conn_idl == NULL);
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
	ASSERT(connp->conn_peercred == NULL);
	ASSERT(connp->conn_netstack == NULL);

	/*
	 * Clear out the conn_t fields that are not preserved: everything
	 * from conn_start_clr to the end of the structure is zeroed,
	 * while fields before conn_start_clr survive for cache reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));

}

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round robin multi-list locks are held only to get the
 * next member/insertion/deletion and contention should be negligible
 * if the multi-list is much greater than the number of cpus.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int		index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.  The mask below relies on
	 * CONN_G_HASH_SIZE being a power of two.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}

/* Remove a conn from the global multi-list; inverse of the insert above. */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Standard doubly-linked-list unlink under the bucket lock. */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}

/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * IPCs may hold pointers to ipif/ill. To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns that are being born or torn down. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			/*
			 * Drop the bucket lock around the callback; the
			 * reference just taken keeps connp (and hence its
			 * conn_g_next linkage) valid in the meantime.
			 * The previous conn's reference is only released
			 * after the lock is reacquired.
			 */
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Build the ports key with fport/lport swapped (reverse lookup). */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* A conn can never be its own loopback peer. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Build the ports key with fport/lport swapped (reverse lookup). */
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip tcp_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* A conn can never be its own loopback peer. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	/* Build the ports key with fport/lport swapped (reverse lookup). */
	pports = (uint16_t *)&ports;
	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Accept any connection whose state is at least min_state. */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t		*tcp;
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	/* Build the ports key with fport/lport swapped (reverse lookup). */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * A conn not bound to an interface (tcp_bound_if == 0)
		 * matches any ifindex.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tcp->tcp_bound_if == 0 ||
		    tcp->tcp_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * NOTE(review): tcp_listener == NULL presumably filters
		 * out eager conns hanging off a listener -- confirm.
		 */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * Unlike the v4 variant, also require the bound interface
		 * to match (tcp_bound_if == 0 matches any ifindex).
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_bound_if == 0 ||
		    tcp->tcp_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start from the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Release the hold the caller had on the previous conn. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
	int		last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the circular trace-buffer index, wrapping at the end. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	/* Record the caller's stack in the slot just claimed. */
	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

/* Same trace mechanism as conn_trace_ref, recorded on refrele. */
int
conn_untrace_ref(conn_t *connp)
{
	int		last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the circular trace-buffer index, wrapping at the end. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	/* Record the caller's stack in the slot just claimed. */
	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif