1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 const char ipclassifier_version[] = "@(#)ipclassifier.c %I% %E% SMI"; 29 30 /* 31 * IP PACKET CLASSIFIER 32 * 33 * The IP packet classifier provides mapping between IP packets and persistent 34 * connection state for connection-oriented protocols. It also provides an 35 * interface for managing connection states. 36 * 37 * The connection state is kept in a conn_t data structure and contains, among 38 * other things: 39 * 40 * o local/remote address and ports 41 * o Transport protocol 42 * o squeue for the connection (for TCP only) 43 * o reference counter 44 * o Connection state 45 * o hash table linkage 46 * o interface/ire information 47 * o credentials 48 * o ipsec policy 49 * o send and receive functions. 50 * o mutex lock. 51 * 52 * Connections use a reference counting scheme. They are freed when the 53 * reference counter drops to zero. A reference is incremented when the connection 54 * is placed in a list or table, when an incoming packet for the connection arrives, 55 * and when the connection is processed via squeue (squeue processing may be 56 * asynchronous and the reference protects the connection from being destroyed 57 * before its processing is finished). 58 * 59 * The send and receive functions are currently used for TCP only. The send function 60 * determines the IP entry point for the packet once it leaves TCP to be sent to 61 * the destination address. The receive function is used by IP when the packet 62 * should be passed for TCP processing. When a new connection is created, these 63 * are set to ip_output() and tcp_input() respectively. During the lifetime of 64 * the connection the send and receive functions may change depending on the 65 * changes in the connection state. For example, once the connection is bound to 66 * an address, the receive function for this connection is set to 67 * tcp_conn_request(). This allows incoming SYNs to go directly into the 68 * listener SYN processing function without going to tcp_input() first. 69 * 70 * The classifier uses several hash tables: 71 * 72 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 73 * ipcl_bind_fanout: contains all connections in BOUND state 74 * ipcl_proto_fanout: IPv4 protocol fanout 75 * ipcl_proto_fanout_v6: IPv6 protocol fanout 76 * ipcl_udp_fanout: contains all UDP connections 77 * ipcl_globalhash_fanout: contains all connections 78 * 79 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 80 * which need to view all existing connections.
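 *
 * As a worked example of the reference counting above (illustrative only,
 * based on the invariants stated elsewhere in this file): a fully bound TCP
 * connection that sits in ipcl_conn_fanout while one of its packets is being
 * processed on its squeue holds
 *
 *	conn_ref == 3	(one held by TCP, one for the hash table linkage,
 *			 one for the in-progress squeue processing)
 *
 * and drops back to conn_ref == 2 once the squeue processing finishes; that
 * is the value the time-wait collector later checks for before taking the
 * fast removal path (see ipcl_hash_remove_locked() below).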
81 * 82 * All tables are protected by per-bucket locks. When both per-bucket lock and 83 * connection lock need to be held, the per-bucket lock should be acquired 84 * first, followed by the connection lock. 85 * 86 * All functions that search one of these tables increment a reference 87 * counter on the connection found (if any). This reference should be dropped 88 * when the caller has finished processing the connection. 89 * 90 * 91 * INTERFACES: 92 * =========== 93 * 94 * Connection Lookup: 95 * ------------------ 96 * 97 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid) 98 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid) 99 * 100 * Finds the connection for an incoming IPv4 or IPv6 packet. Returns NULL if 101 * it can't find any associated connection. If the connection is found, its 102 * reference counter is incremented. 103 * 104 * mp: mblk containing the packet header. The full header should fit 105 * into a single mblk. It should also contain at least the full IP 106 * and TCP or UDP header. 107 * 108 * protocol: Either IPPROTO_TCP or IPPROTO_UDP. 109 * 110 * hdr_len: The size of the IP header. It is used to find the TCP or UDP header in 111 * the packet. 112 * 113 * zoneid: The zone in which the returned connection must be; the zoneid 114 * corresponding to the ire_zoneid on the IRE located for the 115 * packet's destination address. 116 * 117 * For TCP connections, the lookup order is as follows: 118 * 5-tuple {src, dst, protocol, local port, remote port} 119 * lookup in ipcl_conn_fanout table. 120 * 3-tuple {dst, remote port, protocol} lookup in 121 * ipcl_bind_fanout table. 122 * 123 * For UDP connections, a 5-tuple {src, dst, protocol, local port, 124 * remote port} lookup is done on ipcl_udp_fanout. Note that 125 * these interfaces do not handle cases where a packet belongs 126 * to multiple UDP clients; that case is handled in IP itself. 127 * 128 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must 129 * determine which actual zone gets the segment. This is used only in a 130 * labeled environment. The matching rules are: 131 * 132 * - If it's not a multilevel port, then the label on the packet selects 133 * the zone. Unlabeled packets are delivered to the global zone. 134 * 135 * - If it's a multilevel port, then only the zone registered to receive 136 * packets on that port matches. 137 * 138 * Also, in a labeled environment, packet labels need to be checked. For fully 139 * bound TCP connections, we can assume that the packet label was checked 140 * during connection establishment, and doesn't need to be checked on each 141 * packet. For others, though, we need to check for strict equality or, for 142 * multilevel ports, membership in the range or set. This part currently does 143 * a tnrh lookup on each packet, but could be optimized to use cached results 144 * if that were necessary. (SCTP doesn't come through here, but if it did, 145 * we would apply the same rules as TCP.) 146 * 147 * An implication of the above is that fully-bound TCP sockets must always use 148 * distinct 4-tuples; they can't be discriminated by label alone. 149 * 150 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets, 151 * as there's no connection set-up handshake and no shared state. 152 * 153 * Labels on looped-back packets within a single zone do not need to be 154 * checked, as all processes in the same zone have the same label. 155 * 156 * Finally, for unlabeled packets received by a labeled system, special rules 157 * apply.
We consider only the MLP if there is one. Otherwise, we prefer a 158 * socket in the zone whose label matches the default label of the sender, if 159 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the 160 * receiver's label must dominate the sender's default label. 161 * 162 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int); 163 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t); 164 * 165 * Lookup routine to find an exact match for {src, dst, local port, 166 * remote port} for TCP connections in ipcl_conn_fanout. The addresses and 167 * ports are read from the IP and TCP headers respectively. 168 * 169 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol); 170 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex); 171 * 172 * Lookup routine to find a listener with the tuple {lport, laddr, 173 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional 174 * parameter, the interface index, is also compared. 175 * 176 * void ipcl_walk(func, arg) 177 * 178 * Apply 'func' to every connection available. The 'func' is called as 179 * (*func)(connp, arg). The walk is non-atomic so connections may be 180 * created and destroyed during the walk. The CONN_CONDEMNED and 181 * CONN_INCIPIENT flags ensure that connections which are newly created 182 * or being destroyed are not selected by the walker. 183 * 184 * Table Updates 185 * ------------- 186 * 187 * int ipcl_conn_insert(connp, protocol, src, dst, ports) 188 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex) 189 * 190 * Insert 'connp' in the ipcl_conn_fanout. 191 * Arguments : 192 * connp conn_t to be inserted 193 * protocol connection protocol 194 * src source address 195 * dst destination address 196 * ports local and remote port 197 * ifindex interface index for IPv6 connections 198 * 199 * Return value : 200 * 0 if connp was inserted 201 * EADDRINUSE if the connection with the same tuple 202 * already exists. 203 * 204 * int ipcl_bind_insert(connp, protocol, src, lport); 205 * int ipcl_bind_insert_v6(connp, protocol, src, lport); 206 * 207 * Insert 'connp' in ipcl_bind_fanout. 208 * Arguments : 209 * connp conn_t to be inserted 210 * protocol connection protocol 211 * src source address the connection wants 212 * to bind to 213 * lport local port the connection wants to 214 * bind to 215 * 216 * 217 * void ipcl_hash_remove(connp); 218 * 219 * Removes the 'connp' from the connection fanout table. 220 * 221 * Connection Creation/Destruction 222 * ------------------------------- 223 * 224 * conn_t *ipcl_conn_create(type, sleep) 225 * 226 * Creates a new conn based on the type flag and inserts it into 227 * the globalhash table. 228 * 229 * type: This flag determines the type of conn_t which needs to be 230 * created. 231 * IPCL_TCPCONN indicates a TCP connection 232 * IPCL_IPCCONN indicates all non-TCP connections. 233 * 234 * void ipcl_conn_destroy(connp) 235 * 236 * Destroys the connection state, removes it from the global 237 * connection hash table and frees its memory.
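 *
 * Illustrative sketch of how the interfaces above fit together for a simple
 * non-TCP endpoint (error handling and the transport's own locking omitted;
 * this is not code from this file):
 *
 *	conn_t *connp;
 *
 *	connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP);
 *	...
 *	(void) ipcl_bind_insert(connp, protocol, src, lport);
 *
 *	The conn is now visible to ipcl_classify_v4()/ipcl_classify_v6();
 *	every successful lookup returns it with an extra reference that the
 *	caller drops with CONN_DEC_REF() when done. On teardown:
 *
 *	ipcl_hash_remove(connp);
 *	CONN_DEC_REF(connp);	the conn is destroyed via ipcl_conn_destroy()
 *				once the count reaches zero.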
238 */ 239 240 #include <sys/types.h> 241 #include <sys/stream.h> 242 #include <sys/stropts.h> 243 #include <sys/sysmacros.h> 244 #include <sys/strsubr.h> 245 #include <sys/strsun.h> 246 #define _SUN_TPI_VERSION 2 247 #include <sys/ddi.h> 248 #include <sys/cmn_err.h> 249 #include <sys/debug.h> 250 251 #include <sys/systm.h> 252 #include <sys/param.h> 253 #include <sys/kmem.h> 254 #include <sys/isa_defs.h> 255 #include <inet/common.h> 256 #include <netinet/ip6.h> 257 #include <netinet/icmp6.h> 258 259 #include <inet/ip.h> 260 #include <inet/ip6.h> 261 #include <inet/tcp.h> 262 #include <inet/ip_ndp.h> 263 #include <inet/udp_impl.h> 264 #include <inet/sctp_ip.h> 265 266 #include <sys/cpuvar.h> 267 268 #include <inet/ipclassifier.h> 269 #include <inet/ipsec_impl.h> 270 271 #include <sys/tsol/tnet.h> 272 273 #ifdef DEBUG 274 #define IPCL_DEBUG 275 #else 276 #undef IPCL_DEBUG 277 #endif 278 279 #ifdef IPCL_DEBUG 280 int ipcl_debug_level = 0; 281 #define IPCL_DEBUG_LVL(level, args) \ 282 if (ipcl_debug_level & level) { printf args; } 283 #else 284 #define IPCL_DEBUG_LVL(level, args) {; } 285 #endif 286 connf_t *ipcl_conn_fanout; 287 connf_t *ipcl_bind_fanout; 288 connf_t ipcl_proto_fanout[IPPROTO_MAX + 1]; 289 connf_t ipcl_proto_fanout_v6[IPPROTO_MAX + 1]; 290 connf_t *ipcl_udp_fanout; 291 292 /* A separate hash list for raw sockets. */ 293 connf_t *ipcl_raw_fanout; 294 295 connf_t rts_clients; 296 297 /* Old value for compatibility */ 298 uint_t tcp_conn_hash_size = 0; 299 300 /* New value. Zero means choose automatically. */ 301 uint_t ipcl_conn_hash_size = 0; 302 uint_t ipcl_conn_hash_memfactor = 8192; 303 uint_t ipcl_conn_hash_maxsize = 82500; 304 305 uint_t ipcl_conn_fanout_size = 0; 306 307 308 /* bind/udp fanout table size */ 309 uint_t ipcl_bind_fanout_size = 512; 310 uint_t ipcl_udp_fanout_size = 16384; 311 312 /* Raw socket fanout size. Must be a power of 2. */ 313 uint_t ipcl_raw_fanout_size = 256; 314 315 /* 316 * Power-of-2^N primes useful for hashing for N of 0-28; 317 * these primes are the nearest prime <= 2^N - 2^(N-2). 318 */ 319 320 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 321 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 322 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 323 50331599, 100663291, 201326557, 0} 324 325 /* 326 * wrapper structure to ensure that conn+tcpb are aligned 327 * on cache lines. 328 */ 329 typedef struct itc_s { 330 union { 331 conn_t itcu_conn; 332 char itcu_filler[CACHE_ALIGN(conn_s)]; 333 } itc_u; 334 tcp_t itc_tcp; 335 } itc_t; 336 337 #define itc_conn itc_u.itcu_conn 338 339 struct kmem_cache *ipcl_tcpconn_cache; 340 struct kmem_cache *ipcl_tcp_cache; 341 struct kmem_cache *ipcl_conn_cache; 342 extern struct kmem_cache *sctp_conn_cache; 343 extern struct kmem_cache *tcp_sack_info_cache; 344 extern struct kmem_cache *tcp_iphc_cache; 345 346 extern void tcp_timermp_free(tcp_t *); 347 extern mblk_t *tcp_timermp_alloc(int); 348 349 static int ipcl_tcpconn_constructor(void *, void *, int); 350 static void ipcl_tcpconn_destructor(void *, void *); 351 352 static int conn_g_index; 353 connf_t *ipcl_globalhash_fanout; 354 355 #ifdef IPCL_DEBUG 356 #define INET_NTOA_BUFSIZE 18 357 358 static char * 359 inet_ntoa_r(uint32_t in, char *b) 360 { 361 unsigned char *p; 362 363 p = (unsigned char *)&in; 364 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 365 return (b); 366 } 367 #endif 368 369 /* 370 * ipclassifier initialization routine, sets up hash tables and 371 * conn caches.
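 *
 * As an illustrative, worked example of the sizing logic below (the memory
 * figure is hypothetical): with 1 GB of free memory and the default
 * ipcl_conn_hash_memfactor of 8192, the computed size is 1073741824 / 8192 =
 * 131072. That exceeds ipcl_conn_hash_maxsize (82500), so it is capped to
 * 82500 and then rounded up to the next entry of the prime table above,
 * 98299 (the nearest prime <= 2^17 - 2^15 = 98304).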
372 */ 373 void 374 ipcl_init(void) 375 { 376 int i; 377 int sizes[] = P2Ps(); 378 379 ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache", 380 sizeof (conn_t), CACHE_ALIGN_SIZE, 381 NULL, NULL, NULL, NULL, NULL, 0); 382 383 ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache", 384 sizeof (itc_t), CACHE_ALIGN_SIZE, 385 ipcl_tcpconn_constructor, ipcl_tcpconn_destructor, 386 NULL, NULL, NULL, 0); 387 388 /* 389 * Calculate size of conn fanout table. 390 */ 391 if (ipcl_conn_hash_size != 0) { 392 ipcl_conn_fanout_size = ipcl_conn_hash_size; 393 } else if (tcp_conn_hash_size != 0) { 394 ipcl_conn_fanout_size = tcp_conn_hash_size; 395 } else { 396 extern pgcnt_t freemem; 397 398 ipcl_conn_fanout_size = 399 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 400 401 if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) 402 ipcl_conn_fanout_size = ipcl_conn_hash_maxsize; 403 } 404 405 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 406 if (sizes[i] >= ipcl_conn_fanout_size) { 407 break; 408 } 409 } 410 if ((ipcl_conn_fanout_size = sizes[i]) == 0) { 411 /* Out of range, use the 2^16 value */ 412 ipcl_conn_fanout_size = sizes[16]; 413 } 414 ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size * 415 sizeof (*ipcl_conn_fanout), KM_SLEEP); 416 417 for (i = 0; i < ipcl_conn_fanout_size; i++) { 418 mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL, 419 MUTEX_DEFAULT, NULL); 420 } 421 422 ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size * 423 sizeof (*ipcl_bind_fanout), KM_SLEEP); 424 425 for (i = 0; i < ipcl_bind_fanout_size; i++) { 426 mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL, 427 MUTEX_DEFAULT, NULL); 428 } 429 430 for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) { 431 mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL, 432 MUTEX_DEFAULT, NULL); 433 } 434 for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) { 435 mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL, 436 MUTEX_DEFAULT, NULL); 437 } 438 439 mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL); 440 441 ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size * 442 sizeof (*ipcl_udp_fanout), KM_SLEEP); 443 444 for (i = 0; i < ipcl_udp_fanout_size; i++) { 445 mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL, 446 MUTEX_DEFAULT, NULL); 447 } 448 449 ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size * 450 sizeof (*ipcl_raw_fanout), KM_SLEEP); 451 452 for (i = 0; i < ipcl_raw_fanout_size; i++) { 453 mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL, 454 MUTEX_DEFAULT, NULL); 455 } 456 457 ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) * 458 CONN_G_HASH_SIZE, KM_SLEEP); 459 460 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 461 mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL, 462 MUTEX_DEFAULT, NULL); 463 } 464 } 465 466 void 467 ipcl_destroy(void) 468 { 469 int i; 470 kmem_cache_destroy(ipcl_conn_cache); 471 kmem_cache_destroy(ipcl_tcpconn_cache); 472 for (i = 0; i < ipcl_conn_fanout_size; i++) 473 mutex_destroy(&ipcl_conn_fanout[i].connf_lock); 474 kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size * 475 sizeof (*ipcl_conn_fanout)); 476 for (i = 0; i < ipcl_bind_fanout_size; i++) 477 mutex_destroy(&ipcl_bind_fanout[i].connf_lock); 478 kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size * 479 sizeof (*ipcl_bind_fanout)); 480 481 for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) 482 mutex_destroy(&ipcl_proto_fanout[i].connf_lock); 483 for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) 484 mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock); 485 486 for (i = 0; i < 
ipcl_udp_fanout_size; i++) 487 mutex_destroy(&ipcl_udp_fanout[i].connf_lock); 488 kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size * 489 sizeof (*ipcl_udp_fanout)); 490 491 for (i = 0; i < ipcl_raw_fanout_size; i++) 492 mutex_destroy(&ipcl_raw_fanout[i].connf_lock); 493 kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size * 494 sizeof (*ipcl_raw_fanout)); 495 496 kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE); 497 mutex_destroy(&rts_clients.connf_lock); 498 } 499 500 /* 501 * conn creation routine. initialize the conn, sets the reference 502 * and inserts it in the global hash table. 503 */ 504 conn_t * 505 ipcl_conn_create(uint32_t type, int sleep) 506 { 507 itc_t *itc; 508 conn_t *connp; 509 510 switch (type) { 511 case IPCL_TCPCONN: 512 if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache, 513 sleep)) == NULL) 514 return (NULL); 515 connp = &itc->itc_conn; 516 connp->conn_ref = 1; 517 IPCL_DEBUG_LVL(1, 518 ("ipcl_conn_create: connp = %p tcp (%p)", 519 (void *)connp, (void *)connp->conn_tcp)); 520 ipcl_globalhash_insert(connp); 521 break; 522 case IPCL_SCTPCONN: 523 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 524 return (NULL); 525 connp->conn_flags = IPCL_SCTPCONN; 526 break; 527 case IPCL_IPCCONN: 528 connp = kmem_cache_alloc(ipcl_conn_cache, sleep); 529 if (connp == NULL) 530 return (NULL); 531 bzero(connp, sizeof (conn_t)); 532 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 533 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 534 connp->conn_flags = IPCL_IPCCONN; 535 connp->conn_ref = 1; 536 IPCL_DEBUG_LVL(1, 537 ("ipcl_conn_create: connp = %p\n", (void *)connp)); 538 ipcl_globalhash_insert(connp); 539 break; 540 default: 541 connp = NULL; 542 ASSERT(0); 543 } 544 545 return (connp); 546 } 547 548 void 549 ipcl_conn_destroy(conn_t *connp) 550 { 551 mblk_t *mp; 552 553 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 554 ASSERT(connp->conn_ref == 0); 555 ASSERT(connp->conn_ire_cache == NULL); 556 557 if (connp->conn_peercred != NULL && 558 connp->conn_peercred != connp->conn_cred) 559 crfree(connp->conn_peercred); 560 connp->conn_peercred = NULL; 561 562 if (connp->conn_cred != NULL) { 563 crfree(connp->conn_cred); 564 connp->conn_cred = NULL; 565 } 566 567 ipcl_globalhash_remove(connp); 568 569 cv_destroy(&connp->conn_cv); 570 if (connp->conn_flags & IPCL_TCPCONN) { 571 tcp_t *tcp = connp->conn_tcp; 572 573 mutex_destroy(&connp->conn_lock); 574 ASSERT(connp->conn_tcp != NULL); 575 tcp_free(tcp); 576 mp = tcp->tcp_timercache; 577 tcp->tcp_cred = NULL; 578 579 if (tcp->tcp_sack_info != NULL) { 580 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 581 kmem_cache_free(tcp_sack_info_cache, 582 tcp->tcp_sack_info); 583 } 584 if (tcp->tcp_iphc != NULL) { 585 if (tcp->tcp_hdr_grown) { 586 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 587 } else { 588 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 589 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 590 } 591 tcp->tcp_iphc_len = 0; 592 } 593 ASSERT(tcp->tcp_iphc_len == 0); 594 595 if (connp->conn_latch != NULL) 596 IPLATCH_REFRELE(connp->conn_latch); 597 if (connp->conn_policy != NULL) 598 IPPH_REFRELE(connp->conn_policy); 599 bzero(connp, sizeof (itc_t)); 600 601 tcp->tcp_timercache = mp; 602 connp->conn_tcp = tcp; 603 connp->conn_flags = IPCL_TCPCONN; 604 connp->conn_ulp = IPPROTO_TCP; 605 tcp->tcp_connp = connp; 606 kmem_cache_free(ipcl_tcpconn_cache, connp); 607 } else if (connp->conn_flags & IPCL_SCTPCONN) { 608 sctp_free(connp); 609 } else { 610 ASSERT(connp->conn_udp == NULL); 611 
mutex_destroy(&connp->conn_lock); 612 kmem_cache_free(ipcl_conn_cache, connp); 613 } 614 } 615 616 /* 617 * Running in cluster mode - deregister listener information 618 */ 619 620 static void 621 ipcl_conn_unlisten(conn_t *connp) 622 { 623 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 624 ASSERT(connp->conn_lport != 0); 625 626 if (cl_inet_unlisten != NULL) { 627 sa_family_t addr_family; 628 uint8_t *laddrp; 629 630 if (connp->conn_pkt_isv6) { 631 addr_family = AF_INET6; 632 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 633 } else { 634 addr_family = AF_INET; 635 laddrp = (uint8_t *)&connp->conn_bound_source; 636 } 637 (*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp, 638 connp->conn_lport); 639 } 640 connp->conn_flags &= ~IPCL_CL_LISTENER; 641 } 642 643 /* 644 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 645 * which table the conn belonged to) so that for debugging we can see which hash 646 * table this connection was in. 647 */ 648 #define IPCL_HASH_REMOVE(connp) { \ 649 connf_t *connfp = (connp)->conn_fanout; \ 650 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 651 if (connfp != NULL) { \ 652 IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ 653 (void *)(connp))); \ 654 mutex_enter(&connfp->connf_lock); \ 655 if ((connp)->conn_next != NULL) \ 656 (connp)->conn_next->conn_prev = \ 657 (connp)->conn_prev; \ 658 if ((connp)->conn_prev != NULL) \ 659 (connp)->conn_prev->conn_next = \ 660 (connp)->conn_next; \ 661 else \ 662 connfp->connf_head = (connp)->conn_next; \ 663 (connp)->conn_fanout = NULL; \ 664 (connp)->conn_next = NULL; \ 665 (connp)->conn_prev = NULL; \ 666 (connp)->conn_flags |= IPCL_REMOVED; \ 667 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 668 ipcl_conn_unlisten((connp)); \ 669 CONN_DEC_REF((connp)); \ 670 mutex_exit(&connfp->connf_lock); \ 671 } \ 672 } 673 674 void 675 ipcl_hash_remove(conn_t *connp) 676 { 677 IPCL_HASH_REMOVE(connp); 678 } 679 680 /* 681 * The whole purpose of this function is to allow removal of 682 * a conn_t from the connected hash for timewait reclaim. 683 * This is essentially a TW reclaim fastpath where the timewait 684 * collector checks under the fanout lock (so no one else can 685 * get access to the conn_t) that the refcnt is 2, i.e. one for 686 * TCP and one for the classifier hash list. If the ref count 687 * is indeed 2, we can just remove the conn under the lock and 688 * avoid cleaning up the conn under the squeue. This gives us 689 * improved performance.
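 *
 * A minimal sketch of the expected calling pattern (illustrative; the actual
 * caller is the TCP time-wait collector, which lives outside this file).
 * Note the lock ordering: the per-bucket lock is taken before conn_lock, as
 * required by the locking rules at the top of this file.
 *
 *	mutex_enter(&connfp->connf_lock);
 *	mutex_enter(&connp->conn_lock);
 *	if (connp->conn_ref == 2) {
 *		ipcl_hash_remove_locked(connp, connfp);
 *		...	conn is off the hash; no squeue cleanup needed
 *	}
 *	mutex_exit(&connp->conn_lock);
 *	mutex_exit(&connfp->connf_lock);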
690 */ 691 void 692 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 693 { 694 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 695 ASSERT(MUTEX_HELD(&connp->conn_lock)); 696 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 697 698 if ((connp)->conn_next != NULL) { 699 (connp)->conn_next->conn_prev = 700 (connp)->conn_prev; 701 } 702 if ((connp)->conn_prev != NULL) { 703 (connp)->conn_prev->conn_next = 704 (connp)->conn_next; 705 } else { 706 connfp->connf_head = (connp)->conn_next; 707 } 708 (connp)->conn_fanout = NULL; 709 (connp)->conn_next = NULL; 710 (connp)->conn_prev = NULL; 711 (connp)->conn_flags |= IPCL_REMOVED; 712 ASSERT((connp)->conn_ref == 2); 713 (connp)->conn_ref--; 714 } 715 716 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 717 ASSERT((connp)->conn_fanout == NULL); \ 718 ASSERT((connp)->conn_next == NULL); \ 719 ASSERT((connp)->conn_prev == NULL); \ 720 if ((connfp)->connf_head != NULL) { \ 721 (connfp)->connf_head->conn_prev = (connp); \ 722 (connp)->conn_next = (connfp)->connf_head; \ 723 } \ 724 (connp)->conn_fanout = (connfp); \ 725 (connfp)->connf_head = (connp); \ 726 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 727 IPCL_CONNECTED; \ 728 CONN_INC_REF(connp); \ 729 } 730 731 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 732 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 733 "connp %p", (void *)(connfp), (void *)(connp))); \ 734 IPCL_HASH_REMOVE((connp)); \ 735 mutex_enter(&(connfp)->connf_lock); \ 736 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 737 mutex_exit(&(connfp)->connf_lock); \ 738 } 739 740 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 741 conn_t *pconnp = NULL, *nconnp; \ 742 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 743 "connp %p", (void *)connfp, (void *)(connp))); \ 744 IPCL_HASH_REMOVE((connp)); \ 745 mutex_enter(&(connfp)->connf_lock); \ 746 nconnp = (connfp)->connf_head; \ 747 while (nconnp != NULL && \ 748 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { \ 749 pconnp = nconnp; \ 750 nconnp = nconnp->conn_next; \ 751 } \ 752 if (pconnp != NULL) { \ 753 pconnp->conn_next = (connp); \ 754 (connp)->conn_prev = pconnp; \ 755 } else { \ 756 (connfp)->connf_head = (connp); \ 757 } \ 758 if (nconnp != NULL) { \ 759 (connp)->conn_next = nconnp; \ 760 nconnp->conn_prev = (connp); \ 761 } \ 762 (connp)->conn_fanout = (connfp); \ 763 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 764 IPCL_BOUND; \ 765 CONN_INC_REF(connp); \ 766 mutex_exit(&(connfp)->connf_lock); \ 767 } 768 769 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 770 conn_t **list, *prev, *next; \ 771 boolean_t isv4mapped = \ 772 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 773 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 774 "connp %p", (void *)(connfp), (void *)(connp))); \ 775 IPCL_HASH_REMOVE((connp)); \ 776 mutex_enter(&(connfp)->connf_lock); \ 777 list = &(connfp)->connf_head; \ 778 prev = NULL; \ 779 while ((next = *list) != NULL) { \ 780 if (isv4mapped && \ 781 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 782 connp->conn_zoneid == next->conn_zoneid) { \ 783 (connp)->conn_next = next; \ 784 if (prev != NULL) \ 785 prev = next->conn_prev; \ 786 next->conn_prev = (connp); \ 787 break; \ 788 } \ 789 list = &next->conn_next; \ 790 prev = next; \ 791 } \ 792 (connp)->conn_prev = prev; \ 793 *list = (connp); \ 794 (connp)->conn_fanout = (connfp); \ 795 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 796 IPCL_BOUND; \ 797 CONN_INC_REF((connp)); \ 798 
mutex_exit(&(connfp)->connf_lock); \ 799 } 800 801 void 802 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 803 { 804 ASSERT(!connp->conn_mac_exempt); 805 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 806 } 807 808 void 809 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 810 { 811 connf_t *connfp; 812 813 ASSERT(connp != NULL); 814 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 815 protocol == IPPROTO_ESP); 816 817 connp->conn_ulp = protocol; 818 819 /* Insert it in the protocol hash */ 820 connfp = &ipcl_proto_fanout[protocol]; 821 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 822 } 823 824 void 825 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 826 { 827 connf_t *connfp; 828 829 ASSERT(connp != NULL); 830 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 831 protocol == IPPROTO_ESP); 832 833 connp->conn_ulp = protocol; 834 835 /* Insert it in the Bind Hash */ 836 connfp = &ipcl_proto_fanout_v6[protocol]; 837 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 838 } 839 840 /* 841 * This function is used only for inserting SCTP raw socket now. 842 * This may change later. 843 * 844 * Note that only one raw socket can be bound to a port. The param 845 * lport is in network byte order. 846 */ 847 static int 848 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 849 { 850 connf_t *connfp; 851 conn_t *oconnp; 852 853 connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))]; 854 855 /* Check for existing raw socket already bound to the port. */ 856 mutex_enter(&connfp->connf_lock); 857 for (oconnp = connfp->connf_head; oconnp != NULL; 858 oconnp = oconnp->conn_next) { 859 if (oconnp->conn_lport == lport && 860 oconnp->conn_zoneid == connp->conn_zoneid && 861 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 862 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 863 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 864 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 865 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 866 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 867 &connp->conn_srcv6))) { 868 break; 869 } 870 } 871 mutex_exit(&connfp->connf_lock); 872 if (oconnp != NULL) 873 return (EADDRNOTAVAIL); 874 875 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 876 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 877 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 878 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 879 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 880 } else { 881 IPCL_HASH_INSERT_BOUND(connfp, connp); 882 } 883 } else { 884 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 885 } 886 return (0); 887 } 888 889 /* 890 * Check for a MAC exemption conflict on a labeled system. Note that for 891 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 892 * transport layer. This check is for binding all other protocols. 893 * 894 * Returns true if there's a conflict. 
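 *
 * For example (illustrative): a MAC-exempt conn bound to INADDR_ANY for a
 * given protocol conflicts with every other conn of the same address family
 * already in that protocol's fanout, since the pair is neither "both
 * non-exempt" nor "both bound to distinct specific addresses". Two
 * non-exempt conns never conflict here.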
895 */ 896 static boolean_t 897 check_exempt_conflict_v4(conn_t *connp) 898 { 899 connf_t *connfp; 900 conn_t *tconn; 901 902 connfp = &ipcl_proto_fanout[connp->conn_ulp]; 903 mutex_enter(&connfp->connf_lock); 904 for (tconn = connfp->connf_head; tconn != NULL; 905 tconn = tconn->conn_next) { 906 /* We don't allow v4 fallback for v6 raw socket */ 907 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 908 continue; 909 /* If neither is exempt, then there's no conflict */ 910 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 911 continue; 912 /* If both are bound to different specific addrs, ok */ 913 if (connp->conn_src != INADDR_ANY && 914 tconn->conn_src != INADDR_ANY && 915 connp->conn_src != tconn->conn_src) 916 continue; 917 /* These two conflict; fail */ 918 break; 919 } 920 mutex_exit(&connfp->connf_lock); 921 return (tconn != NULL); 922 } 923 924 static boolean_t 925 check_exempt_conflict_v6(conn_t *connp) 926 { 927 connf_t *connfp; 928 conn_t *tconn; 929 930 connfp = &ipcl_proto_fanout[connp->conn_ulp]; 931 mutex_enter(&connfp->connf_lock); 932 for (tconn = connfp->connf_head; tconn != NULL; 933 tconn = tconn->conn_next) { 934 /* We don't allow v4 fallback for v6 raw socket */ 935 if (connp->conn_af_isv6 != tconn->conn_af_isv6) 936 continue; 937 /* If neither is exempt, then there's no conflict */ 938 if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt) 939 continue; 940 /* If both are bound to different addrs, ok */ 941 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && 942 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && 943 !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) 944 continue; 945 /* These two conflict; fail */ 946 break; 947 } 948 mutex_exit(&connfp->connf_lock); 949 return (tconn != NULL); 950 } 951 952 /* 953 * (v4, v6) bind hash insertion routines 954 */ 955 int 956 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) 957 { 958 connf_t *connfp; 959 #ifdef IPCL_DEBUG 960 char buf[INET_NTOA_BUFSIZE]; 961 #endif 962 int ret = 0; 963 964 ASSERT(connp); 965 966 IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " 967 "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); 968 969 connp->conn_ulp = protocol; 970 IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); 971 connp->conn_lport = lport; 972 973 switch (protocol) { 974 default: 975 if (is_system_labeled() && check_exempt_conflict_v4(connp)) 976 return (EADDRINUSE); 977 /* FALLTHROUGH */ 978 case IPPROTO_UDP: 979 if (protocol == IPPROTO_UDP) { 980 IPCL_DEBUG_LVL(64, 981 ("ipcl_bind_insert: connp %p - udp\n", 982 (void *)connp)); 983 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)]; 984 } else { 985 IPCL_DEBUG_LVL(64, 986 ("ipcl_bind_insert: connp %p - protocol\n", 987 (void *)connp)); 988 connfp = &ipcl_proto_fanout[protocol]; 989 } 990 991 if (connp->conn_rem != INADDR_ANY) { 992 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 993 } else if (connp->conn_src != INADDR_ANY) { 994 IPCL_HASH_INSERT_BOUND(connfp, connp); 995 } else { 996 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 997 } 998 break; 999 1000 case IPPROTO_TCP: 1001 1002 /* Insert it in the Bind Hash */ 1003 ASSERT(connp->conn_zoneid != ALL_ZONES); 1004 connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)]; 1005 if (connp->conn_src != INADDR_ANY) { 1006 IPCL_HASH_INSERT_BOUND(connfp, connp); 1007 } else { 1008 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1009 } 1010 if (cl_inet_listen != NULL) { 1011 ASSERT(!connp->conn_pkt_isv6); 1012 connp->conn_flags |= IPCL_CL_LISTENER; 1013 (*cl_inet_listen)(IPPROTO_TCP, AF_INET, 
1014 (uint8_t *)&connp->conn_bound_source, lport); 1015 } 1016 break; 1017 1018 case IPPROTO_SCTP: 1019 ret = ipcl_sctp_hash_insert(connp, lport); 1020 break; 1021 } 1022 1023 return (ret); 1024 } 1025 1026 int 1027 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1028 uint16_t lport) 1029 { 1030 connf_t *connfp; 1031 int ret = 0; 1032 1033 ASSERT(connp); 1034 1035 connp->conn_ulp = protocol; 1036 connp->conn_srcv6 = *src; 1037 connp->conn_lport = lport; 1038 1039 switch (protocol) { 1040 default: 1041 if (is_system_labeled() && check_exempt_conflict_v6(connp)) 1042 return (EADDRINUSE); 1043 /* FALLTHROUGH */ 1044 case IPPROTO_UDP: 1045 if (protocol == IPPROTO_UDP) { 1046 IPCL_DEBUG_LVL(128, 1047 ("ipcl_bind_insert_v6: connp %p - udp\n", 1048 (void *)connp)); 1049 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)]; 1050 } else { 1051 IPCL_DEBUG_LVL(128, 1052 ("ipcl_bind_insert_v6: connp %p - protocol\n", 1053 (void *)connp)); 1054 connfp = &ipcl_proto_fanout_v6[protocol]; 1055 } 1056 1057 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1058 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1059 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1060 IPCL_HASH_INSERT_BOUND(connfp, connp); 1061 } else { 1062 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1063 } 1064 break; 1065 1066 case IPPROTO_TCP: 1067 /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */ 1068 1069 /* Insert it in the Bind Hash */ 1070 ASSERT(connp->conn_zoneid != ALL_ZONES); 1071 connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)]; 1072 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1073 IPCL_HASH_INSERT_BOUND(connfp, connp); 1074 } else { 1075 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1076 } 1077 if (cl_inet_listen != NULL) { 1078 sa_family_t addr_family; 1079 uint8_t *laddrp; 1080 1081 if (connp->conn_pkt_isv6) { 1082 addr_family = AF_INET6; 1083 laddrp = 1084 (uint8_t *)&connp->conn_bound_source_v6; 1085 } else { 1086 addr_family = AF_INET; 1087 laddrp = (uint8_t *)&connp->conn_bound_source; 1088 } 1089 connp->conn_flags |= IPCL_CL_LISTENER; 1090 (*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp, 1091 lport); 1092 } 1093 break; 1094 1095 case IPPROTO_SCTP: 1096 ret = ipcl_sctp_hash_insert(connp, lport); 1097 break; 1098 } 1099 1100 return (ret); 1101 } 1102 1103 /* 1104 * ipcl_conn_hash insertion routines. 1105 */ 1106 int 1107 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, 1108 ipaddr_t rem, uint32_t ports) 1109 { 1110 connf_t *connfp; 1111 uint16_t *up; 1112 conn_t *tconnp; 1113 #ifdef IPCL_DEBUG 1114 char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; 1115 #endif 1116 in_port_t lport; 1117 int ret = 0; 1118 1119 IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " 1120 "dst = %s, ports = %x, protocol = %x", (void *)connp, 1121 inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), 1122 ports, protocol)); 1123 1124 switch (protocol) { 1125 case IPPROTO_TCP: 1126 if (!(connp->conn_flags & IPCL_EAGER)) { 1127 /* 1128 * for a eager connection, i.e connections which 1129 * have just been created, the initialization is 1130 * already done in ip at conn_creation time, so 1131 * we can skip the checks here. 
1132 */ 1133 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1134 } 1135 connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem, 1136 connp->conn_ports)]; 1137 mutex_enter(&connfp->connf_lock); 1138 for (tconnp = connfp->connf_head; tconnp != NULL; 1139 tconnp = tconnp->conn_next) { 1140 if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp, 1141 connp->conn_rem, connp->conn_src, 1142 connp->conn_ports)) { 1143 1144 /* Already have a conn. bail out */ 1145 mutex_exit(&connfp->connf_lock); 1146 return (EADDRINUSE); 1147 } 1148 } 1149 if (connp->conn_fanout != NULL) { 1150 /* 1151 * Probably a XTI/TLI application trying to do a 1152 * rebind. Let it happen. 1153 */ 1154 mutex_exit(&connfp->connf_lock); 1155 IPCL_HASH_REMOVE(connp); 1156 mutex_enter(&connfp->connf_lock); 1157 } 1158 1159 ASSERT(connp->conn_recv != NULL); 1160 1161 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1162 mutex_exit(&connfp->connf_lock); 1163 break; 1164 1165 case IPPROTO_SCTP: 1166 /* 1167 * The raw socket may have already been bound, remove it 1168 * from the hash first. 1169 */ 1170 IPCL_HASH_REMOVE(connp); 1171 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1172 ret = ipcl_sctp_hash_insert(connp, lport); 1173 break; 1174 1175 default: 1176 /* 1177 * Check for conflicts among MAC exempt bindings. For 1178 * transports with port numbers, this is done by the upper 1179 * level per-transport binding logic. For all others, it's 1180 * done here. 1181 */ 1182 if (is_system_labeled() && check_exempt_conflict_v4(connp)) 1183 return (EADDRINUSE); 1184 /* FALLTHROUGH */ 1185 1186 case IPPROTO_UDP: 1187 up = (uint16_t *)&ports; 1188 IPCL_CONN_INIT(connp, protocol, src, rem, ports); 1189 if (protocol == IPPROTO_UDP) { 1190 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])]; 1191 } else { 1192 connfp = &ipcl_proto_fanout[protocol]; 1193 } 1194 1195 if (connp->conn_rem != INADDR_ANY) { 1196 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1197 } else if (connp->conn_src != INADDR_ANY) { 1198 IPCL_HASH_INSERT_BOUND(connfp, connp); 1199 } else { 1200 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1201 } 1202 break; 1203 } 1204 1205 return (ret); 1206 } 1207 1208 int 1209 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, 1210 const in6_addr_t *rem, uint32_t ports, uint_t ifindex) 1211 { 1212 connf_t *connfp; 1213 uint16_t *up; 1214 conn_t *tconnp; 1215 in_port_t lport; 1216 int ret = 0; 1217 1218 switch (protocol) { 1219 case IPPROTO_TCP: 1220 /* Just need to insert a conn struct */ 1221 if (!(connp->conn_flags & IPCL_EAGER)) { 1222 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1223 } 1224 connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6, 1225 connp->conn_ports)]; 1226 mutex_enter(&connfp->connf_lock); 1227 for (tconnp = connfp->connf_head; tconnp != NULL; 1228 tconnp = tconnp->conn_next) { 1229 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, 1230 connp->conn_remv6, connp->conn_srcv6, 1231 connp->conn_ports) && 1232 (tconnp->conn_tcp->tcp_bound_if == 0 || 1233 tconnp->conn_tcp->tcp_bound_if == ifindex)) { 1234 /* Already have a conn. bail out */ 1235 mutex_exit(&connfp->connf_lock); 1236 return (EADDRINUSE); 1237 } 1238 } 1239 if (connp->conn_fanout != NULL) { 1240 /* 1241 * Probably a XTI/TLI application trying to do a 1242 * rebind. Let it happen. 
1243 */ 1244 mutex_exit(&connfp->connf_lock); 1245 IPCL_HASH_REMOVE(connp); 1246 mutex_enter(&connfp->connf_lock); 1247 } 1248 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1249 mutex_exit(&connfp->connf_lock); 1250 break; 1251 1252 case IPPROTO_SCTP: 1253 IPCL_HASH_REMOVE(connp); 1254 lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); 1255 ret = ipcl_sctp_hash_insert(connp, lport); 1256 break; 1257 1258 default: 1259 if (is_system_labeled() && check_exempt_conflict_v6(connp)) 1260 return (EADDRINUSE); 1261 /* FALLTHROUGH */ 1262 case IPPROTO_UDP: 1263 up = (uint16_t *)&ports; 1264 IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); 1265 if (protocol == IPPROTO_UDP) { 1266 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])]; 1267 } else { 1268 connfp = &ipcl_proto_fanout_v6[protocol]; 1269 } 1270 1271 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { 1272 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1273 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { 1274 IPCL_HASH_INSERT_BOUND(connfp, connp); 1275 } else { 1276 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1277 } 1278 break; 1279 } 1280 1281 return (ret); 1282 } 1283 1284 /* 1285 * v4 packet classifying function. looks up the fanout table to 1286 * find the conn, the packet belongs to. returns the conn with 1287 * the reference held, null otherwise. 1288 * 1289 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1290 * Lookup" comment block are applied. Labels are also checked as described 1291 * above. If the packet is from the inside (looped back), and is from the same 1292 * zone, then label checks are omitted. 1293 */ 1294 conn_t * 1295 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid) 1296 { 1297 ipha_t *ipha; 1298 connf_t *connfp, *bind_connfp; 1299 uint16_t lport; 1300 uint16_t fport; 1301 uint32_t ports; 1302 conn_t *connp; 1303 uint16_t *up; 1304 boolean_t shared_addr; 1305 boolean_t unlabeled; 1306 1307 ipha = (ipha_t *)mp->b_rptr; 1308 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1309 1310 switch (protocol) { 1311 case IPPROTO_TCP: 1312 ports = *(uint32_t *)up; 1313 connfp = 1314 &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)]; 1315 mutex_enter(&connfp->connf_lock); 1316 for (connp = connfp->connf_head; connp != NULL; 1317 connp = connp->conn_next) { 1318 if (IPCL_CONN_MATCH(connp, protocol, 1319 ipha->ipha_src, ipha->ipha_dst, ports)) 1320 break; 1321 } 1322 1323 if (connp != NULL) { 1324 /* 1325 * We have a fully-bound TCP connection. 1326 * 1327 * For labeled systems, there's no need to check the 1328 * label here. It's known to be good as we checked 1329 * before allowing the connection to become bound. 1330 */ 1331 CONN_INC_REF(connp); 1332 mutex_exit(&connfp->connf_lock); 1333 return (connp); 1334 } 1335 1336 mutex_exit(&connfp->connf_lock); 1337 1338 lport = up[1]; 1339 unlabeled = B_FALSE; 1340 /* Cred cannot be null on IPv4 */ 1341 if (is_system_labeled()) 1342 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1343 TSLF_UNLABELED) != 0; 1344 shared_addr = (zoneid == ALL_ZONES); 1345 if (shared_addr) { 1346 zoneid = tsol_mlp_findzone(protocol, lport); 1347 /* 1348 * If no shared MLP is found, tsol_mlp_findzone returns 1349 * ALL_ZONES. In that case, we assume it's SLP, and 1350 * search for the zone based on the packet label. 1351 * 1352 * If there is such a zone, we prefer to find a 1353 * connection in it. 
Otherwise, we look for a 1354 * MAC-exempt connection in any zone whose label 1355 * dominates the default label on the packet. 1356 */ 1357 if (zoneid == ALL_ZONES) 1358 zoneid = tsol_packet_to_zoneid(mp); 1359 else 1360 unlabeled = B_FALSE; 1361 } 1362 1363 bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)]; 1364 mutex_enter(&bind_connfp->connf_lock); 1365 for (connp = bind_connfp->connf_head; connp != NULL; 1366 connp = connp->conn_next) { 1367 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1368 lport) && (IPCL_ZONE_MATCH(connp, zoneid) || 1369 (unlabeled && connp->conn_mac_exempt))) 1370 break; 1371 } 1372 1373 /* 1374 * If the matching connection is SLP on a private address, then 1375 * the label on the packet must match the local zone's label. 1376 * Otherwise, it must be in the label range defined by tnrh. 1377 * This is ensured by tsol_receive_label. 1378 */ 1379 if (connp != NULL && is_system_labeled() && 1380 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1381 shared_addr, connp)) { 1382 DTRACE_PROBE3( 1383 tx__ip__log__info__classify__tcp, 1384 char *, 1385 "connp(1) could not receive mp(2)", 1386 conn_t *, connp, mblk_t *, mp); 1387 connp = NULL; 1388 } 1389 1390 if (connp != NULL) { 1391 /* Have a listener at least */ 1392 CONN_INC_REF(connp); 1393 mutex_exit(&bind_connfp->connf_lock); 1394 return (connp); 1395 } 1396 1397 mutex_exit(&bind_connfp->connf_lock); 1398 1399 IPCL_DEBUG_LVL(512, 1400 ("ipcl_classify: couldn't classify mp = %p\n", 1401 (void *)mp)); 1402 break; 1403 1404 case IPPROTO_UDP: 1405 lport = up[1]; 1406 unlabeled = B_FALSE; 1407 /* Cred cannot be null on IPv4 */ 1408 if (is_system_labeled()) 1409 unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags & 1410 TSLF_UNLABELED) != 0; 1411 shared_addr = (zoneid == ALL_ZONES); 1412 if (shared_addr) { 1413 zoneid = tsol_mlp_findzone(protocol, lport); 1414 /* 1415 * If no shared MLP is found, tsol_mlp_findzone returns 1416 * ALL_ZONES. In that case, we assume it's SLP, and 1417 * search for the zone based on the packet label. 1418 * 1419 * If there is such a zone, we prefer to find a 1420 * connection in it. Otherwise, we look for a 1421 * MAC-exempt connection in any zone whose label 1422 * dominates the default label on the packet. 
1423 */ 1424 if (zoneid == ALL_ZONES) 1425 zoneid = tsol_packet_to_zoneid(mp); 1426 else 1427 unlabeled = B_FALSE; 1428 } 1429 fport = up[0]; 1430 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); 1431 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)]; 1432 mutex_enter(&connfp->connf_lock); 1433 for (connp = connfp->connf_head; connp != NULL; 1434 connp = connp->conn_next) { 1435 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1436 fport, ipha->ipha_src) && 1437 (IPCL_ZONE_MATCH(connp, zoneid) || 1438 (unlabeled && connp->conn_mac_exempt))) 1439 break; 1440 } 1441 1442 if (connp != NULL && is_system_labeled() && 1443 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1444 shared_addr, connp)) { 1445 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1446 char *, "connp(1) could not receive mp(2)", 1447 conn_t *, connp, mblk_t *, mp); 1448 connp = NULL; 1449 } 1450 1451 if (connp != NULL) { 1452 CONN_INC_REF(connp); 1453 mutex_exit(&connfp->connf_lock); 1454 return (connp); 1455 } 1456 1457 /* 1458 * We shouldn't come here for multicast/broadcast packets 1459 */ 1460 mutex_exit(&connfp->connf_lock); 1461 IPCL_DEBUG_LVL(512, 1462 ("ipcl_classify: cant find udp conn_t for ports : %x %x", 1463 lport, fport)); 1464 break; 1465 } 1466 1467 return (NULL); 1468 } 1469 1470 conn_t * 1471 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid) 1472 { 1473 ip6_t *ip6h; 1474 connf_t *connfp, *bind_connfp; 1475 uint16_t lport; 1476 uint16_t fport; 1477 tcph_t *tcph; 1478 uint32_t ports; 1479 conn_t *connp; 1480 uint16_t *up; 1481 boolean_t shared_addr; 1482 boolean_t unlabeled; 1483 1484 ip6h = (ip6_t *)mp->b_rptr; 1485 1486 switch (protocol) { 1487 case IPPROTO_TCP: 1488 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 1489 up = (uint16_t *)tcph->th_lport; 1490 ports = *(uint32_t *)up; 1491 1492 connfp = 1493 &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)]; 1494 mutex_enter(&connfp->connf_lock); 1495 for (connp = connfp->connf_head; connp != NULL; 1496 connp = connp->conn_next) { 1497 if (IPCL_CONN_MATCH_V6(connp, protocol, 1498 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1499 break; 1500 } 1501 1502 if (connp != NULL) { 1503 /* 1504 * We have a fully-bound TCP connection. 1505 * 1506 * For labeled systems, there's no need to check the 1507 * label here. It's known to be good as we checked 1508 * before allowing the connection to become bound. 1509 */ 1510 CONN_INC_REF(connp); 1511 mutex_exit(&connfp->connf_lock); 1512 return (connp); 1513 } 1514 1515 mutex_exit(&connfp->connf_lock); 1516 1517 lport = up[1]; 1518 unlabeled = B_FALSE; 1519 /* Cred can be null on IPv6 */ 1520 if (is_system_labeled()) { 1521 cred_t *cr = DB_CRED(mp); 1522 1523 unlabeled = (cr != NULL && 1524 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1525 } 1526 shared_addr = (zoneid == ALL_ZONES); 1527 if (shared_addr) { 1528 zoneid = tsol_mlp_findzone(protocol, lport); 1529 /* 1530 * If no shared MLP is found, tsol_mlp_findzone returns 1531 * ALL_ZONES. In that case, we assume it's SLP, and 1532 * search for the zone based on the packet label. 1533 * 1534 * If there is such a zone, we prefer to find a 1535 * connection in it. Otherwise, we look for a 1536 * MAC-exempt connection in any zone whose label 1537 * dominates the default label on the packet. 
1538 */ 1539 if (zoneid == ALL_ZONES) 1540 zoneid = tsol_packet_to_zoneid(mp); 1541 else 1542 unlabeled = B_FALSE; 1543 } 1544 1545 bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)]; 1546 mutex_enter(&bind_connfp->connf_lock); 1547 for (connp = bind_connfp->connf_head; connp != NULL; 1548 connp = connp->conn_next) { 1549 if (IPCL_BIND_MATCH_V6(connp, protocol, 1550 ip6h->ip6_dst, lport) && 1551 (IPCL_ZONE_MATCH(connp, zoneid) || 1552 (unlabeled && connp->conn_mac_exempt))) 1553 break; 1554 } 1555 1556 if (connp != NULL && is_system_labeled() && 1557 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1558 shared_addr, connp)) { 1559 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1560 char *, "connp(1) could not receive mp(2)", 1561 conn_t *, connp, mblk_t *, mp); 1562 connp = NULL; 1563 } 1564 1565 if (connp != NULL) { 1566 /* Have a listener at least */ 1567 CONN_INC_REF(connp); 1568 mutex_exit(&bind_connfp->connf_lock); 1569 IPCL_DEBUG_LVL(512, 1570 ("ipcl_classify_v6: found listener " 1571 "connp = %p\n", (void *)connp)); 1572 1573 return (connp); 1574 } 1575 1576 mutex_exit(&bind_connfp->connf_lock); 1577 1578 IPCL_DEBUG_LVL(512, 1579 ("ipcl_classify_v6: couldn't classify mp = %p\n", 1580 (void *)mp)); 1581 break; 1582 1583 case IPPROTO_UDP: 1584 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1585 lport = up[1]; 1586 unlabeled = B_FALSE; 1587 /* Cred can be null on IPv6 */ 1588 if (is_system_labeled()) { 1589 cred_t *cr = DB_CRED(mp); 1590 1591 unlabeled = (cr != NULL && 1592 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1593 } 1594 shared_addr = (zoneid == ALL_ZONES); 1595 if (shared_addr) { 1596 zoneid = tsol_mlp_findzone(protocol, lport); 1597 /* 1598 * If no shared MLP is found, tsol_mlp_findzone returns 1599 * ALL_ZONES. In that case, we assume it's SLP, and 1600 * search for the zone based on the packet label. 1601 * 1602 * If there is such a zone, we prefer to find a 1603 * connection in it. Otherwise, we look for a 1604 * MAC-exempt connection in any zone whose label 1605 * dominates the default label on the packet. 1606 */ 1607 if (zoneid == ALL_ZONES) 1608 zoneid = tsol_packet_to_zoneid(mp); 1609 else 1610 unlabeled = B_FALSE; 1611 } 1612 1613 fport = up[0]; 1614 IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, 1615 fport)); 1616 connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)]; 1617 mutex_enter(&connfp->connf_lock); 1618 for (connp = connfp->connf_head; connp != NULL; 1619 connp = connp->conn_next) { 1620 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1621 fport, ip6h->ip6_src) && 1622 (IPCL_ZONE_MATCH(connp, zoneid) || 1623 (unlabeled && connp->conn_mac_exempt))) 1624 break; 1625 } 1626 1627 if (connp != NULL && is_system_labeled() && 1628 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1629 shared_addr, connp)) { 1630 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1631 char *, "connp(1) could not receive mp(2)", 1632 conn_t *, connp, mblk_t *, mp); 1633 connp = NULL; 1634 } 1635 1636 if (connp != NULL) { 1637 CONN_INC_REF(connp); 1638 mutex_exit(&connfp->connf_lock); 1639 return (connp); 1640 } 1641 1642 /* 1643 * We shouldn't come here for multicast/broadcast packets 1644 */ 1645 mutex_exit(&connfp->connf_lock); 1646 IPCL_DEBUG_LVL(512, 1647 ("ipcl_classify_v6: can't find udp conn_t for ports : %x %x", 1648 lport, fport)); 1649 break; 1650 } 1651 1652 return (NULL); 1653 } 1654 1655 /* 1656 * wrapper around the ipcl_classify_(v4,v6) routines.
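 *
 * Typical use (an illustrative sketch, not a caller from this file): IP
 * passes an mblk whose b_rptr points at the IP header together with the
 * zoneid derived from the destination IRE, and drops the returned reference
 * once it is finished with the conn:
 *
 *	conn_t *connp;
 *
 *	if ((connp = ipcl_classify(mp, zoneid)) != NULL) {
 *		... deliver mp to the conn ...
 *		CONN_DEC_REF(connp);
 *	}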
1657 */ 1658 conn_t * 1659 ipcl_classify(mblk_t *mp, zoneid_t zoneid) 1660 { 1661 uint16_t hdr_len; 1662 ipha_t *ipha; 1663 uint8_t *nexthdrp; 1664 1665 if (MBLKL(mp) < sizeof (ipha_t)) 1666 return (NULL); 1667 1668 switch (IPH_HDR_VERSION(mp->b_rptr)) { 1669 case IPV4_VERSION: 1670 ipha = (ipha_t *)mp->b_rptr; 1671 hdr_len = IPH_HDR_LENGTH(ipha); 1672 return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, 1673 zoneid)); 1674 case IPV6_VERSION: 1675 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 1676 &hdr_len, &nexthdrp)) 1677 return (NULL); 1678 1679 return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid)); 1680 } 1681 1682 return (NULL); 1683 } 1684 1685 conn_t * 1686 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, 1687 uint32_t ports, ipha_t *hdr) 1688 { 1689 connf_t *connfp; 1690 conn_t *connp; 1691 in_port_t lport; 1692 int af; 1693 boolean_t shared_addr; 1694 boolean_t unlabeled; 1695 const void *dst; 1696 1697 lport = ((uint16_t *)&ports)[1]; 1698 1699 unlabeled = B_FALSE; 1700 /* Cred can be null on IPv6 */ 1701 if (is_system_labeled()) { 1702 cred_t *cr = DB_CRED(mp); 1703 1704 unlabeled = (cr != NULL && 1705 crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; 1706 } 1707 shared_addr = (zoneid == ALL_ZONES); 1708 if (shared_addr) { 1709 zoneid = tsol_mlp_findzone(protocol, lport); 1710 /* 1711 * If no shared MLP is found, tsol_mlp_findzone returns 1712 * ALL_ZONES. In that case, we assume it's SLP, and search for 1713 * the zone based on the packet label. 1714 * 1715 * If there is such a zone, we prefer to find a connection in 1716 * it. Otherwise, we look for a MAC-exempt connection in any 1717 * zone whose label dominates the default label on the packet. 1718 */ 1719 if (zoneid == ALL_ZONES) 1720 zoneid = tsol_packet_to_zoneid(mp); 1721 else 1722 unlabeled = B_FALSE; 1723 } 1724 1725 af = IPH_HDR_VERSION(hdr); 1726 dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : 1727 (const void *)&((ip6_t *)hdr)->ip6_dst; 1728 connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))]; 1729 1730 mutex_enter(&connfp->connf_lock); 1731 for (connp = connfp->connf_head; connp != NULL; 1732 connp = connp->conn_next) { 1733 /* We don't allow v4 fallback for v6 raw socket. */ 1734 if (af == (connp->conn_af_isv6 ? IPV4_VERSION : 1735 IPV6_VERSION)) 1736 continue; 1737 if (connp->conn_fully_bound) { 1738 if (af == IPV4_VERSION) { 1739 if (!IPCL_CONN_MATCH(connp, protocol, 1740 hdr->ipha_src, hdr->ipha_dst, ports)) 1741 continue; 1742 } else { 1743 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1744 ((ip6_t *)hdr)->ip6_src, 1745 ((ip6_t *)hdr)->ip6_dst, ports)) 1746 continue; 1747 } 1748 } else { 1749 if (af == IPV4_VERSION) { 1750 if (!IPCL_BIND_MATCH(connp, protocol, 1751 hdr->ipha_dst, lport)) 1752 continue; 1753 } else { 1754 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1755 ((ip6_t *)hdr)->ip6_dst, lport)) 1756 continue; 1757 } 1758 } 1759 1760 if (IPCL_ZONE_MATCH(connp, zoneid) || 1761 (unlabeled && connp->conn_mac_exempt)) 1762 break; 1763 } 1764 /* 1765 * If the connection is fully-bound and connection-oriented (TCP or 1766 * SCTP), then we've already validated the remote system's label. 1767 * There's no need to do it again for every packet. 
1768 */ 1769 if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || 1770 !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && 1771 !tsol_receive_local(mp, dst, af, shared_addr, connp)) { 1772 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1773 char *, "connp(1) could not receive mp(2)", 1774 conn_t *, connp, mblk_t *, mp); 1775 connp = NULL; 1776 } 1777 1778 if (connp != NULL) 1779 goto found; 1780 mutex_exit(&connfp->connf_lock); 1781 1782 /* Try to look for a wildcard match. */ 1783 connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)]; 1784 mutex_enter(&connfp->connf_lock); 1785 for (connp = connfp->connf_head; connp != NULL; 1786 connp = connp->conn_next) { 1787 /* We don't allow v4 fallback for v6 raw socket. */ 1788 if ((af == (connp->conn_af_isv6 ? IPV4_VERSION : 1789 IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { 1790 continue; 1791 } 1792 if (af == IPV4_VERSION) { 1793 if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) 1794 break; 1795 } else { 1796 if (IPCL_RAW_MATCH_V6(connp, protocol, 1797 ((ip6_t *)hdr)->ip6_dst)) { 1798 break; 1799 } 1800 } 1801 } 1802 1803 if (connp != NULL) 1804 goto found; 1805 1806 mutex_exit(&connfp->connf_lock); 1807 return (NULL); 1808 1809 found: 1810 ASSERT(connp != NULL); 1811 CONN_INC_REF(connp); 1812 mutex_exit(&connfp->connf_lock); 1813 return (connp); 1814 } 1815 1816 /* ARGSUSED */ 1817 static int 1818 ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags) 1819 { 1820 itc_t *itc = (itc_t *)buf; 1821 conn_t *connp = &itc->itc_conn; 1822 tcp_t *tcp = &itc->itc_tcp; 1823 bzero(itc, sizeof (itc_t)); 1824 tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); 1825 connp->conn_tcp = tcp; 1826 connp->conn_flags = IPCL_TCPCONN; 1827 connp->conn_ulp = IPPROTO_TCP; 1828 tcp->tcp_connp = connp; 1829 return (0); 1830 } 1831 1832 /* ARGSUSED */ 1833 static void 1834 ipcl_tcpconn_destructor(void *buf, void *cdrarg) 1835 { 1836 tcp_timermp_free(((conn_t *)buf)->conn_tcp); 1837 } 1838 1839 /* 1840 * All conns are inserted in a global multi-list for the benefit of 1841 * walkers. The walk is guaranteed to walk all open conns at the time 1842 * of the start of the walk exactly once. This property is needed to 1843 * achieve some cleanups during unplumb of interfaces. This is achieved 1844 * as follows. 1845 * 1846 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 1847 * call the insert and delete functions below at creation and deletion 1848 * time respectively. The conn never moves or changes its position in this 1849 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 1850 * won't increase due to walkers, once the conn deletion has started. Note 1851 * that we can't remove the conn from the global list and then wait for 1852 * the refcnt to drop to zero, since walkers would then see a truncated 1853 * list. CONN_INCIPIENT ensures that walkers don't start looking at 1854 * conns until ip_open is ready to make them globally visible. 1855 * The global round robin multi-list locks are held only to get the 1856 * next member/insertion/deletion and contention should be negligible 1857 * if the multi-list is much greater than the number of cpus. 1858 */ 1859 void 1860 ipcl_globalhash_insert(conn_t *connp) 1861 { 1862 int index; 1863 1864 /* 1865 * No need for atomic here. Approximate even distribution 1866 * in the global lists is sufficient. 
/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers.  The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once.  This property is needed to
 * achieve some cleanups during unplumb of interfaces.  This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively.  The conn never moves or changes its position in this
 * multi-list during its lifetime.  CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started.  Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list.  CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The per-list locks of the global round-robin multi-list are held only
 * across getting the next member, insertion, and deletion, so contention
 * should be negligible if the number of lists is much greater than the
 * number of CPUs.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
    int index;

    /*
     * No need for atomic here.  Approximate even distribution
     * in the global lists is sufficient.
     */
    conn_g_index++;
    index = conn_g_index & (CONN_G_HASH_SIZE - 1);

    connp->conn_g_prev = NULL;
    /*
     * Mark as INCIPIENT, so that walkers will ignore this
     * for now, till ip_open is ready to make it visible globally.
     */
    connp->conn_state_flags |= CONN_INCIPIENT;

    /* Insert at the head of the list */
    mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
    connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
    if (connp->conn_g_next != NULL)
        connp->conn_g_next->conn_g_prev = connp;
    ipcl_globalhash_fanout[index].connf_head = connp;

    /* The fanout bucket this conn points to */
    connp->conn_g_fanout = &ipcl_globalhash_fanout[index];

    mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
}

void
ipcl_globalhash_remove(conn_t *connp)
{
    /*
     * We were never inserted in the global multi-list.
     * The IPCL_NONE variety is never inserted in the global multi-list
     * since it is presumed to not need any cleanup and is transient.
     */
    if (connp->conn_g_fanout == NULL)
        return;

    mutex_enter(&connp->conn_g_fanout->connf_lock);
    if (connp->conn_g_prev != NULL)
        connp->conn_g_prev->conn_g_next = connp->conn_g_next;
    else
        connp->conn_g_fanout->connf_head = connp->conn_g_next;
    if (connp->conn_g_next != NULL)
        connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
    mutex_exit(&connp->conn_g_fanout->connf_lock);

    /* Better to stumble on a null pointer than to corrupt memory */
    connp->conn_g_next = NULL;
    connp->conn_g_prev = NULL;
}

/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * IPCs (conn_t's) may hold pointers to ipif/ill.  To guard against stale
 * pointers, ipcl_walk() is called to clean up the conn_t's, typically when
 * an interface is unplumbed or removed.  New conn_t's that are created while
 * we are walking may be missed by this walk, because they are not necessarily
 * inserted at the tail of the list.  They are new conn_t's and thus don't
 * have any stale pointers.  The CONN_CLOSING flag ensures that no new
 * reference is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg)
{
    int i;
    conn_t *connp;
    conn_t *prev_connp;

    for (i = 0; i < CONN_G_HASH_SIZE; i++) {
        mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
        prev_connp = NULL;
        connp = ipcl_globalhash_fanout[i].connf_head;
        while (connp != NULL) {
            mutex_enter(&connp->conn_lock);
            if (connp->conn_state_flags &
                (CONN_CONDEMNED | CONN_INCIPIENT)) {
                mutex_exit(&connp->conn_lock);
                connp = connp->conn_g_next;
                continue;
            }
            CONN_INC_REF_LOCKED(connp);
            mutex_exit(&connp->conn_lock);
            mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
            (*func)(connp, arg);
            if (prev_connp != NULL)
                CONN_DEC_REF(prev_connp);
            mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
            prev_connp = connp;
            connp = connp->conn_g_next;
        }
        mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
        if (prev_connp != NULL)
            CONN_DEC_REF(prev_connp);
    }
}
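/*
 * Illustrative sketch of an ipcl_walk() consumer: a callback of pfv_t shape
 * that is invoked once per open conn, as described above, e.g. to clear
 * stale interface references during unplumb.  The callback and the
 * conn_uses_ill()/conn_clear_ill() helpers are hypothetical; real users
 * supply their own cleanup functions.
 */
#if 0
static void
example_ill_cleanup(conn_t *connp, caddr_t arg)
{
    ill_t *ill = (ill_t *)arg;

    /* The walker already holds a reference on connp for us. */
    mutex_enter(&connp->conn_lock);
    if (conn_uses_ill(connp, ill))      /* hypothetical predicate */
        conn_clear_ill(connp);          /* hypothetical cleanup */
    mutex_exit(&connp->conn_lock);
}

/* Hypothetical invocation: ipcl_walk(example_ill_cleanup, (caddr_t)ill); */
#endif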
/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
{
    uint32_t ports;
    uint16_t *pports = (uint16_t *)&ports;
    connf_t *connfp;
    conn_t *tconnp;
    boolean_t zone_chk;

    /*
     * If either the source or destination address is loopback, then
     * both endpoints must be in the same Zone.  Otherwise, both of
     * the addresses are system-wide unique (tcp is in ESTABLISHED
     * state) and the endpoints may reside in different Zones.
     */
    zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
        ipha->ipha_dst == htonl(INADDR_LOOPBACK));

    bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
    bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

    connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];

    mutex_enter(&connfp->connf_lock);
    for (tconnp = connfp->connf_head; tconnp != NULL;
        tconnp = tconnp->conn_next) {

        if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
            ipha->ipha_dst, ipha->ipha_src, ports) &&
            tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
            (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

            ASSERT(tconnp != connp);
            CONN_INC_REF(tconnp);
            mutex_exit(&connfp->connf_lock);
            return (tconnp);
        }
    }
    mutex_exit(&connfp->connf_lock);
    return (NULL);
}

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
{
    uint32_t ports;
    uint16_t *pports = (uint16_t *)&ports;
    connf_t *connfp;
    conn_t *tconnp;
    boolean_t zone_chk;

    /*
     * If either the source or destination address is loopback, then
     * both endpoints must be in the same Zone.  Otherwise, both of
     * the addresses are system-wide unique (tcp is in ESTABLISHED
     * state) and the endpoints may reside in different Zones.  We
     * don't do a Zone check for link-local addresses because the
     * current Zone implementation treats each link-local address as
     * being unique per system node, i.e. they belong to the global Zone.
     */
    zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
        IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

    bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
    bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

    connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];

    mutex_enter(&connfp->connf_lock);
    for (tconnp = connfp->connf_head; tconnp != NULL;
        tconnp = tconnp->conn_next) {

        /* We skip tcp_bound_if check here as this is loopback tcp */
        if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
            ip6h->ip6_dst, ip6h->ip6_src, ports) &&
            tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
            (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

            ASSERT(tconnp != connp);
            CONN_INC_REF(tconnp);
            mutex_exit(&connfp->connf_lock);
            return (tconnp);
        }
    }
    mutex_exit(&connfp->connf_lock);
    return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held.  Caller must call CONN_DEC_REF.
 * Only checks for connected entries, i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
{
    uint32_t ports;
    uint16_t *pports;
    connf_t *connfp;
    conn_t *tconnp;

    pports = (uint16_t *)&ports;
    bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
    bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));

    connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];

    mutex_enter(&connfp->connf_lock);
    for (tconnp = connfp->connf_head; tconnp != NULL;
        tconnp = tconnp->conn_next) {

        if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
            ipha->ipha_dst, ipha->ipha_src, ports) &&
            tconnp->conn_tcp->tcp_state >= min_state) {

            CONN_INC_REF(tconnp);
            mutex_exit(&connfp->connf_lock);
            return (tconnp);
        }
    }
    mutex_exit(&connfp->connf_lock);
    return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held.  Caller must call CONN_DEC_REF.
 * Only checks for connected entries, i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex)
{
    tcp_t *tcp;
    uint32_t ports;
    uint16_t *pports;
    connf_t *connfp;
    conn_t *tconnp;

    pports = (uint16_t *)&ports;
    pports[0] = tcpha->tha_fport;
    pports[1] = tcpha->tha_lport;

    connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];

    mutex_enter(&connfp->connf_lock);
    for (tconnp = connfp->connf_head; tconnp != NULL;
        tconnp = tconnp->conn_next) {

        tcp = tconnp->conn_tcp;
        if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
            ip6h->ip6_dst, ip6h->ip6_src, ports) &&
            tcp->tcp_state >= min_state &&
            (tcp->tcp_bound_if == 0 ||
            tcp->tcp_bound_if == ifindex)) {

            CONN_INC_REF(tconnp);
            mutex_exit(&connfp->connf_lock);
            return (tconnp);
        }
    }
    mutex_exit(&connfp->connf_lock);
    return (NULL);
}
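/*
 * Illustrative sketch (hypothetical caller): a bounced IPv4 TCP segment still
 * carries the original header, so its conn can be recovered with the reverse
 * lookup above and the error handed to the transport.  The
 * example_deliver_error() helper is hypothetical;
 * ipcl_tcp_lookup_reversed_ipv4() and CONN_DEC_REF() are the real interfaces.
 */
#if 0
static void
example_handle_bounced_segment(ipha_t *ipha, tcph_t *tcph, mblk_t *mp)
{
    conn_t *tconnp;

    /* Require at least SYN_SENT so unconnected state is not matched. */
    tconnp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_SYN_SENT);
    if (tconnp == NULL) {
        freemsg(mp);
        return;
    }
    example_deliver_error(tconnp, mp);  /* hypothetical consumer */
    CONN_DEC_REF(tconnp);               /* drop lookup reference */
}
#endif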
/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
{
    connf_t *bind_connfp;
    conn_t *connp;
    tcp_t *tcp;

    /*
     * Avoid false matches for packets sent to an IP destination of
     * all zeros.
     */
    if (laddr == 0)
        return (NULL);

    ASSERT(zoneid != ALL_ZONES);

    bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
    mutex_enter(&bind_connfp->connf_lock);
    for (connp = bind_connfp->connf_head; connp != NULL;
        connp = connp->conn_next) {
        tcp = connp->conn_tcp;
        if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
            IPCL_ZONE_MATCH(connp, zoneid) &&
            (tcp->tcp_listener == NULL)) {
            CONN_INC_REF(connp);
            mutex_exit(&bind_connfp->connf_lock);
            return (connp);
        }
    }
    mutex_exit(&bind_connfp->connf_lock);
    return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid)
{
    connf_t *bind_connfp;
    conn_t *connp = NULL;
    tcp_t *tcp;

    /*
     * Avoid false matches for packets sent to an IP destination of
     * all zeros.
     */
    if (IN6_IS_ADDR_UNSPECIFIED(laddr))
        return (NULL);

    ASSERT(zoneid != ALL_ZONES);

    bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
    mutex_enter(&bind_connfp->connf_lock);
    for (connp = bind_connfp->connf_head; connp != NULL;
        connp = connp->conn_next) {
        tcp = connp->conn_tcp;
        if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
            IPCL_ZONE_MATCH(connp, zoneid) &&
            (tcp->tcp_bound_if == 0 ||
            tcp->tcp_bound_if == ifindex) &&
            tcp->tcp_listener == NULL) {
            CONN_INC_REF(connp);
            mutex_exit(&bind_connfp->connf_lock);
            return (connp);
        }
    }
    mutex_exit(&bind_connfp->connf_lock);
    return (NULL);
}

/*
 * ipcl_get_next_conn
 *	Get the next entry in the conn global list and put a reference on
 *	the next conn.  Decrement the reference on the current conn.
 *
 * This is an iterator-based walker function that also provides for
 * some selection by the caller.  It walks through the conn global-hash
 * bucket searching for the next valid connp in the list, and selects
 * connections that are neither closed nor condemned.  It also REFHOLDS
 * the conn, thus ensuring that the conn exists when the caller uses it.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
    conn_t *next_connp;

    if (connfp == NULL)
        return (NULL);

    mutex_enter(&connfp->connf_lock);

    next_connp = (connp == NULL) ?
        connfp->connf_head : connp->conn_g_next;

    while (next_connp != NULL) {
        mutex_enter(&next_connp->conn_lock);
        if (!(next_connp->conn_flags & conn_flags) ||
            (next_connp->conn_state_flags &
            (CONN_CONDEMNED | CONN_INCIPIENT))) {
            /*
             * This conn has been condemned or
             * is closing, or the flags don't match
             */
            mutex_exit(&next_connp->conn_lock);
            next_connp = next_connp->conn_g_next;
            continue;
        }
        CONN_INC_REF_LOCKED(next_connp);
        mutex_exit(&next_connp->conn_lock);
        break;
    }

    mutex_exit(&connfp->connf_lock);

    if (connp != NULL)
        CONN_DEC_REF(connp);

    return (next_connp);
}

#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */
int
conn_trace_ref(conn_t *connp)
{
    int last;
    conn_trace_t *ctb;

    ASSERT(MUTEX_HELD(&connp->conn_lock));
    last = connp->conn_trace_last;
    last++;
    if (last == CONN_TRACE_MAX)
        last = 0;

    ctb = &connp->conn_trace_buf[last];
    ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
    connp->conn_trace_last = last;
    return (1);
}

int
conn_untrace_ref(conn_t *connp)
{
    int last;
    conn_trace_t *ctb;

    ASSERT(MUTEX_HELD(&connp->conn_lock));
    last = connp->conn_trace_last;
    last++;
    if (last == CONN_TRACE_MAX)
        last = 0;

    ctb = &connp->conn_trace_buf[last];
    ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
    connp->conn_trace_last = last;
    return (1);
}
#endif
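/*
 * Illustrative sketch of driving ipcl_get_next_conn() above: walk one
 * global-hash bucket, visiting only TCP conns.  The iterator both holds the
 * next conn and releases the previous one, so a loop that runs to completion
 * leaves nothing held.  The example_inspect_conn() helper is hypothetical.
 */
#if 0
static void
example_walk_bucket(connf_t *connfp)
{
    conn_t *connp = NULL;

    while ((connp = ipcl_get_next_conn(connfp, connp,
        IPCL_TCPCONN)) != NULL) {
        example_inspect_conn(connp);    /* hypothetical consumer */
    }
}
#endif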