1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/kmem.h> 28 #include <sys/ksynch.h> 29 #include <sys/systm.h> 30 #include <sys/socket.h> 31 #include <sys/disp.h> 32 #include <sys/taskq.h> 33 #include <sys/cmn_err.h> 34 #include <sys/strsun.h> 35 #include <sys/sdt.h> 36 #include <sys/atomic.h> 37 #include <netinet/in.h> 38 #include <inet/ip.h> 39 #include <inet/ip6.h> 40 #include <inet/tcp.h> 41 #include <inet/udp_impl.h> 42 #include <inet/kstatcom.h> 43 44 #include <inet/ilb_ip.h> 45 #include "ilb_alg.h" 46 #include "ilb_nat.h" 47 #include "ilb_conn.h" 48 49 /* ILB kmem cache flag */ 50 int ilb_kmem_flags = 0; 51 52 /* 53 * The default size for the different hash tables. Global for all stacks. 54 * But each stack has its own table, just that their sizes are the same. 55 */ 56 static size_t ilb_rule_hash_size = 2048; 57 58 static size_t ilb_conn_hash_size = 262144; 59 60 static size_t ilb_sticky_hash_size = 262144; 61 62 /* This should be a prime number. 
*/ 63 static size_t ilb_nat_src_hash_size = 97; 64 65 /* Default NAT cache entry expiry time. */ 66 static uint32_t ilb_conn_tcp_expiry = 120; 67 static uint32_t ilb_conn_udp_expiry = 60; 68 69 /* Default sticky entry expiry time. */ 70 static uint32_t ilb_sticky_expiry = 60; 71 72 /* addr is assumed to be a uint8_t * to an ipaddr_t. */ 73 #define ILB_RULE_HASH(addr, hash_size) \ 74 ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \ 75 *(addr)) & ((hash_size) - 1)) 76 77 /* 78 * Note on ILB delayed processing 79 * 80 * To avoid in line removal on some of the data structures, such as rules, 81 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq. 82 * There are three types of ILB taskq: 83 * 84 * 1. rule handling: created at stack initialialization time, ilb_stack_init() 85 * 2. conn hash handling: created at conn hash initialization time, 86 * ilb_conn_hash_init() 87 * 3. sticky hash handling: created at sticky hash initialization time, 88 * ilb_sticky_hash_init() 89 * 90 * The rule taskq is for processing rule and server removal. When a user 91 * land rule/server removal request comes in, a taskq is dispatched after 92 * removing the rule/server from all related hashes. This taskq will wait 93 * until all references to the rule/server are gone before removing it. 94 * So the user land thread requesting the removal does not need to wait 95 * for the removal completion. 96 * 97 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and 98 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers 99 * and ilb_sticky_timer_size timers running for ilb_conn_hash and 100 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one 101 * portion (same size) of the hash table. When a timer fires, it dispatches 102 * a conn hash taskq to clean up its portion of the table. This avoids in 103 * line processing of the removal. 
104 * 105 * There is another delayed processing, the clean up of NAT source address 106 * table. We just use the timer to directly handle it instead of using 107 * a taskq. The reason is that the table is small so it is OK to use the 108 * timer. 109 */ 110 111 /* ILB rule taskq constants. */ 112 #define ILB_RULE_TASKQ_NUM_THR 20 113 114 /* Argument passed to ILB rule taskq routines. */ 115 typedef struct { 116 ilb_stack_t *ilbs; 117 ilb_rule_t *rule; 118 } ilb_rule_tq_t; 119 120 /* kstat handling routines. */ 121 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *); 122 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *); 123 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *); 124 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *, 125 ilb_server_t *); 126 127 /* Rule hash handling routines. */ 128 static void ilb_rule_hash_init(ilb_stack_t *); 129 static void ilb_rule_hash_fini(ilb_stack_t *); 130 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *); 131 static void ilb_rule_hash_del(ilb_rule_t *); 132 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *, 133 in_port_t, zoneid_t, uint32_t, boolean_t *); 134 135 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *); 136 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *); 137 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *); 138 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *, 139 int *); 140 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int, 141 int, in_port_t, in_port_t, const in6_addr_t *); 142 143 /* Back end server handling routines. */ 144 static void ilb_server_free(ilb_server_t *); 145 146 /* Network stack handling routines. */ 147 static void *ilb_stack_init(netstackid_t, netstack_t *); 148 static void ilb_stack_shutdown(netstackid_t, void *); 149 static void ilb_stack_fini(netstackid_t, void *); 150 151 /* Sticky connection handling routines. 
*/
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))

/*
 * Global kstat instance counter.  When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"

/*
 * Create and install the per-stack global ILB kstat.  Returns the kstat
 * pointer, or NULL on failure.  The kstat is virtual: ks_data points at
 * the counters embedded in the ilb_stack_t itself.
 */
static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ksp;
	ilb_g_kstat_t template = {
		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
	    KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);
	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
	ksp->ks_data = ilbs->ilbs_kstat;
	/* Stash the stack ID so the deletion path can sanity check it. */
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

/* Remove the per-stack global ILB kstat installed by ilb_kstat_g_init(). */
static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)
		    ilbs->ilbs_ksp->ks_private);
		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
		ilbs->ilbs_ksp = NULL;
	}
}

/*
 * Create and install the per-rule kstat.  ks_data points at the counters
 * embedded in the rule (virtual kstat).  Returns NULL on failure.
 */
static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
	kstat_t *ksp;
	ilb_rule_kstat_t template = {
		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &rule->ir_kstat, sizeof (template));
	ksp->ks_data = &rule->ir_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

/*
 * Create and install the kstat of a back end server.  The kstat class name
 * is "<rule name>-sstat" so a server's stats can be tied to its rule.
 * Returns NULL on failure.
 */
static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
	kstat_t *ksp;
	ilb_server_kstat_t template = {
		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
		{ "ip_address",		KSTAT_DATA_STRING, 0 }
	};
	char cname_buf[KSTAT_STRLEN];

	/* 7 is "-sstat" */
	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &server->iser_kstat, sizeof (template));
	ksp->ks_data = &server->iser_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_named_setstr(&server->iser_kstat.ip_address,
	    server->iser_ip_addr);
	/* We never change the IP address */
	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

	kstat_install(ksp);
	return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
	int i;

	/*
	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
	 * the next power of 2.  ILB_RULE_HASH() masks with (size - 1) and
	 * so requires a power-of-2 table size.
	 */
	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_rule_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_rule_hash_size = 1 << i;
	}
	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/*
 * Clean up the rule hash table.
 *
 * NOTE(review): the per-bucket ilb_hash_lock mutexes are not
 * mutex_destroy()ed before the table is freed — confirm this is
 * intentional for adaptive mutexes on this platform.
 */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_g_hash == NULL)
		return;
	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table, keyed by the rule's VIP. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
	int i;

	/* Hash on the low 32 bits; for a v4 VIP this is the mapped address. */
	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	/* Insert at the head of the doubly linked bucket chain. */
	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
	rule->ir_hash_prev = NULL;
	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

	/* Remember the bucket so removal does not need to re-hash. */
	rule->ir_hash = &ilbs->ilbs_g_hash[i];
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table.
Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	mutex_enter(&rule->ir_hash->ilb_hash_lock);
	if (rule->ir_hash->ilb_hash_rule == rule) {
		/* Rule is at the head of its bucket chain. */
		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		/* Unlink from the middle/end of the chain. */
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next =
			    rule->ir_hash_next;
		if (rule->ir_hash_next != NULL) {
			rule->ir_hash_next->ir_hash_prev =
			    rule->ir_hash_prev;
		}
	}
	mutex_exit(&rule->ir_hash->ilb_hash_lock);

	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 *
 * On a match the rule's refcnt is bumped and the rule is returned; the
 * caller must ILB_RULE_REFRELE() it.  If the matching rule is disabled
 * or busy, NULL is returned (busy also sets *busy to B_TRUE) and only
 * the stats are updated.  The port argument is in network byte order.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		/* Single port match vs. port range match. */
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		/* An unspecified rule target matches any destination. */
		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing.  But this requires a
			 * mechanism in IP to re-start the packet
			 * processing.  So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			/*
			 * Hold the rule for the caller.  Refcnt cannot be 1
			 * here: the creation reference is still outstanding.
			 */
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}

/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock.
*/ 455 static void 456 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule) 457 { 458 ilb_rule_t *tmp_rule; 459 ilb_rule_t *prev_rule; 460 461 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 462 prev_rule = NULL; 463 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 464 prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) { 465 if (tmp_rule == rule) 466 break; 467 } 468 if (tmp_rule == NULL) { 469 mutex_exit(&ilbs->ilbs_g_lock); 470 return; 471 } 472 if (prev_rule == NULL) 473 ilbs->ilbs_rule_head = tmp_rule->ir_next; 474 else 475 prev_rule->ir_next = tmp_rule->ir_next; 476 ILB_KSTAT_UPDATE(ilbs, num_rules, -1); 477 } 478 479 /* 480 * Helper routine to calculate how many source addresses are in a given 481 * range. 482 */ 483 static int64_t 484 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2) 485 { 486 int64_t ret; 487 uint32_t addr1, addr2; 488 489 /* 490 * Here we assume that the max number of NAT source cannot be 491 * large such that the most significant 2 s6_addr32 must be 492 * equal. 493 */ 494 addr1 = ntohl(a1->s6_addr32[3]); 495 addr2 = ntohl(a2->s6_addr32[3]); 496 if (a1->s6_addr32[0] != a2->s6_addr32[0] || 497 a1->s6_addr32[1] != a2->s6_addr32[1] || 498 a1->s6_addr32[2] > a2->s6_addr32[2] || 499 (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) { 500 return (-1); 501 } 502 if (a1->s6_addr32[2] == a2->s6_addr32[2]) { 503 return (addr2 - addr1 + 1); 504 } else { 505 ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2])); 506 ret <<= 32; 507 ret = ret + addr1 - addr2; 508 return (ret + 1); 509 } 510 } 511 512 /* 513 * Add an ILB rule. 514 */ 515 int 516 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd) 517 { 518 ilb_rule_t *rule; 519 netstackid_t stackid; 520 int ret; 521 in_port_t min_port, max_port; 522 int64_t num_src; 523 524 /* Sanity checks. */ 525 if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6) 526 return (EINVAL); 527 528 /* Need to support SCTP... 
*/ 529 if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP) 530 return (EINVAL); 531 532 /* For full NAT, the NAT source must be supplied. */ 533 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 534 if (IS_ADDR_UNSPEC(&cmd->nat_src_start) || 535 IS_ADDR_UNSPEC(&cmd->nat_src_end)) { 536 return (EINVAL); 537 } 538 } 539 540 /* Check invalid mask */ 541 if ((cmd->flags & ILB_RULE_STICKY) && 542 IS_ADDR_UNSPEC(&cmd->sticky_mask)) { 543 return (EINVAL); 544 } 545 546 /* Port is passed in network byte order. */ 547 min_port = ntohs(cmd->min_port); 548 max_port = ntohs(cmd->max_port); 549 if (min_port > max_port) 550 return (EINVAL); 551 552 /* min_port == 0 means "all ports". Make it so */ 553 if (min_port == 0) { 554 min_port = 1; 555 max_port = 65535; 556 } 557 558 /* Funny address checking. */ 559 if (cmd->ip_ver == IPPROTO_IP) { 560 in_addr_t v4_addr1, v4_addr2; 561 562 v4_addr1 = cmd->vip.s6_addr32[3]; 563 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 564 CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST || 565 v4_addr1 == INADDR_ANY || 566 !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 567 return (EINVAL); 568 } 569 570 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 571 v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]); 572 v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]); 573 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 574 (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET || 575 v4_addr1 == INADDR_BROADCAST || 576 v4_addr2 == INADDR_BROADCAST || 577 v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY || 578 CLASSD(v4_addr1) || CLASSD(v4_addr2) || 579 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 580 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 581 return (EINVAL); 582 } 583 584 num_src = v4_addr2 - v4_addr1 + 1; 585 if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC) 586 return (EINVAL); 587 } 588 } else { 589 if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) || 590 IN6_IS_ADDR_MULTICAST(&cmd->vip) || 591 IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) || 592 IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 593 return (EINVAL); 594 
} 595 596 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 597 if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) || 598 IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) || 599 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) || 600 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) || 601 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) || 602 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) || 603 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 604 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 605 return (EINVAL); 606 } 607 608 if ((num_src = num_nat_src_v6(&cmd->nat_src_start, 609 &cmd->nat_src_end)) < 0 || 610 num_src > ILB_MAX_NAT_SRC) { 611 return (EINVAL); 612 } 613 } 614 } 615 616 mutex_enter(&ilbs->ilbs_g_lock); 617 if (ilbs->ilbs_g_hash == NULL) 618 ilb_rule_hash_init(ilbs); 619 if (ilbs->ilbs_c2s_conn_hash == NULL) { 620 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 621 ilb_conn_hash_init(ilbs); 622 ilb_nat_src_init(ilbs); 623 } 624 625 /* Make sure that the new rule does not duplicate an existing one. */ 626 if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto, 627 min_port, max_port, &cmd->vip)) { 628 mutex_exit(&ilbs->ilbs_g_lock); 629 return (EEXIST); 630 } 631 632 rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP); 633 if (rule == NULL) { 634 mutex_exit(&ilbs->ilbs_g_lock); 635 return (ENOMEM); 636 } 637 638 /* ir_name is all 0 to begin with */ 639 (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1); 640 641 rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance); 642 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 643 if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) { 644 ret = ENOMEM; 645 goto error; 646 } 647 648 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 649 rule->ir_nat_src_start = cmd->nat_src_start; 650 rule->ir_nat_src_end = cmd->nat_src_end; 651 } 652 653 rule->ir_ipver = cmd->ip_ver; 654 rule->ir_proto = cmd->proto; 655 rule->ir_topo = cmd->topo; 656 657 rule->ir_min_port = min_port; 658 rule->ir_max_port = max_port; 659 if (rule->ir_min_port 
!= rule->ir_max_port) 660 rule->ir_port_range = B_TRUE; 661 else 662 rule->ir_port_range = B_FALSE; 663 664 rule->ir_zoneid = zoneid; 665 666 rule->ir_target_v6 = cmd->vip; 667 rule->ir_servers = NULL; 668 669 /* 670 * The default connection drain timeout is indefinite (value 0), 671 * meaning we will wait for all connections to finish. So we 672 * can assign cmd->conn_drain_timeout to it directly. 673 */ 674 rule->ir_conn_drain_timeout = cmd->conn_drain_timeout; 675 if (cmd->nat_expiry != 0) { 676 rule->ir_nat_expiry = cmd->nat_expiry; 677 } else { 678 switch (rule->ir_proto) { 679 case IPPROTO_TCP: 680 rule->ir_nat_expiry = ilb_conn_tcp_expiry; 681 break; 682 case IPPROTO_UDP: 683 rule->ir_nat_expiry = ilb_conn_udp_expiry; 684 break; 685 default: 686 cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p", 687 (void *)rule); 688 break; 689 } 690 } 691 if (cmd->sticky_expiry != 0) 692 rule->ir_sticky_expiry = cmd->sticky_expiry; 693 else 694 rule->ir_sticky_expiry = ilb_sticky_expiry; 695 696 if (cmd->flags & ILB_RULE_STICKY) { 697 rule->ir_flags |= ILB_RULE_STICKY; 698 rule->ir_sticky_mask = cmd->sticky_mask; 699 if (ilbs->ilbs_sticky_hash == NULL) 700 ilb_sticky_hash_init(ilbs); 701 } 702 if (cmd->flags & ILB_RULE_ENABLED) 703 rule->ir_flags |= ILB_RULE_ENABLED; 704 705 mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL); 706 cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL); 707 708 rule->ir_refcnt = 1; 709 710 switch (cmd->algo) { 711 case ILB_ALG_IMPL_ROUNDROBIN: 712 if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) { 713 ret = ENOMEM; 714 goto error; 715 } 716 rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN; 717 break; 718 case ILB_ALG_IMPL_HASH_IP: 719 case ILB_ALG_IMPL_HASH_IP_SPORT: 720 case ILB_ALG_IMPL_HASH_IP_VIP: 721 if ((rule->ir_alg = ilb_alg_hash_init(rule, 722 &cmd->algo)) == NULL) { 723 ret = ENOMEM; 724 goto error; 725 } 726 rule->ir_alg_type = cmd->algo; 727 break; 728 default: 729 ret = EINVAL; 730 goto error; 731 } 732 733 /* Add it to the 
global list and hash array at the end. */ 734 ilb_rule_g_add(ilbs, rule); 735 ilb_rule_hash_add(ilbs, rule, &cmd->vip); 736 737 mutex_exit(&ilbs->ilbs_g_lock); 738 739 return (0); 740 741 error: 742 mutex_exit(&ilbs->ilbs_g_lock); 743 if (rule->ir_ksp != NULL) { 744 /* stackid must be initialized if ir_ksp != NULL */ 745 kstat_delete_netstack(rule->ir_ksp, stackid); 746 } 747 kmem_free(rule, sizeof (ilb_rule_t)); 748 return (ret); 749 } 750 751 /* 752 * The final part in deleting a rule. Either called directly or by the 753 * taskq dispatched. 754 */ 755 static void 756 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule) 757 { 758 netstackid_t stackid; 759 ilb_server_t *server; 760 761 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 762 763 /* 764 * Let the algorithm know that the rule is going away. The 765 * algorithm fini routine will free all its resources with this 766 * rule. 767 */ 768 tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg); 769 770 while ((server = tmp_rule->ir_servers) != NULL) { 771 mutex_enter(&server->iser_lock); 772 ilb_destroy_nat_src(&server->iser_nat_src); 773 if (tmp_rule->ir_conn_drain_timeout != 0) { 774 /* 775 * The garbage collection thread checks this value 776 * without grabing a lock. So we need to use 777 * atomic_swap_64() to make sure that the value seen 778 * by gc thread is intact. 779 */ 780 (void) atomic_swap_64( 781 (uint64_t *)&server->iser_die_time, 782 ddi_get_lbolt64() + 783 SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout)); 784 } 785 while (server->iser_refcnt > 1) 786 cv_wait(&server->iser_cv, &server->iser_lock); 787 tmp_rule->ir_servers = server->iser_next; 788 kstat_delete_netstack(server->iser_ksp, stackid); 789 kmem_free(server, sizeof (ilb_server_t)); 790 } 791 792 ASSERT(tmp_rule->ir_ksp != NULL); 793 kstat_delete_netstack(tmp_rule->ir_ksp, stackid); 794 795 kmem_free(tmp_rule, sizeof (ilb_rule_t)); 796 } 797 798 /* The routine executed by the delayed rule taskq. 
*/
static void
ilb_rule_del_tq(void *arg)
{
	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

	/*
	 * Wait until all references to the rule are gone (only the
	 * creation reference remains), then finish the deletion.
	 */
	mutex_enter(&rule->ir_lock);
	while (rule->ir_refcnt > 1)
		cv_wait(&rule->ir_cv, &rule->ir_lock);
	ilb_rule_del_common(ilbs, rule);
	kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/*
 * Routine to delete a rule.  The rule is unlinked from the hash table and
 * the global list immediately; the actual destruction (which may block on
 * outstanding references) is deferred to the rule taskq so the caller does
 * not have to wait.  Returns 0 or an errno from the rule lookup.
 */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
	    &err)) == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * First remove the rule from the hash array and the global list so
	 * that no one can find this rule any more.
	 */
	ilb_rule_hash_del(tmp_rule);
	ilb_rule_g_del(ilbs, tmp_rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	/* Drop the reference taken by ilb_find_rule_locked(). */
	ILB_RULE_REFRELE(tmp_rule);

	/*
	 * Now no one can find this rule, we can remove it once all
	 * references to it are dropped and all references to the list
	 * of servers are dropped.  So dispatch a task to finish the deletion.
	 * We do this instead of letting the last one referencing the
	 * rule do it.  The reason is that the last one may be the
	 * interrupt thread.  We want to minimize the work it needs to
	 * do.  Rule deletion is not a critical task so it can be delayed.
	 */
	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = tmp_rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);

	return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
 * If ret_rule is non-NULL, a reference is taken on the matching rule
 * and it is returned to the caller (who must ILB_RULE_REFRELE() it).
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
			mutex_enter(&rule->ir_lock);
			/* A busy rule cannot be used; treat as no match. */
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

/* IPv4 flavor of ilb_rule_match_vip_v6(); same reference semantics. */
boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		/* v4 VIPs are stored mapped; compare the low word only. */
		if (rule->ir_target_v6.s6_addr32[3] == addr) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

/*
 * Find a rule by zone and (case-insensitive) name; caller must hold
 * ilbs_g_lock.  On success a reference is taken on the rule.  On failure
 * NULL is returned and *err is set to EINPROGRESS (rule busy) or ENOENT.
 */
static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;
		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
			mutex_enter(&tmp_rule->ir_lock);
			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&tmp_rule->ir_lock);
				*err = EINPROGRESS;
				return (NULL);
			}
			tmp_rule->ir_refcnt++;
			mutex_exit(&tmp_rule->ir_lock);
			*err = 0;
			return (tmp_rule);
		}
	}
	*err = ENOENT;
	return (NULL);
}

/* To find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (tmp_rule);
}

/*
 * Try to match the given packet info and zone ID with a rule.  Used by
 * ilb_rule_add() to reject duplicates; caller holds ilbs_g_lock.
 */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		/*
		 * We don't allow the same name in different rules even if all
		 * the other rule components are different.
		 */
		if (strcasecmp(tmp_rule->ir_name, name) == 0)
			return (B_TRUE);

		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
			continue;

		/*
		 * ir_min_port and ir_max_port are the same if ir_port_range
		 * is false.  In this case, if the ir_min|max_port (same) is
		 * outside of the given port range, it is OK.  In other cases,
		 * check if min and max port are outside a rule's range.
		 */
		if (tmp_rule->ir_max_port < min_port ||
		    tmp_rule->ir_min_port > max_port) {
			continue;
		}

		/*
		 * If l3 is IPv4, the addr passed in is assumed to be
		 * mapped address.
		 */
		if (V6_OR_V4_INADDR_ANY(*addr) ||
		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * Enable a rule.  Exactly one of rule_name (lookup by name in the given
 * zone) or in_rule (rule already held by the caller) must be supplied.
 */
int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags |= ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/*
	 * Only refrele if we looked the rule up ourselves; a rule passed
	 * in by the caller is the caller's reference.
	 */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

/* Disable a rule; same lookup/reference semantics as ilb_rule_enable(). */
int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/*
	 * Only refrele if we looked the rule up ourselves; a rule passed
	 * in by the caller is the caller's reference.
	 */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules.  For
 * now, just add a simple loop for enable/disable/del.
1066 */ 1067 void 1068 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1069 { 1070 ilb_rule_t *rule; 1071 1072 mutex_enter(&ilbs->ilbs_g_lock); 1073 for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) { 1074 if (rule->ir_zoneid != zoneid) 1075 continue; 1076 /* 1077 * No need to hold the rule as we are holding the global 1078 * lock so it won't go away. Ignore the return value here 1079 * as the rule is provided so the call cannot fail. 1080 */ 1081 (void) ilb_rule_enable(ilbs, zoneid, NULL, rule); 1082 } 1083 mutex_exit(&ilbs->ilbs_g_lock); 1084 } 1085 1086 void 1087 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1088 { 1089 ilb_rule_t *rule; 1090 1091 mutex_enter(&ilbs->ilbs_g_lock); 1092 for (rule = ilbs->ilbs_rule_head; rule != NULL; 1093 rule = rule->ir_next) { 1094 if (rule->ir_zoneid != zoneid) 1095 continue; 1096 (void) ilb_rule_disable(ilbs, zoneid, NULL, rule); 1097 } 1098 mutex_exit(&ilbs->ilbs_g_lock); 1099 } 1100 1101 void 1102 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1103 { 1104 ilb_rule_t *rule; 1105 ilb_rule_tq_t *arg; 1106 1107 mutex_enter(&ilbs->ilbs_g_lock); 1108 while ((rule = ilbs->ilbs_rule_head) != NULL) { 1109 if (rule->ir_zoneid != zoneid) 1110 continue; 1111 ilb_rule_hash_del(rule); 1112 ilb_rule_g_del(ilbs, rule); 1113 mutex_exit(&ilbs->ilbs_g_lock); 1114 1115 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); 1116 arg->ilbs = ilbs; 1117 arg->rule = rule; 1118 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, 1119 arg, TQ_SLEEP); 1120 1121 mutex_enter(&ilbs->ilbs_g_lock); 1122 } 1123 mutex_exit(&ilbs->ilbs_g_lock); 1124 } 1125 1126 /* 1127 * This is just an optimization, so don't grab the global lock. The 1128 * worst case is that we missed a couple packets. 
1129 */ 1130 boolean_t 1131 ilb_has_rules(ilb_stack_t *ilbs) 1132 { 1133 return (ilbs->ilbs_rule_head != NULL); 1134 } 1135 1136 1137 static int 1138 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, 1139 ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable) 1140 { 1141 ilb_server_t *tmp_server; 1142 int ret; 1143 1144 ASSERT((rule == NULL && rule_name != NULL) || 1145 (rule != NULL && rule_name == NULL)); 1146 1147 if (rule == NULL) { 1148 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1149 &ret)) == NULL) { 1150 return (ret); 1151 } 1152 } 1153 1154 /* Once we get a hold on the rule, no server can be added/deleted. */ 1155 for (tmp_server = rule->ir_servers; tmp_server != NULL; 1156 tmp_server = tmp_server->iser_next) { 1157 if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr)) 1158 break; 1159 } 1160 if (tmp_server == NULL) { 1161 ret = ENOENT; 1162 goto done; 1163 } 1164 1165 if (enable) { 1166 ret = rule->ir_alg->ilb_alg_server_enable(tmp_server, 1167 rule->ir_alg->ilb_alg_data); 1168 if (ret == 0) { 1169 tmp_server->iser_enabled = B_TRUE; 1170 tmp_server->iser_die_time = 0; 1171 } 1172 } else { 1173 ret = rule->ir_alg->ilb_alg_server_disable(tmp_server, 1174 rule->ir_alg->ilb_alg_data); 1175 if (ret == 0) { 1176 tmp_server->iser_enabled = B_FALSE; 1177 if (rule->ir_conn_drain_timeout != 0) { 1178 (void) atomic_swap_64( 1179 (uint64_t *)&tmp_server->iser_die_time, 1180 ddi_get_lbolt64() + SEC_TO_TICK( 1181 rule->ir_conn_drain_timeout)); 1182 } 1183 } 1184 } 1185 1186 done: 1187 if (rule_name != NULL) 1188 ILB_RULE_REFRELE(rule); 1189 return (ret); 1190 } 1191 int 1192 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1193 ilb_rule_t *rule, in6_addr_t *addr) 1194 { 1195 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE)); 1196 } 1197 1198 int 1199 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1200 ilb_rule_t *rule, in6_addr_t *addr) 1201 { 1202 return 
(ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE)); 1203 } 1204 1205 /* 1206 * Add a back end server to a rule. If the address is IPv4, it is assumed 1207 * to be passed in as a mapped address. 1208 */ 1209 int 1210 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info) 1211 { 1212 ilb_server_t *server; 1213 netstackid_t stackid; 1214 int ret = 0; 1215 in_port_t min_port, max_port; 1216 in_port_t range; 1217 1218 /* Port is passed in network byte order. */ 1219 min_port = ntohs(info->min_port); 1220 max_port = ntohs(info->max_port); 1221 if (min_port > max_port) 1222 return (EINVAL); 1223 1224 /* min_port == 0 means "all ports". Make it so */ 1225 if (min_port == 0) { 1226 min_port = 1; 1227 max_port = 65535; 1228 } 1229 range = max_port - min_port; 1230 1231 mutex_enter(&rule->ir_lock); 1232 /* If someone is already doing server add/del, sleeps and wait. */ 1233 while (rule->ir_flags & ILB_RULE_BUSY) { 1234 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1235 mutex_exit(&rule->ir_lock); 1236 return (EINTR); 1237 } 1238 } 1239 1240 /* 1241 * Set the rule to be busy to make sure that no new packet can 1242 * use this rule. 1243 */ 1244 rule->ir_flags |= ILB_RULE_BUSY; 1245 1246 /* Now wait for all other guys to finish their work. */ 1247 while (rule->ir_refcnt > 2) { 1248 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1249 mutex_exit(&rule->ir_lock); 1250 ret = EINTR; 1251 goto end; 1252 } 1253 } 1254 mutex_exit(&rule->ir_lock); 1255 1256 /* Sanity checks... */ 1257 if ((IN6_IS_ADDR_V4MAPPED(&info->addr) && 1258 rule->ir_ipver != IPPROTO_IP) || 1259 (!IN6_IS_ADDR_V4MAPPED(&info->addr) && 1260 rule->ir_ipver != IPPROTO_IPV6)) { 1261 ret = EINVAL; 1262 goto end; 1263 } 1264 1265 /* 1266 * Check for valid port range. 1267 * 1268 * For DSR, there can be no port shifting. Hence the server 1269 * specification must be the same as the rule's. 
1270 * 1271 * For half-NAT/NAT, the range must either be 0 (port collapsing) or 1272 * it must be equal to the same value as the rule port range. 1273 * 1274 */ 1275 if (rule->ir_topo == ILB_TOPO_IMPL_DSR) { 1276 if (rule->ir_max_port != max_port || 1277 rule->ir_min_port != min_port) { 1278 ret = EINVAL; 1279 goto end; 1280 } 1281 } else { 1282 if ((range != rule->ir_max_port - rule->ir_min_port) && 1283 range != 0) { 1284 ret = EINVAL; 1285 goto end; 1286 } 1287 } 1288 1289 /* Check for duplicate. */ 1290 for (server = rule->ir_servers; server != NULL; 1291 server = server->iser_next) { 1292 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) || 1293 strcasecmp(server->iser_name, info->name) == 0) { 1294 break; 1295 } 1296 } 1297 if (server != NULL) { 1298 ret = EEXIST; 1299 goto end; 1300 } 1301 1302 if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) { 1303 ret = ENOMEM; 1304 goto end; 1305 } 1306 1307 (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1); 1308 (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr, 1309 sizeof (server->iser_ip_addr)); 1310 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 1311 server->iser_ksp = ilb_server_kstat_init(stackid, rule, server); 1312 if (server->iser_ksp == NULL) { 1313 kmem_free(server, sizeof (ilb_server_t)); 1314 ret = EINVAL; 1315 goto end; 1316 } 1317 1318 server->iser_stackid = stackid; 1319 server->iser_addr_v6 = info->addr; 1320 server->iser_min_port = min_port; 1321 server->iser_max_port = max_port; 1322 if (min_port != max_port) 1323 server->iser_port_range = B_TRUE; 1324 else 1325 server->iser_port_range = B_FALSE; 1326 1327 /* 1328 * If the rule uses NAT, find/create the NAT source entry to use 1329 * for this server. 1330 */ 1331 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { 1332 in_port_t port; 1333 1334 /* 1335 * If the server uses a port range, our port allocation 1336 * scheme needs to treat it as a wildcard. 
Refer to the 1337 * comments in ilb_nat.c about the scheme. 1338 */ 1339 if (server->iser_port_range) 1340 port = 0; 1341 else 1342 port = server->iser_min_port; 1343 1344 if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src, 1345 &server->iser_addr_v6, port, &rule->ir_nat_src_start, 1346 num_nat_src_v6(&rule->ir_nat_src_start, 1347 &rule->ir_nat_src_end))) != 0) { 1348 kstat_delete_netstack(server->iser_ksp, stackid); 1349 kmem_free(server, sizeof (ilb_server_t)); 1350 goto end; 1351 } 1352 } 1353 1354 /* 1355 * The iser_lock is only used to protect iser_refcnt. All the other 1356 * fields in ilb_server_t should not change, except for iser_enabled. 1357 * The worst thing that can happen if iser_enabled is messed up is 1358 * that one or two packets may not be load balanced to a server 1359 * correctly. 1360 */ 1361 server->iser_refcnt = 1; 1362 server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE : 1363 B_FALSE; 1364 mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL); 1365 cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL); 1366 1367 /* Let the load balancing algorithm know about the addition. */ 1368 ASSERT(rule->ir_alg != NULL); 1369 if ((ret = rule->ir_alg->ilb_alg_server_add(server, 1370 rule->ir_alg->ilb_alg_data)) != 0) { 1371 kstat_delete_netstack(server->iser_ksp, stackid); 1372 kmem_free(server, sizeof (ilb_server_t)); 1373 goto end; 1374 } 1375 1376 /* 1377 * No need to hold ir_lock since no other thread should manipulate 1378 * the following fields until ILB_RULE_BUSY is cleared. 1379 */ 1380 if (rule->ir_servers == NULL) { 1381 server->iser_next = NULL; 1382 } else { 1383 server->iser_next = rule->ir_servers; 1384 } 1385 rule->ir_servers = server; 1386 ILB_R_KSTAT(rule, num_servers); 1387 1388 end: 1389 mutex_enter(&rule->ir_lock); 1390 rule->ir_flags &= ~ILB_RULE_BUSY; 1391 cv_signal(&rule->ir_cv); 1392 mutex_exit(&rule->ir_lock); 1393 return (ret); 1394 } 1395 1396 /* The routine executed by the delayed rule processing taskq. 
*/ 1397 static void 1398 ilb_server_del_tq(void *arg) 1399 { 1400 ilb_server_t *server = (ilb_server_t *)arg; 1401 1402 mutex_enter(&server->iser_lock); 1403 while (server->iser_refcnt > 1) 1404 cv_wait(&server->iser_cv, &server->iser_lock); 1405 kstat_delete_netstack(server->iser_ksp, server->iser_stackid); 1406 kmem_free(server, sizeof (ilb_server_t)); 1407 } 1408 1409 /* 1410 * Delete a back end server from a rule. If the address is IPv4, it is assumed 1411 * to be passed in as a mapped address. 1412 */ 1413 int 1414 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, 1415 ilb_rule_t *rule, in6_addr_t *addr) 1416 { 1417 ilb_server_t *server; 1418 ilb_server_t *prev_server; 1419 int ret = 0; 1420 1421 ASSERT((rule == NULL && rule_name != NULL) || 1422 (rule != NULL && rule_name == NULL)); 1423 if (rule == NULL) { 1424 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1425 &ret)) == NULL) { 1426 return (ret); 1427 } 1428 } 1429 1430 mutex_enter(&rule->ir_lock); 1431 /* If someone is already doing server add/del, sleeps and wait. */ 1432 while (rule->ir_flags & ILB_RULE_BUSY) { 1433 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1434 if (rule_name != NULL) { 1435 if (--rule->ir_refcnt <= 2) 1436 cv_signal(&rule->ir_cv); 1437 } 1438 mutex_exit(&rule->ir_lock); 1439 return (EINTR); 1440 } 1441 } 1442 /* 1443 * Set the rule to be busy to make sure that no new packet can 1444 * use this rule. 1445 */ 1446 rule->ir_flags |= ILB_RULE_BUSY; 1447 1448 /* Now wait for all other guys to finish their work. 
*/ 1449 while (rule->ir_refcnt > 2) { 1450 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1451 mutex_exit(&rule->ir_lock); 1452 ret = EINTR; 1453 goto end; 1454 } 1455 } 1456 mutex_exit(&rule->ir_lock); 1457 1458 prev_server = NULL; 1459 for (server = rule->ir_servers; server != NULL; 1460 prev_server = server, server = server->iser_next) { 1461 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr)) 1462 break; 1463 } 1464 if (server == NULL) { 1465 ret = ENOENT; 1466 goto end; 1467 } 1468 1469 /* 1470 * Let the load balancing algorithm know about the removal. 1471 * The algorithm may disallow the removal... 1472 */ 1473 if ((ret = rule->ir_alg->ilb_alg_server_del(server, 1474 rule->ir_alg->ilb_alg_data)) != 0) { 1475 goto end; 1476 } 1477 1478 if (prev_server == NULL) 1479 rule->ir_servers = server->iser_next; 1480 else 1481 prev_server->iser_next = server->iser_next; 1482 1483 ILB_R_KSTAT_UPDATE(rule, num_servers, -1); 1484 1485 /* 1486 * Mark the server as disabled so that if there is any sticky cache 1487 * using this server around, it won't be used. 1488 */ 1489 server->iser_enabled = B_FALSE; 1490 1491 mutex_enter(&server->iser_lock); 1492 1493 /* 1494 * De-allocate the NAT source array. The indiviual ilb_nat_src_entry_t 1495 * may not go away if there is still a conn using it. The NAT source 1496 * timer will do the garbage collection. 1497 */ 1498 ilb_destroy_nat_src(&server->iser_nat_src); 1499 1500 /* If there is a hard limit on when a server should die, set it. 
*/ 1501 if (rule->ir_conn_drain_timeout != 0) { 1502 (void) atomic_swap_64((uint64_t *)&server->iser_die_time, 1503 ddi_get_lbolt64() + 1504 SEC_TO_TICK(rule->ir_conn_drain_timeout)); 1505 } 1506 1507 if (server->iser_refcnt > 1) { 1508 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq, 1509 server, TQ_SLEEP); 1510 mutex_exit(&server->iser_lock); 1511 } else { 1512 kstat_delete_netstack(server->iser_ksp, server->iser_stackid); 1513 kmem_free(server, sizeof (ilb_server_t)); 1514 } 1515 1516 end: 1517 mutex_enter(&rule->ir_lock); 1518 rule->ir_flags &= ~ILB_RULE_BUSY; 1519 if (rule_name != NULL) 1520 rule->ir_refcnt--; 1521 cv_signal(&rule->ir_cv); 1522 mutex_exit(&rule->ir_lock); 1523 return (ret); 1524 } 1525 1526 /* 1527 * First check if the destination of the ICMP message matches a VIP of 1528 * a rule. If it does not, just return ILB_PASSED. 1529 * 1530 * If the destination matches a VIP: 1531 * 1532 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end 1533 * server. 1534 * 1535 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload 1536 * and see which back end server we should send this message to. And we 1537 * need to do NAT on both the payload message and the outside IP packet. 1538 * 1539 * For other ICMP messages, drop them. 
 */
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
	ipaddr_t vip;
	ilb_rule_t *rule;
	in6_addr_t addr6;

	/* Match on the VIP; the matched rule is returned held. */
	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
		return (ILB_PASSED);

	/* Make sure the full ICMP header is in this mblk. */
	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmph->icmph_type) {
	case ICMP_ECHO_REQUEST:
		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place: swap the
		 * addresses, reset the TTL and recompute the checksum.
		 */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		icmph->icmph_checksum = 0;
		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
		ipha->ipha_ttl =
		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
		*lb_dst = ipha->ipha_src;
		vip = ipha->ipha_dst;
		ipha->ipha_dst = ipha->ipha_src;
		ipha->ipha_src = vip;
		return (ILB_BALANCED);
	case ICMP_DEST_UNREACHABLE: {
		int ret;

		/* Only "fragmentation needed" is forwarded inward. */
		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
			ILB_R_KSTAT(rule, icmp_dropped);
			ILB_RULE_REFRELE(rule);
			return (ILB_DROPPED);
		}
		/*
		 * Look at the embedded payload to find the conn (and hence
		 * the back end server) this message belongs to.
		 */
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
		    &addr6)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * IPv6 counterpart of ilb_icmp_v4(): handle ICMPv6 echo requests and
 * "packet too big" messages destined to a VIP; drop other ICMPv6 types.
 */
/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
	ilb_rule_t *rule;

	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
		return (ILB_PASSED);

	/* Make sure the full ICMPv6 header is in this mblk. */
	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_ECHO_REQUEST: {
		int hdr_len;

		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place.  The checksum is
		 * seeded with the payload length plus the pseudo header sum
		 * (ICMPv6 checksums cover a pseudo header).
		 */
		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp6->icmp6_cksum = ip6h->ip6_plen;
		hdr_len = (char *)icmp6 - (char *)ip6h;
		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
		ip6h->ip6_hops =
		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
		*lb_dst = ip6h->ip6_src;
		ip6h->ip6_src = ip6h->ip6_dst;
		ip6h->ip6_dst = *lb_dst;
		return (ILB_BALANCED);
	}
	case ICMP6_PACKET_TOO_BIG: {
		int ret;

		/* Route the PMTU message to the right back end server. */
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
		    lb_dst)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * Common routine to check an incoming packet and decide what to do with it.
 * called by ilb_check_v4|v6().
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t sport, dport;
	tcpha_t *tcph;
	udpha_t *udph;
	ilb_rule_t *rule;
	ilb_server_t *server;
	boolean_t balanced;
	struct ilb_sticky_s *s = NULL;
	int ret;
	uint32_t ip_sum, tp_sum;
	ilb_nat_info_t info;
	uint16_t nat_src_idx;
	boolean_t busy;

	/*
	 * We don't really need to switch here since both protocols's
	 * ports are at the same offset.  Just prepare for future protocol
	 * specific processing.
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		/* Only TCP and UDP are load balanced here. */
		return (ILB_PASSED);
	}

	/* Fast path, there is an existing conn. */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * If there is no existing connection for the incoming packet, check
	 * to see if the packet matches a rule.  If not, just let IP decide
	 * what to do with it.
	 *
	 * Note: a reply from back end server should not match a rule.  A
	 * reply should match one existing conn.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		/* If the rule is busy, just drop the packet. */
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/*
	 * The packet matches a rule, use the rule load balance algorithm
	 * to find a server.
	 */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);
	/*
	 * This can only happen if there is no server in a rule or all
	 * the servers are currently disabled.
	 */
	if (!balanced)
		goto no_server;

	/*
	 * If the rule is sticky enabled, we need to check the sticky table.
	 * If there is a sticky entry for the client, use the previous server
	 * instead of the one found above (note that both can be the same).
	 * If there is no entry for that client, add an entry to the sticky
	 * table.  Both the find and add are done in ilb_sticky_find_add()
	 * to avoid checking for duplicate when adding an entry.
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			goto no_server;
		}
	}

	/*
	 * We are holding a reference on the rule, so the server
	 * cannot go away.
	 */
	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t *src_ent;
		uint16_t *src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			/* Out of NAT ports: undo the sticky hold and drop. */
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		/* Port shifting only when both rule and server have ranges. */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * sticky info and de-allocate the NAT source port allocated
		 * above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		/* Rewrite both source and destination (full NAT). */
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		/* Rewrite the destination only (half NAT). */
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (life time of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *) rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* This can only happen if there is no server available. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}

/*
 * IPv4 entry point: handle ICMP directly, otherwise map the addresses to
 * v4-mapped IPv6 and use the common ilb_check().
 */
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
    uint8_t *tph, ipaddr_t *lb_dst)
{
	in6_addr_t v6_src, v6_dst, v6_lb_dst;
	int ret;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMP) {
		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
		    lb_dst));
	}

	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
	if (ret == ILB_BALANCED)
		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
	return (ret);
}

/*
 * IPv6 entry point: handle ICMPv6 directly, otherwise use the common
 * ilb_check().
 */
int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
    uint8_t *tph, in6_addr_t *lb_dst)
{
	uint32_t pkt_len;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMPV6) {
		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
		    lb_dst));
	}

	/* ip6_plen excludes the fixed IPv6 header; add it back. */
	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}

/* Count the rules of the given zone. */
void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	*num_rules = 0;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid == zoneid)
			*num_rules += 1;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

/* Report the number of servers of the named rule via its kstat. */
int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) ==
NULL) 1944 return (err); 1945 *num_servers = rule->ir_kstat.num_servers.value.ui64; 1946 ILB_RULE_REFRELE(rule); 1947 return (0); 1948 } 1949 1950 int 1951 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1952 ilb_server_info_t *servers, uint32_t *num_servers) 1953 { 1954 ilb_rule_t *rule; 1955 ilb_server_t *server; 1956 size_t cnt; 1957 int err; 1958 1959 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL) 1960 return (err); 1961 for (server = rule->ir_servers, cnt = *num_servers; 1962 server != NULL && cnt > 0; 1963 server = server->iser_next, cnt--, servers++) { 1964 (void) memcpy(servers->name, server->iser_name, 1965 ILB_SERVER_NAMESZ); 1966 servers->addr = server->iser_addr_v6; 1967 servers->min_port = htons(server->iser_min_port); 1968 servers->max_port = htons(server->iser_max_port); 1969 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0; 1970 servers->err = 0; 1971 } 1972 ILB_RULE_REFRELE(rule); 1973 *num_servers -= cnt; 1974 1975 return (0); 1976 } 1977 1978 void 1979 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names, 1980 char *buf) 1981 { 1982 ilb_rule_t *tmp_rule; 1983 int cnt; 1984 1985 if (*num_names == 0) 1986 return; 1987 1988 mutex_enter(&ilbs->ilbs_g_lock); 1989 for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 1990 tmp_rule = tmp_rule->ir_next) { 1991 if (tmp_rule->ir_zoneid != zoneid) 1992 continue; 1993 1994 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ); 1995 buf += ILB_RULE_NAMESZ; 1996 if (++cnt == *num_names) 1997 break; 1998 } 1999 mutex_exit(&ilbs->ilbs_g_lock); 2000 *num_names = cnt; 2001 } 2002 2003 int 2004 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd) 2005 { 2006 ilb_rule_t *rule; 2007 int err; 2008 2009 if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) { 2010 return (err); 2011 } 2012 2013 /* 2014 * Except the enabled flags, none of the following will change 2015 * in the life time of a rule. 
So we don't hold the mutex when 2016 * reading them. The worst is to report a wrong enabled flags. 2017 */ 2018 cmd->ip_ver = rule->ir_ipver; 2019 cmd->proto = rule->ir_proto; 2020 cmd->min_port = htons(rule->ir_min_port); 2021 cmd->max_port = htons(rule->ir_max_port); 2022 2023 cmd->vip = rule->ir_target_v6; 2024 cmd->algo = rule->ir_alg_type; 2025 cmd->topo = rule->ir_topo; 2026 2027 cmd->nat_src_start = rule->ir_nat_src_start; 2028 cmd->nat_src_end = rule->ir_nat_src_end; 2029 2030 cmd->conn_drain_timeout = rule->ir_conn_drain_timeout; 2031 cmd->nat_expiry = rule->ir_nat_expiry; 2032 cmd->sticky_expiry = rule->ir_sticky_expiry; 2033 2034 cmd->flags = 0; 2035 if (rule->ir_flags & ILB_RULE_ENABLED) 2036 cmd->flags |= ILB_RULE_ENABLED; 2037 if (rule->ir_flags & ILB_RULE_STICKY) { 2038 cmd->flags |= ILB_RULE_STICKY; 2039 cmd->sticky_mask = rule->ir_sticky_mask; 2040 } 2041 2042 ILB_RULE_REFRELE(rule); 2043 return (0); 2044 } 2045 2046 static void * 2047 ilb_stack_init(netstackid_t stackid, netstack_t *ns) 2048 { 2049 ilb_stack_t *ilbs; 2050 char tq_name[TASKQ_NAMELEN]; 2051 2052 ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP); 2053 ilbs->ilbs_netstack = ns; 2054 2055 ilbs->ilbs_rule_head = NULL; 2056 ilbs->ilbs_g_hash = NULL; 2057 mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL); 2058 2059 ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP); 2060 if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) { 2061 kmem_free(ilbs, sizeof (ilb_stack_t)); 2062 return (NULL); 2063 } 2064 2065 /* 2066 * ilbs_conn/sticky_hash related info is initialized in 2067 * ilb_conn/sticky_hash_init(). 
 */
	ilbs->ilbs_conn_taskq = NULL;
	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
	ilbs->ilbs_c2s_conn_hash = NULL;
	ilbs->ilbs_s2c_conn_hash = NULL;
	ilbs->ilbs_conn_timer_list = NULL;

	ilbs->ilbs_sticky_hash = NULL;
	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
	ilbs->ilbs_sticky_timer_list = NULL;
	ilbs->ilbs_sticky_taskq = NULL;

	/* The allocation is done later when there is a rule using NAT mode. */
	ilbs->ilbs_nat_src = NULL;
	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
	ilbs->ilbs_nat_src_tid = 0;

	/* For listing the conn hash table */
	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	ilbs->ilbs_conn_list_cur = 0;
	ilbs->ilbs_conn_list_connp = NULL;

	/* For listing the sticky hash table */
	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	ilbs->ilbs_sticky_list_cur = 0;
	ilbs->ilbs_sticky_list_curp = NULL;

	/* Taskq for delayed rule/server removal (see block comment above). */
	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
	    (void *)ns);
	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	return (ilbs);
}

/*
 * Per-netstack shutdown: tear down the conn/sticky hashes, remove and
 * destroy all remaining rules (all zones), then the NAT source table.
 */
/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_rule_t *tmp_rule;

	ilb_sticky_hash_fini(ilbs);
	ilb_conn_hash_fini(ilbs);
	mutex_enter(&ilbs->ilbs_g_lock);
	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
		ilb_rule_hash_del(tmp_rule);
		ilb_rule_g_del(ilbs, tmp_rule);
		/*
		 * ilb_rule_del_common() may block waiting for references
		 * to drain, so drop the global lock around it.
		 */
		mutex_exit(&ilbs->ilbs_g_lock);
		ilb_rule_del_common(ilbs, tmp_rule);
		mutex_enter(&ilbs->ilbs_g_lock);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_nat_src != NULL)
		ilb_nat_src_fini(ilbs);
}

/* Per-netstack destructor: free everything left after shutdown. */
static void
ilb_stack_fini(netstackid_t stackid, void * arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;

	ilb_rule_hash_fini(ilbs);
	taskq_destroy(ilbs->ilbs_rule_taskq);
	ilb_kstat_g_fini(stackid, ilbs);
	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
	kmem_free(ilbs, sizeof (ilb_stack_t));
}

/* Module-global init: register the per-netstack callbacks. */
void
ilb_ddi_g_init(void)
{
	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
	    ilb_stack_fini);
}

/* Module-global teardown: unregister and destroy the kmem caches. */
void
ilb_ddi_g_destroy(void)
{
	netstack_unregister(NS_ILB);
	ilb_conn_cache_fini();
	ilb_sticky_cache_fini();
}