1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/kmem.h> 28 #include <sys/ksynch.h> 29 #include <sys/systm.h> 30 #include <sys/socket.h> 31 #include <sys/disp.h> 32 #include <sys/taskq.h> 33 #include <sys/cmn_err.h> 34 #include <sys/strsun.h> 35 #include <sys/sdt.h> 36 #include <sys/atomic.h> 37 #include <netinet/in.h> 38 #include <inet/ip.h> 39 #include <inet/ip6.h> 40 #include <inet/tcp.h> 41 #include <inet/udp_impl.h> 42 #include <inet/kstatcom.h> 43 44 #include <inet/ilb_ip.h> 45 #include "ilb_alg.h" 46 #include "ilb_nat.h" 47 #include "ilb_conn.h" 48 49 /* ILB kmem cache flag */ 50 int ilb_kmem_flags = 0; 51 52 /* 53 * The default size for the different hash tables. Global for all stacks. 54 * But each stack has its own table, just that their sizes are the same. 55 */ 56 static size_t ilb_rule_hash_size = 2048; 57 58 static size_t ilb_conn_hash_size = 262144; 59 60 static size_t ilb_sticky_hash_size = 262144; 61 62 /* This should be a prime number. 
*/ 63 static size_t ilb_nat_src_hash_size = 97; 64 65 /* Default NAT cache entry expiry time. */ 66 static uint32_t ilb_conn_tcp_expiry = 120; 67 static uint32_t ilb_conn_udp_expiry = 60; 68 69 /* Default sticky entry expiry time. */ 70 static uint32_t ilb_sticky_expiry = 60; 71 72 /* addr is assumed to be a uint8_t * to an ipaddr_t. */ 73 #define ILB_RULE_HASH(addr, hash_size) \ 74 ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \ 75 *(addr)) & ((hash_size) - 1)) 76 77 /* 78 * Note on ILB delayed processing 79 * 80 * To avoid in line removal on some of the data structures, such as rules, 81 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq. 82 * There are three types of ILB taskq: 83 * 84 * 1. rule handling: created at stack initialization time, ilb_stack_init() 85 * 2. conn hash handling: created at conn hash initialization time, 86 * ilb_conn_hash_init() 87 * 3. sticky hash handling: created at sticky hash initialization time, 88 * ilb_sticky_hash_init() 89 * 90 * The rule taskq is for processing rule and server removal. When a user 91 * land rule/server removal request comes in, a taskq is dispatched after 92 * removing the rule/server from all related hashes. This taskq will wait 93 * until all references to the rule/server are gone before removing it. 94 * So the user land thread requesting the removal does not need to wait 95 * for the removal completion. 96 * 97 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and 98 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers 99 * and ilb_sticky_timer_size timers running for ilb_conn_hash and 100 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one 101 * portion (same size) of the hash table. When a timer fires, it dispatches 102 * a conn hash taskq to clean up its portion of the table. This avoids in 103 * line processing of the removal. 
104 * 105 * There is another delayed processing, the clean up of NAT source address 106 * table. We just use the timer to directly handle it instead of using 107 * a taskq. The reason is that the table is small so it is OK to use the 108 * timer. 109 */ 110 111 /* ILB rule taskq constants. */ 112 #define ILB_RULE_TASKQ_NUM_THR 20 113 114 /* Argument passed to ILB rule taskq routines. */ 115 typedef struct { 116 ilb_stack_t *ilbs; 117 ilb_rule_t *rule; 118 } ilb_rule_tq_t; 119 120 /* kstat handling routines. */ 121 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *); 122 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *); 123 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *); 124 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *, 125 ilb_server_t *); 126 127 /* Rule hash handling routines. */ 128 static void ilb_rule_hash_init(ilb_stack_t *); 129 static void ilb_rule_hash_fini(ilb_stack_t *); 130 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *); 131 static void ilb_rule_hash_del(ilb_rule_t *); 132 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *, 133 in_port_t, zoneid_t, uint32_t, boolean_t *); 134 135 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *); 136 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *); 137 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *); 138 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *, 139 int *); 140 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int, 141 int, in_port_t, in_port_t, const in6_addr_t *); 142 143 /* Back end server handling routines. */ 144 static void ilb_server_free(ilb_server_t *); 145 146 /* Network stack handling routines. */ 147 static void *ilb_stack_init(netstackid_t, netstack_t *); 148 static void ilb_stack_shutdown(netstackid_t, void *); 149 static void ilb_stack_fini(netstackid_t, void *); 150 151 /* Sticky connection handling routines. 
*/ 152 static void ilb_rule_sticky_init(ilb_rule_t *); 153 static void ilb_rule_sticky_fini(ilb_rule_t *); 154 155 /* Handy macro to check for unspecified address. */ 156 #define IS_ADDR_UNSPEC(addr) \ 157 (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \ 158 IN6_IS_ADDR_UNSPECIFIED(addr)) 159 160 /* 161 * Global kstat instance counter. When a rule is created, its kstat instance 162 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is 163 * incremented. 164 */ 165 static uint_t ilb_kstat_instance = 0; 166 167 /* 168 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME. 169 * A rule's kstat has ILB_RULE_KS_CNAME class name. 170 */ 171 #define ILB_G_KS_NAME "global" 172 #define ILB_G_KS_CNAME "kstat" 173 #define ILB_RULE_KS_CNAME "rulestat" 174 175 static kstat_t * 176 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs) 177 { 178 kstat_t *ksp; 179 ilb_g_kstat_t template = { 180 { "num_rules", KSTAT_DATA_UINT64, 0 }, 181 { "ip_frag_in", KSTAT_DATA_UINT64, 0 }, 182 { "ip_frag_dropped", KSTAT_DATA_UINT64, 0 } 183 }; 184 185 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME, 186 ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t), 187 KSTAT_FLAG_VIRTUAL, stackid); 188 if (ksp == NULL) 189 return (NULL); 190 bcopy(&template, ilbs->ilbs_kstat, sizeof (template)); 191 ksp->ks_data = ilbs->ilbs_kstat; 192 ksp->ks_private = (void *)(uintptr_t)stackid; 193 194 kstat_install(ksp); 195 return (ksp); 196 } 197 198 static void 199 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs) 200 { 201 if (ilbs->ilbs_ksp != NULL) { 202 ASSERT(stackid == (netstackid_t)(uintptr_t) 203 ilbs->ilbs_ksp->ks_private); 204 kstat_delete_netstack(ilbs->ilbs_ksp, stackid); 205 ilbs->ilbs_ksp = NULL; 206 } 207 } 208 209 static kstat_t * 210 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule) 211 { 212 kstat_t *ksp; 213 ilb_rule_kstat_t template = { 214 { "num_servers", KSTAT_DATA_UINT64, 0 }, 215 { 
"bytes_not_processed", KSTAT_DATA_UINT64, 0 }, 216 { "pkt_not_processed", KSTAT_DATA_UINT64, 0 }, 217 { "bytes_dropped", KSTAT_DATA_UINT64, 0 }, 218 { "pkt_dropped", KSTAT_DATA_UINT64, 0 }, 219 { "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 }, 220 { "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 }, 221 { "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 }, 222 { "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 }, 223 { "icmp_echo_processed", KSTAT_DATA_UINT64, 0 }, 224 { "icmp_dropped", KSTAT_DATA_UINT64, 0 }, 225 { "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 }, 226 { "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 } 227 }; 228 229 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, 230 rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED, 231 NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); 232 if (ksp == NULL) 233 return (NULL); 234 235 bcopy(&template, &rule->ir_kstat, sizeof (template)); 236 ksp->ks_data = &rule->ir_kstat; 237 ksp->ks_private = (void *)(uintptr_t)stackid; 238 239 kstat_install(ksp); 240 return (ksp); 241 } 242 243 static kstat_t * 244 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule, 245 ilb_server_t *server) 246 { 247 kstat_t *ksp; 248 ilb_server_kstat_t template = { 249 { "bytes_processed", KSTAT_DATA_UINT64, 0 }, 250 { "pkt_processed", KSTAT_DATA_UINT64, 0 }, 251 { "ip_address", KSTAT_DATA_STRING, 0 } 252 }; 253 char cname_buf[KSTAT_STRLEN]; 254 255 /* 7 is "-sstat" */ 256 ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN); 257 (void) sprintf(cname_buf, "%s-sstat", rule->ir_name); 258 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, 259 server->iser_name, cname_buf, KSTAT_TYPE_NAMED, 260 NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); 261 if (ksp == NULL) 262 return (NULL); 263 264 bcopy(&template, &server->iser_kstat, sizeof (template)); 265 ksp->ks_data = &server->iser_kstat; 266 ksp->ks_private = (void *)(uintptr_t)stackid; 267 268 
kstat_named_setstr(&server->iser_kstat.ip_address, 269 server->iser_ip_addr); 270 /* We never change the IP address */ 271 ksp->ks_data_size += strlen(server->iser_ip_addr) + 1; 272 273 kstat_install(ksp); 274 return (ksp); 275 } 276 277 /* Initialize the rule hash table. */ 278 static void 279 ilb_rule_hash_init(ilb_stack_t *ilbs) 280 { 281 int i; 282 283 /* 284 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to 285 * the next power of 2. 286 */ 287 if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) { 288 for (i = 0; i < 31; i++) { 289 if (ilbs->ilbs_rule_hash_size < (1 << i)) 290 break; 291 } 292 ilbs->ilbs_rule_hash_size = 1 << i; 293 } 294 ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) * 295 ilbs->ilbs_rule_hash_size, KM_SLEEP); 296 for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) { 297 mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL, 298 MUTEX_DEFAULT, NULL); 299 } 300 } 301 302 /* Clean up the rule hash table. */ 303 static void 304 ilb_rule_hash_fini(ilb_stack_t *ilbs) 305 { 306 if (ilbs->ilbs_g_hash == NULL) 307 return; 308 kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) * 309 ilbs->ilbs_rule_hash_size); 310 } 311 312 /* Add a rule to the rule hash table. */ 313 static void 314 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr) 315 { 316 int i; 317 318 i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3], 319 ilbs->ilbs_rule_hash_size); 320 DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i); 321 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 322 rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule; 323 if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL) 324 ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule; 325 rule->ir_hash_prev = NULL; 326 ilbs->ilbs_g_hash[i].ilb_hash_rule = rule; 327 328 rule->ir_hash = &ilbs->ilbs_g_hash[i]; 329 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 330 } 331 332 /* 333 * Remove a rule from the rule hash table. 
Note that the rule is not freed 334 * in this routine. 335 */ 336 static void 337 ilb_rule_hash_del(ilb_rule_t *rule) 338 { 339 mutex_enter(&rule->ir_hash->ilb_hash_lock); 340 if (rule->ir_hash->ilb_hash_rule == rule) { 341 rule->ir_hash->ilb_hash_rule = rule->ir_hash_next; 342 if (rule->ir_hash_next != NULL) 343 rule->ir_hash_next->ir_hash_prev = NULL; 344 } else { 345 if (rule->ir_hash_prev != NULL) 346 rule->ir_hash_prev->ir_hash_next = 347 rule->ir_hash_next; 348 if (rule->ir_hash_next != NULL) { 349 rule->ir_hash_next->ir_hash_prev = 350 rule->ir_hash_prev; 351 } 352 } 353 mutex_exit(&rule->ir_hash->ilb_hash_lock); 354 355 rule->ir_hash_next = NULL; 356 rule->ir_hash_prev = NULL; 357 rule->ir_hash = NULL; 358 } 359 360 /* 361 * Given the info of a packet, look for a match in the rule hash table. 362 */ 363 static ilb_rule_t * 364 ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr, 365 in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy) 366 { 367 int i; 368 ilb_rule_t *rule; 369 ipaddr_t v4_addr; 370 371 *busy = B_FALSE; 372 IN6_V4MAPPED_TO_IPADDR(addr, v4_addr); 373 i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size); 374 port = ntohs(port); 375 376 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 377 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; 378 rule = rule->ir_hash_next) { 379 if (!rule->ir_port_range) { 380 if (rule->ir_min_port != port) 381 continue; 382 } else { 383 if (port < rule->ir_min_port || 384 port > rule->ir_max_port) { 385 continue; 386 } 387 } 388 if (rule->ir_ipver != l3 || rule->ir_proto != l4 || 389 rule->ir_zoneid != zoneid) { 390 continue; 391 } 392 393 if (l3 == IPPROTO_IP) { 394 if (rule->ir_target_v4 != INADDR_ANY && 395 rule->ir_target_v4 != v4_addr) { 396 continue; 397 } 398 } else { 399 if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) && 400 !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) { 401 continue; 402 } 403 } 404 405 /* 406 * Just update the stats if the rule is 
disabled. 407 */ 408 mutex_enter(&rule->ir_lock); 409 if (!(rule->ir_flags & ILB_RULE_ENABLED)) { 410 ILB_R_KSTAT(rule, pkt_not_processed); 411 ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len); 412 mutex_exit(&rule->ir_lock); 413 rule = NULL; 414 break; 415 } else if (rule->ir_flags & ILB_RULE_BUSY) { 416 /* 417 * If we are busy... 418 * 419 * XXX we should have a queue to postpone the 420 * packet processing. But this requires a 421 * mechanism in IP to re-start the packet 422 * processing. So for now, just drop the packet. 423 */ 424 ILB_R_KSTAT(rule, pkt_dropped); 425 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len); 426 mutex_exit(&rule->ir_lock); 427 *busy = B_TRUE; 428 rule = NULL; 429 break; 430 } else { 431 rule->ir_refcnt++; 432 ASSERT(rule->ir_refcnt != 1); 433 mutex_exit(&rule->ir_lock); 434 break; 435 } 436 } 437 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 438 return (rule); 439 } 440 441 /* 442 * Add a rule to the global rule list. This list is for finding all rules 443 * in an IP stack. The caller is assumed to hold the ilbs_g_lock. 444 */ 445 static void 446 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule) 447 { 448 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 449 rule->ir_next = ilbs->ilbs_rule_head; 450 ilbs->ilbs_rule_head = rule; 451 ILB_KSTAT_UPDATE(ilbs, num_rules, 1); 452 } 453 454 /* The call is assumed to hold the ilbs_g_lock. 
*/ 455 static void 456 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule) 457 { 458 ilb_rule_t *tmp_rule; 459 ilb_rule_t *prev_rule; 460 461 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 462 prev_rule = NULL; 463 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 464 prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) { 465 if (tmp_rule == rule) 466 break; 467 } 468 if (tmp_rule == NULL) { 469 mutex_exit(&ilbs->ilbs_g_lock); 470 return; 471 } 472 if (prev_rule == NULL) 473 ilbs->ilbs_rule_head = tmp_rule->ir_next; 474 else 475 prev_rule->ir_next = tmp_rule->ir_next; 476 ILB_KSTAT_UPDATE(ilbs, num_rules, -1); 477 } 478 479 /* 480 * Helper routine to calculate how many source addresses are in a given 481 * range. 482 */ 483 static int64_t 484 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2) 485 { 486 int64_t ret; 487 uint32_t addr1, addr2; 488 489 /* 490 * Here we assume that the max number of NAT source cannot be 491 * large such that the most significant 2 s6_addr32 must be 492 * equal. 493 */ 494 addr1 = ntohl(a1->s6_addr32[3]); 495 addr2 = ntohl(a2->s6_addr32[3]); 496 if (a1->s6_addr32[0] != a2->s6_addr32[0] || 497 a1->s6_addr32[1] != a2->s6_addr32[1] || 498 a1->s6_addr32[2] > a2->s6_addr32[2] || 499 (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) { 500 return (-1); 501 } 502 if (a1->s6_addr32[2] == a2->s6_addr32[2]) { 503 return (addr2 - addr1 + 1); 504 } else { 505 ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2])); 506 ret <<= 32; 507 ret = ret + addr1 - addr2; 508 return (ret + 1); 509 } 510 } 511 512 /* 513 * Add an ILB rule. 514 */ 515 int 516 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd) 517 { 518 ilb_rule_t *rule; 519 netstackid_t stackid; 520 int ret; 521 in_port_t min_port, max_port; 522 int64_t num_src; 523 524 /* Sanity checks. */ 525 if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6) 526 return (EINVAL); 527 528 /* Need to support SCTP... 
*/ 529 if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP) 530 return (EINVAL); 531 532 /* For full NAT, the NAT source must be supplied. */ 533 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 534 if (IS_ADDR_UNSPEC(&cmd->nat_src_start) || 535 IS_ADDR_UNSPEC(&cmd->nat_src_end)) { 536 return (EINVAL); 537 } 538 } 539 540 /* Check invalid mask */ 541 if ((cmd->flags & ILB_RULE_STICKY) && 542 IS_ADDR_UNSPEC(&cmd->sticky_mask)) { 543 return (EINVAL); 544 } 545 546 /* Port is passed in network byte order. */ 547 min_port = ntohs(cmd->min_port); 548 max_port = ntohs(cmd->max_port); 549 if (min_port > max_port) 550 return (EINVAL); 551 552 /* min_port == 0 means "all ports". Make it so */ 553 if (min_port == 0) { 554 min_port = 1; 555 max_port = 65535; 556 } 557 558 /* Funny address checking. */ 559 if (cmd->ip_ver == IPPROTO_IP) { 560 in_addr_t v4_addr1, v4_addr2; 561 562 v4_addr1 = cmd->vip.s6_addr32[3]; 563 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 564 CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST || 565 v4_addr1 == INADDR_ANY || 566 !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 567 return (EINVAL); 568 } 569 570 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 571 v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]); 572 v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]); 573 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || 574 (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET || 575 v4_addr1 == INADDR_BROADCAST || 576 v4_addr2 == INADDR_BROADCAST || 577 v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY || 578 CLASSD(v4_addr1) || CLASSD(v4_addr2) || 579 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 580 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 581 return (EINVAL); 582 } 583 584 num_src = v4_addr2 - v4_addr1 + 1; 585 if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC) 586 return (EINVAL); 587 } 588 } else { 589 if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) || 590 IN6_IS_ADDR_MULTICAST(&cmd->vip) || 591 IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) || 592 IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { 593 return (EINVAL); 594 
} 595 596 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 597 if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) || 598 IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) || 599 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) || 600 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) || 601 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) || 602 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) || 603 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || 604 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { 605 return (EINVAL); 606 } 607 608 if ((num_src = num_nat_src_v6(&cmd->nat_src_start, 609 &cmd->nat_src_end)) < 0 || 610 num_src > ILB_MAX_NAT_SRC) { 611 return (EINVAL); 612 } 613 } 614 } 615 616 mutex_enter(&ilbs->ilbs_g_lock); 617 if (ilbs->ilbs_g_hash == NULL) 618 ilb_rule_hash_init(ilbs); 619 if (ilbs->ilbs_c2s_conn_hash == NULL) { 620 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 621 ilb_conn_hash_init(ilbs); 622 ilb_nat_src_init(ilbs); 623 } 624 625 /* Make sure that the new rule does not duplicate an existing one. */ 626 if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto, 627 min_port, max_port, &cmd->vip)) { 628 mutex_exit(&ilbs->ilbs_g_lock); 629 return (EEXIST); 630 } 631 632 rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP); 633 if (rule == NULL) { 634 mutex_exit(&ilbs->ilbs_g_lock); 635 return (ENOMEM); 636 } 637 638 /* ir_name is all 0 to begin with */ 639 (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1); 640 641 rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1); 642 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 643 if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) { 644 ret = ENOMEM; 645 goto error; 646 } 647 648 if (cmd->topo == ILB_TOPO_IMPL_NAT) { 649 rule->ir_nat_src_start = cmd->nat_src_start; 650 rule->ir_nat_src_end = cmd->nat_src_end; 651 } 652 653 rule->ir_ipver = cmd->ip_ver; 654 rule->ir_proto = cmd->proto; 655 rule->ir_topo = cmd->topo; 656 657 rule->ir_min_port = min_port; 658 rule->ir_max_port = max_port; 659 if 
(rule->ir_min_port != rule->ir_max_port) 660 rule->ir_port_range = B_TRUE; 661 else 662 rule->ir_port_range = B_FALSE; 663 664 rule->ir_zoneid = zoneid; 665 666 rule->ir_target_v6 = cmd->vip; 667 rule->ir_servers = NULL; 668 669 /* 670 * The default connection drain timeout is indefinite (value 0), 671 * meaning we will wait for all connections to finish. So we 672 * can assign cmd->conn_drain_timeout to it directly. 673 */ 674 rule->ir_conn_drain_timeout = cmd->conn_drain_timeout; 675 if (cmd->nat_expiry != 0) { 676 rule->ir_nat_expiry = cmd->nat_expiry; 677 } else { 678 switch (rule->ir_proto) { 679 case IPPROTO_TCP: 680 rule->ir_nat_expiry = ilb_conn_tcp_expiry; 681 break; 682 case IPPROTO_UDP: 683 rule->ir_nat_expiry = ilb_conn_udp_expiry; 684 break; 685 default: 686 cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p", 687 (void *)rule); 688 break; 689 } 690 } 691 if (cmd->sticky_expiry != 0) 692 rule->ir_sticky_expiry = cmd->sticky_expiry; 693 else 694 rule->ir_sticky_expiry = ilb_sticky_expiry; 695 696 if (cmd->flags & ILB_RULE_STICKY) { 697 rule->ir_flags |= ILB_RULE_STICKY; 698 rule->ir_sticky_mask = cmd->sticky_mask; 699 if (ilbs->ilbs_sticky_hash == NULL) 700 ilb_sticky_hash_init(ilbs); 701 } 702 if (cmd->flags & ILB_RULE_ENABLED) 703 rule->ir_flags |= ILB_RULE_ENABLED; 704 705 mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL); 706 cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL); 707 708 rule->ir_refcnt = 1; 709 710 switch (cmd->algo) { 711 case ILB_ALG_IMPL_ROUNDROBIN: 712 if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) { 713 ret = ENOMEM; 714 goto error; 715 } 716 rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN; 717 break; 718 case ILB_ALG_IMPL_HASH_IP: 719 case ILB_ALG_IMPL_HASH_IP_SPORT: 720 case ILB_ALG_IMPL_HASH_IP_VIP: 721 if ((rule->ir_alg = ilb_alg_hash_init(rule, 722 &cmd->algo)) == NULL) { 723 ret = ENOMEM; 724 goto error; 725 } 726 rule->ir_alg_type = cmd->algo; 727 break; 728 default: 729 ret = EINVAL; 730 goto error; 731 } 732 
733 /* Add it to the global list and hash array at the end. */ 734 ilb_rule_g_add(ilbs, rule); 735 ilb_rule_hash_add(ilbs, rule, &cmd->vip); 736 737 mutex_exit(&ilbs->ilbs_g_lock); 738 739 return (0); 740 741 error: 742 mutex_exit(&ilbs->ilbs_g_lock); 743 if (rule->ir_ksp != NULL) { 744 /* stackid must be initialized if ir_ksp != NULL */ 745 kstat_delete_netstack(rule->ir_ksp, stackid); 746 } 747 kmem_free(rule, sizeof (ilb_rule_t)); 748 return (ret); 749 } 750 751 /* 752 * The final part in deleting a rule. Either called directly or by the 753 * taskq dispatched. 754 */ 755 static void 756 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule) 757 { 758 netstackid_t stackid; 759 ilb_server_t *server; 760 761 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 762 763 /* 764 * Let the algorithm know that the rule is going away. The 765 * algorithm fini routine will free all its resources with this 766 * rule. 767 */ 768 tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg); 769 770 while ((server = tmp_rule->ir_servers) != NULL) { 771 mutex_enter(&server->iser_lock); 772 ilb_destroy_nat_src(&server->iser_nat_src); 773 if (tmp_rule->ir_conn_drain_timeout != 0) { 774 /* 775 * The garbage collection thread checks this value 776 * without grabing a lock. So we need to use 777 * atomic_swap_64() to make sure that the value seen 778 * by gc thread is intact. 779 */ 780 (void) atomic_swap_64( 781 (uint64_t *)&server->iser_die_time, lbolt64 + 782 SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout)); 783 } 784 while (server->iser_refcnt > 1) 785 cv_wait(&server->iser_cv, &server->iser_lock); 786 tmp_rule->ir_servers = server->iser_next; 787 kstat_delete_netstack(server->iser_ksp, stackid); 788 kmem_free(server, sizeof (ilb_server_t)); 789 } 790 791 ASSERT(tmp_rule->ir_ksp != NULL); 792 kstat_delete_netstack(tmp_rule->ir_ksp, stackid); 793 794 kmem_free(tmp_rule, sizeof (ilb_rule_t)); 795 } 796 797 /* The routine executed by the delayed rule taskq. 
*/ 798 static void 799 ilb_rule_del_tq(void *arg) 800 { 801 ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs; 802 ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule; 803 804 mutex_enter(&rule->ir_lock); 805 while (rule->ir_refcnt > 1) 806 cv_wait(&rule->ir_cv, &rule->ir_lock); 807 ilb_rule_del_common(ilbs, rule); 808 kmem_free(arg, sizeof (ilb_rule_tq_t)); 809 } 810 811 /* Routine to delete a rule. */ 812 int 813 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name) 814 { 815 ilb_rule_t *tmp_rule; 816 ilb_rule_tq_t *arg; 817 int err; 818 819 mutex_enter(&ilbs->ilbs_g_lock); 820 if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, 821 &err)) == NULL) { 822 mutex_exit(&ilbs->ilbs_g_lock); 823 return (err); 824 } 825 826 /* 827 * First remove the rule from the hash array and the global list so 828 * that no one can find this rule any more. 829 */ 830 ilb_rule_hash_del(tmp_rule); 831 ilb_rule_g_del(ilbs, tmp_rule); 832 mutex_exit(&ilbs->ilbs_g_lock); 833 ILB_RULE_REFRELE(tmp_rule); 834 835 /* 836 * Now no one can find this rule, we can remove it once all 837 * references to it are dropped and all references to the list 838 * of servers are dropped. So dispatch a task to finish the deletion. 839 * We do this instead of letting the last one referencing the 840 * rule do it. The reason is that the last one may be the 841 * interrupt thread. We want to minimize the work it needs to 842 * do. Rule deletion is not a critical task so it can be delayed. 843 */ 844 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); 845 arg->ilbs = ilbs; 846 arg->rule = tmp_rule; 847 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg, 848 TQ_SLEEP); 849 850 return (0); 851 } 852 853 /* 854 * Given an IP address, check to see if there is a rule using this 855 * as the VIP. It can be used to check if we need to drop a fragment. 
856 */ 857 boolean_t 858 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule) 859 { 860 int i; 861 ilb_rule_t *rule; 862 boolean_t ret = B_FALSE; 863 864 i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3], 865 ilbs->ilbs_rule_hash_size); 866 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 867 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; 868 rule = rule->ir_hash_next) { 869 if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) { 870 mutex_enter(&rule->ir_lock); 871 if (rule->ir_flags & ILB_RULE_BUSY) { 872 mutex_exit(&rule->ir_lock); 873 break; 874 } 875 if (ret_rule != NULL) { 876 rule->ir_refcnt++; 877 mutex_exit(&rule->ir_lock); 878 *ret_rule = rule; 879 } else { 880 mutex_exit(&rule->ir_lock); 881 } 882 ret = B_TRUE; 883 break; 884 } 885 } 886 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 887 return (ret); 888 } 889 890 boolean_t 891 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule) 892 { 893 int i; 894 ilb_rule_t *rule; 895 boolean_t ret = B_FALSE; 896 897 i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size); 898 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 899 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; 900 rule = rule->ir_hash_next) { 901 if (rule->ir_target_v6.s6_addr32[3] == addr) { 902 mutex_enter(&rule->ir_lock); 903 if (rule->ir_flags & ILB_RULE_BUSY) { 904 mutex_exit(&rule->ir_lock); 905 break; 906 } 907 if (ret_rule != NULL) { 908 rule->ir_refcnt++; 909 mutex_exit(&rule->ir_lock); 910 *ret_rule = rule; 911 } else { 912 mutex_exit(&rule->ir_lock); 913 } 914 ret = B_TRUE; 915 break; 916 } 917 } 918 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); 919 return (ret); 920 } 921 922 static ilb_rule_t * 923 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 924 int *err) 925 { 926 ilb_rule_t *tmp_rule; 927 928 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 929 930 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 931 tmp_rule = 
tmp_rule->ir_next) { 932 if (tmp_rule->ir_zoneid != zoneid) 933 continue; 934 if (strcasecmp(tmp_rule->ir_name, name) == 0) { 935 mutex_enter(&tmp_rule->ir_lock); 936 if (tmp_rule->ir_flags & ILB_RULE_BUSY) { 937 mutex_exit(&tmp_rule->ir_lock); 938 *err = EINPROGRESS; 939 return (NULL); 940 } 941 tmp_rule->ir_refcnt++; 942 mutex_exit(&tmp_rule->ir_lock); 943 *err = 0; 944 return (tmp_rule); 945 } 946 } 947 *err = ENOENT; 948 return (NULL); 949 } 950 951 /* To find a rule with a given name and zone in the global rule list. */ 952 ilb_rule_t * 953 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 954 int *err) 955 { 956 ilb_rule_t *tmp_rule; 957 958 mutex_enter(&ilbs->ilbs_g_lock); 959 tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err); 960 mutex_exit(&ilbs->ilbs_g_lock); 961 return (tmp_rule); 962 } 963 964 /* Try to match the given packet info and zone ID with a rule. */ 965 static boolean_t 966 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3, 967 int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr) 968 { 969 ilb_rule_t *tmp_rule; 970 971 ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); 972 973 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 974 tmp_rule = tmp_rule->ir_next) { 975 if (tmp_rule->ir_zoneid != zoneid) 976 continue; 977 978 /* 979 * We don't allow the same name in different rules even if all 980 * the other rule components are different. 981 */ 982 if (strcasecmp(tmp_rule->ir_name, name) == 0) 983 return (B_TRUE); 984 985 if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4) 986 continue; 987 988 /* 989 * ir_min_port and ir_max_port are the same if ir_port_range 990 * is false. In this case, if the ir_min|max_port (same) is 991 * outside of the given port range, it is OK. In other cases, 992 * check if min and max port are outside a rule's range. 
993 */ 994 if (tmp_rule->ir_max_port < min_port || 995 tmp_rule->ir_min_port > max_port) { 996 continue; 997 } 998 999 /* 1000 * If l3 is IPv4, the addr passed in is assumed to be 1001 * mapped address. 1002 */ 1003 if (V6_OR_V4_INADDR_ANY(*addr) || 1004 V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) || 1005 IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) { 1006 return (B_TRUE); 1007 } 1008 } 1009 return (B_FALSE); 1010 } 1011 1012 int 1013 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid, 1014 const char *rule_name, ilb_rule_t *in_rule) 1015 { 1016 ilb_rule_t *rule; 1017 int err; 1018 1019 ASSERT((in_rule == NULL && rule_name != NULL) || 1020 (in_rule != NULL && rule_name == NULL)); 1021 if ((rule = in_rule) == NULL) { 1022 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1023 &err)) == NULL) { 1024 return (err); 1025 } 1026 } 1027 mutex_enter(&rule->ir_lock); 1028 rule->ir_flags |= ILB_RULE_ENABLED; 1029 mutex_exit(&rule->ir_lock); 1030 1031 /* Only refrele if the rule is passed in. */ 1032 if (in_rule == NULL) 1033 ILB_RULE_REFRELE(rule); 1034 return (0); 1035 } 1036 1037 int 1038 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid, 1039 const char *rule_name, ilb_rule_t *in_rule) 1040 { 1041 ilb_rule_t *rule; 1042 int err; 1043 1044 ASSERT((in_rule == NULL && rule_name != NULL) || 1045 (in_rule != NULL && rule_name == NULL)); 1046 if ((rule = in_rule) == NULL) { 1047 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1048 &err)) == NULL) { 1049 return (err); 1050 } 1051 } 1052 mutex_enter(&rule->ir_lock); 1053 rule->ir_flags &= ~ILB_RULE_ENABLED; 1054 mutex_exit(&rule->ir_lock); 1055 1056 /* Only refrele if the rule is passed in. */ 1057 if (in_rule == NULL) 1058 ILB_RULE_REFRELE(rule); 1059 return (0); 1060 } 1061 1062 /* 1063 * XXX We should probably have a walker function to walk all rules. For 1064 * now, just add a simple loop for enable/disable/del. 
1065 */ 1066 void 1067 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1068 { 1069 ilb_rule_t *rule; 1070 1071 mutex_enter(&ilbs->ilbs_g_lock); 1072 for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) { 1073 if (rule->ir_zoneid != zoneid) 1074 continue; 1075 /* 1076 * No need to hold the rule as we are holding the global 1077 * lock so it won't go away. Ignore the return value here 1078 * as the rule is provided so the call cannot fail. 1079 */ 1080 (void) ilb_rule_enable(ilbs, zoneid, NULL, rule); 1081 } 1082 mutex_exit(&ilbs->ilbs_g_lock); 1083 } 1084 1085 void 1086 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1087 { 1088 ilb_rule_t *rule; 1089 1090 mutex_enter(&ilbs->ilbs_g_lock); 1091 for (rule = ilbs->ilbs_rule_head; rule != NULL; 1092 rule = rule->ir_next) { 1093 if (rule->ir_zoneid != zoneid) 1094 continue; 1095 (void) ilb_rule_disable(ilbs, zoneid, NULL, rule); 1096 } 1097 mutex_exit(&ilbs->ilbs_g_lock); 1098 } 1099 1100 void 1101 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid) 1102 { 1103 ilb_rule_t *rule; 1104 ilb_rule_tq_t *arg; 1105 1106 mutex_enter(&ilbs->ilbs_g_lock); 1107 while ((rule = ilbs->ilbs_rule_head) != NULL) { 1108 if (rule->ir_zoneid != zoneid) 1109 continue; 1110 ilb_rule_hash_del(rule); 1111 ilb_rule_g_del(ilbs, rule); 1112 mutex_exit(&ilbs->ilbs_g_lock); 1113 1114 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); 1115 arg->ilbs = ilbs; 1116 arg->rule = rule; 1117 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, 1118 arg, TQ_SLEEP); 1119 1120 mutex_enter(&ilbs->ilbs_g_lock); 1121 } 1122 mutex_exit(&ilbs->ilbs_g_lock); 1123 } 1124 1125 /* 1126 * This is just an optimization, so don't grab the global lock. The 1127 * worst case is that we missed a couple packets. 
1128 */ 1129 boolean_t 1130 ilb_has_rules(ilb_stack_t *ilbs) 1131 { 1132 return (ilbs->ilbs_rule_head != NULL); 1133 } 1134 1135 1136 static int 1137 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, 1138 ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable) 1139 { 1140 ilb_server_t *tmp_server; 1141 int ret; 1142 1143 ASSERT((rule == NULL && rule_name != NULL) || 1144 (rule != NULL && rule_name == NULL)); 1145 1146 if (rule == NULL) { 1147 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1148 &ret)) == NULL) { 1149 return (ret); 1150 } 1151 } 1152 1153 /* Once we get a hold on the rule, no server can be added/deleted. */ 1154 for (tmp_server = rule->ir_servers; tmp_server != NULL; 1155 tmp_server = tmp_server->iser_next) { 1156 if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr)) 1157 break; 1158 } 1159 if (tmp_server == NULL) { 1160 ret = ENOENT; 1161 goto done; 1162 } 1163 1164 if (enable) { 1165 ret = rule->ir_alg->ilb_alg_server_enable(tmp_server, 1166 rule->ir_alg->ilb_alg_data); 1167 if (ret == 0) { 1168 tmp_server->iser_enabled = B_TRUE; 1169 tmp_server->iser_die_time = 0; 1170 } 1171 } else { 1172 ret = rule->ir_alg->ilb_alg_server_disable(tmp_server, 1173 rule->ir_alg->ilb_alg_data); 1174 if (ret == 0) { 1175 tmp_server->iser_enabled = B_FALSE; 1176 if (rule->ir_conn_drain_timeout != 0) { 1177 (void) atomic_swap_64( 1178 (uint64_t *)&tmp_server->iser_die_time, 1179 lbolt64 + SEC_TO_TICK( 1180 rule->ir_conn_drain_timeout)); 1181 } 1182 } 1183 } 1184 1185 done: 1186 if (rule_name != NULL) 1187 ILB_RULE_REFRELE(rule); 1188 return (ret); 1189 } 1190 int 1191 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1192 ilb_rule_t *rule, in6_addr_t *addr) 1193 { 1194 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE)); 1195 } 1196 1197 int 1198 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1199 ilb_rule_t *rule, in6_addr_t *addr) 1200 { 1201 return 
(ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE)); 1202 } 1203 1204 /* 1205 * Add a back end server to a rule. If the address is IPv4, it is assumed 1206 * to be passed in as a mapped address. 1207 */ 1208 int 1209 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info) 1210 { 1211 ilb_server_t *server; 1212 netstackid_t stackid; 1213 int ret = 0; 1214 in_port_t min_port, max_port; 1215 in_port_t range; 1216 1217 /* Port is passed in network byte order. */ 1218 min_port = ntohs(info->min_port); 1219 max_port = ntohs(info->max_port); 1220 if (min_port > max_port) 1221 return (EINVAL); 1222 1223 /* min_port == 0 means "all ports". Make it so */ 1224 if (min_port == 0) { 1225 min_port = 1; 1226 max_port = 65535; 1227 } 1228 range = max_port - min_port; 1229 1230 mutex_enter(&rule->ir_lock); 1231 /* If someone is already doing server add/del, sleeps and wait. */ 1232 while (rule->ir_flags & ILB_RULE_BUSY) { 1233 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1234 mutex_exit(&rule->ir_lock); 1235 return (EINTR); 1236 } 1237 } 1238 1239 /* 1240 * Set the rule to be busy to make sure that no new packet can 1241 * use this rule. 1242 */ 1243 rule->ir_flags |= ILB_RULE_BUSY; 1244 1245 /* Now wait for all other guys to finish their work. */ 1246 while (rule->ir_refcnt > 2) { 1247 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1248 mutex_exit(&rule->ir_lock); 1249 ret = EINTR; 1250 goto end; 1251 } 1252 } 1253 mutex_exit(&rule->ir_lock); 1254 1255 /* Sanity checks... */ 1256 if ((IN6_IS_ADDR_V4MAPPED(&info->addr) && 1257 rule->ir_ipver != IPPROTO_IP) || 1258 (!IN6_IS_ADDR_V4MAPPED(&info->addr) && 1259 rule->ir_ipver != IPPROTO_IPV6)) { 1260 ret = EINVAL; 1261 goto end; 1262 } 1263 1264 /* 1265 * Check for valid port range. 1266 * 1267 * For DSR, there can be no port shifting. Hence the server 1268 * specification must be the same as the rule's. 
1269 * 1270 * For half-NAT/NAT, the range must either be 0 (port collapsing) or 1271 * it must be equal to the same value as the rule port range. 1272 * 1273 */ 1274 if (rule->ir_topo == ILB_TOPO_IMPL_DSR) { 1275 if (rule->ir_max_port != max_port || 1276 rule->ir_min_port != min_port) { 1277 ret = EINVAL; 1278 goto end; 1279 } 1280 } else { 1281 if ((range != rule->ir_max_port - rule->ir_min_port) && 1282 range != 0) { 1283 ret = EINVAL; 1284 goto end; 1285 } 1286 } 1287 1288 /* Check for duplicate. */ 1289 for (server = rule->ir_servers; server != NULL; 1290 server = server->iser_next) { 1291 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) || 1292 strcasecmp(server->iser_name, info->name) == 0) { 1293 break; 1294 } 1295 } 1296 if (server != NULL) { 1297 ret = EEXIST; 1298 goto end; 1299 } 1300 1301 if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) { 1302 ret = ENOMEM; 1303 goto end; 1304 } 1305 1306 (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1); 1307 (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr, 1308 sizeof (server->iser_ip_addr)); 1309 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; 1310 server->iser_ksp = ilb_server_kstat_init(stackid, rule, server); 1311 if (server->iser_ksp == NULL) { 1312 kmem_free(server, sizeof (ilb_server_t)); 1313 ret = EINVAL; 1314 goto end; 1315 } 1316 1317 server->iser_stackid = stackid; 1318 server->iser_addr_v6 = info->addr; 1319 server->iser_min_port = min_port; 1320 server->iser_max_port = max_port; 1321 if (min_port != max_port) 1322 server->iser_port_range = B_TRUE; 1323 else 1324 server->iser_port_range = B_FALSE; 1325 1326 /* 1327 * If the rule uses NAT, find/create the NAT source entry to use 1328 * for this server. 1329 */ 1330 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { 1331 in_port_t port; 1332 1333 /* 1334 * If the server uses a port range, our port allocation 1335 * scheme needs to treat it as a wildcard. 
Refer to the 1336 * comments in ilb_nat.c about the scheme. 1337 */ 1338 if (server->iser_port_range) 1339 port = 0; 1340 else 1341 port = server->iser_min_port; 1342 1343 if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src, 1344 &server->iser_addr_v6, port, &rule->ir_nat_src_start, 1345 num_nat_src_v6(&rule->ir_nat_src_start, 1346 &rule->ir_nat_src_end))) != 0) { 1347 kstat_delete_netstack(server->iser_ksp, stackid); 1348 kmem_free(server, sizeof (ilb_server_t)); 1349 goto end; 1350 } 1351 } 1352 1353 /* 1354 * The iser_lock is only used to protect iser_refcnt. All the other 1355 * fields in ilb_server_t should not change, except for iser_enabled. 1356 * The worst thing that can happen if iser_enabled is messed up is 1357 * that one or two packets may not be load balanced to a server 1358 * correctly. 1359 */ 1360 server->iser_refcnt = 1; 1361 server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE : 1362 B_FALSE; 1363 mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL); 1364 cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL); 1365 1366 /* Let the load balancing algorithm know about the addition. */ 1367 ASSERT(rule->ir_alg != NULL); 1368 if ((ret = rule->ir_alg->ilb_alg_server_add(server, 1369 rule->ir_alg->ilb_alg_data)) != 0) { 1370 kstat_delete_netstack(server->iser_ksp, stackid); 1371 kmem_free(server, sizeof (ilb_server_t)); 1372 goto end; 1373 } 1374 1375 /* 1376 * No need to hold ir_lock since no other thread should manipulate 1377 * the following fields until ILB_RULE_BUSY is cleared. 1378 */ 1379 if (rule->ir_servers == NULL) { 1380 server->iser_next = NULL; 1381 } else { 1382 server->iser_next = rule->ir_servers; 1383 } 1384 rule->ir_servers = server; 1385 ILB_R_KSTAT(rule, num_servers); 1386 1387 end: 1388 mutex_enter(&rule->ir_lock); 1389 rule->ir_flags &= ~ILB_RULE_BUSY; 1390 cv_signal(&rule->ir_cv); 1391 mutex_exit(&rule->ir_lock); 1392 return (ret); 1393 } 1394 1395 /* The routine executed by the delayed rule processing taskq. 
*/ 1396 static void 1397 ilb_server_del_tq(void *arg) 1398 { 1399 ilb_server_t *server = (ilb_server_t *)arg; 1400 1401 mutex_enter(&server->iser_lock); 1402 while (server->iser_refcnt > 1) 1403 cv_wait(&server->iser_cv, &server->iser_lock); 1404 kstat_delete_netstack(server->iser_ksp, server->iser_stackid); 1405 kmem_free(server, sizeof (ilb_server_t)); 1406 } 1407 1408 /* 1409 * Delete a back end server from a rule. If the address is IPv4, it is assumed 1410 * to be passed in as a mapped address. 1411 */ 1412 int 1413 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, 1414 ilb_rule_t *rule, in6_addr_t *addr) 1415 { 1416 ilb_server_t *server; 1417 ilb_server_t *prev_server; 1418 int ret = 0; 1419 1420 ASSERT((rule == NULL && rule_name != NULL) || 1421 (rule != NULL && rule_name == NULL)); 1422 if (rule == NULL) { 1423 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, 1424 &ret)) == NULL) { 1425 return (ret); 1426 } 1427 } 1428 1429 mutex_enter(&rule->ir_lock); 1430 /* If someone is already doing server add/del, sleeps and wait. */ 1431 while (rule->ir_flags & ILB_RULE_BUSY) { 1432 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1433 if (rule_name != NULL) { 1434 if (--rule->ir_refcnt <= 2) 1435 cv_signal(&rule->ir_cv); 1436 } 1437 mutex_exit(&rule->ir_lock); 1438 return (EINTR); 1439 } 1440 } 1441 /* 1442 * Set the rule to be busy to make sure that no new packet can 1443 * use this rule. 1444 */ 1445 rule->ir_flags |= ILB_RULE_BUSY; 1446 1447 /* Now wait for all other guys to finish their work. 
*/ 1448 while (rule->ir_refcnt > 2) { 1449 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { 1450 mutex_exit(&rule->ir_lock); 1451 ret = EINTR; 1452 goto end; 1453 } 1454 } 1455 mutex_exit(&rule->ir_lock); 1456 1457 prev_server = NULL; 1458 for (server = rule->ir_servers; server != NULL; 1459 prev_server = server, server = server->iser_next) { 1460 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr)) 1461 break; 1462 } 1463 if (server == NULL) { 1464 ret = ENOENT; 1465 goto end; 1466 } 1467 1468 /* 1469 * Let the load balancing algorithm know about the removal. 1470 * The algorithm may disallow the removal... 1471 */ 1472 if ((ret = rule->ir_alg->ilb_alg_server_del(server, 1473 rule->ir_alg->ilb_alg_data)) != 0) { 1474 goto end; 1475 } 1476 1477 if (prev_server == NULL) 1478 rule->ir_servers = server->iser_next; 1479 else 1480 prev_server->iser_next = server->iser_next; 1481 1482 ILB_R_KSTAT_UPDATE(rule, num_servers, -1); 1483 1484 /* 1485 * Mark the server as disabled so that if there is any sticky cache 1486 * using this server around, it won't be used. 1487 */ 1488 server->iser_enabled = B_FALSE; 1489 1490 mutex_enter(&server->iser_lock); 1491 1492 /* 1493 * De-allocate the NAT source array. The indiviual ilb_nat_src_entry_t 1494 * may not go away if there is still a conn using it. The NAT source 1495 * timer will do the garbage collection. 1496 */ 1497 ilb_destroy_nat_src(&server->iser_nat_src); 1498 1499 /* If there is a hard limit on when a server should die, set it. 
*/ 1500 if (rule->ir_conn_drain_timeout != 0) { 1501 (void) atomic_swap_64((uint64_t *)&server->iser_die_time, 1502 lbolt64 + SEC_TO_TICK(rule->ir_conn_drain_timeout)); 1503 } 1504 1505 if (server->iser_refcnt > 1) { 1506 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq, 1507 server, TQ_SLEEP); 1508 mutex_exit(&server->iser_lock); 1509 } else { 1510 kstat_delete_netstack(server->iser_ksp, server->iser_stackid); 1511 kmem_free(server, sizeof (ilb_server_t)); 1512 } 1513 1514 end: 1515 mutex_enter(&rule->ir_lock); 1516 rule->ir_flags &= ~ILB_RULE_BUSY; 1517 if (rule_name != NULL) 1518 rule->ir_refcnt--; 1519 cv_signal(&rule->ir_cv); 1520 mutex_exit(&rule->ir_lock); 1521 return (ret); 1522 } 1523 1524 /* 1525 * First check if the destination of the ICMP message matches a VIP of 1526 * a rule. If it does not, just return ILB_PASSED. 1527 * 1528 * If the destination matches a VIP: 1529 * 1530 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end 1531 * server. 1532 * 1533 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload 1534 * and see which back end server we should send this message to. And we 1535 * need to do NAT on both the payload message and the outside IP packet. 1536 * 1537 * For other ICMP messages, drop them. 
1538 */ 1539 /* ARGSUSED */ 1540 static int 1541 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, 1542 icmph_t *icmph, ipaddr_t *lb_dst) 1543 { 1544 ipaddr_t vip; 1545 ilb_rule_t *rule; 1546 in6_addr_t addr6; 1547 1548 if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule)) 1549 return (ILB_PASSED); 1550 1551 1552 if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) { 1553 ILB_R_KSTAT(rule, icmp_dropped); 1554 ILB_RULE_REFRELE(rule); 1555 return (ILB_DROPPED); 1556 } 1557 1558 switch (icmph->icmph_type) { 1559 case ICMP_ECHO_REQUEST: 1560 ILB_R_KSTAT(rule, icmp_echo_processed); 1561 ILB_RULE_REFRELE(rule); 1562 1563 icmph->icmph_type = ICMP_ECHO_REPLY; 1564 icmph->icmph_checksum = 0; 1565 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0); 1566 ipha->ipha_ttl = 1567 ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl; 1568 *lb_dst = ipha->ipha_src; 1569 vip = ipha->ipha_dst; 1570 ipha->ipha_dst = ipha->ipha_src; 1571 ipha->ipha_src = vip; 1572 return (ILB_BALANCED); 1573 case ICMP_DEST_UNREACHABLE: { 1574 int ret; 1575 1576 if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) { 1577 ILB_R_KSTAT(rule, icmp_dropped); 1578 ILB_RULE_REFRELE(rule); 1579 return (ILB_DROPPED); 1580 } 1581 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph, 1582 &addr6)) { 1583 ILB_R_KSTAT(rule, icmp_2big_processed); 1584 ret = ILB_BALANCED; 1585 } else { 1586 ILB_R_KSTAT(rule, icmp_2big_dropped); 1587 ret = ILB_DROPPED; 1588 } 1589 ILB_RULE_REFRELE(rule); 1590 IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst); 1591 return (ret); 1592 } 1593 default: 1594 ILB_R_KSTAT(rule, icmp_dropped); 1595 ILB_RULE_REFRELE(rule); 1596 return (ILB_DROPPED); 1597 } 1598 } 1599 1600 /* ARGSUSED */ 1601 static int 1602 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, 1603 icmp6_t *icmp6, in6_addr_t *lb_dst) 1604 { 1605 ilb_rule_t *rule; 1606 1607 if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule)) 1608 return (ILB_PASSED); 1609 1610 if ((uint8_t *)icmp6 + 
sizeof (icmp6_t) > mp->b_wptr) { 1611 ILB_R_KSTAT(rule, icmp_dropped); 1612 ILB_RULE_REFRELE(rule); 1613 return (ILB_DROPPED); 1614 } 1615 1616 switch (icmp6->icmp6_type) { 1617 case ICMP6_ECHO_REQUEST: { 1618 int hdr_len; 1619 1620 ILB_R_KSTAT(rule, icmp_echo_processed); 1621 ILB_RULE_REFRELE(rule); 1622 1623 icmp6->icmp6_type = ICMP6_ECHO_REPLY; 1624 icmp6->icmp6_cksum = ip6h->ip6_plen; 1625 hdr_len = (char *)icmp6 - (char *)ip6h; 1626 icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len, 1627 ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6)); 1628 ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; 1629 ip6h->ip6_hops = 1630 ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops; 1631 *lb_dst = ip6h->ip6_src; 1632 ip6h->ip6_src = ip6h->ip6_dst; 1633 ip6h->ip6_dst = *lb_dst; 1634 return (ILB_BALANCED); 1635 } 1636 case ICMP6_PACKET_TOO_BIG: { 1637 int ret; 1638 1639 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6, 1640 lb_dst)) { 1641 ILB_R_KSTAT(rule, icmp_2big_processed); 1642 ret = ILB_BALANCED; 1643 } else { 1644 ILB_R_KSTAT(rule, icmp_2big_dropped); 1645 ret = ILB_DROPPED; 1646 } 1647 ILB_RULE_REFRELE(rule); 1648 return (ret); 1649 } 1650 default: 1651 ILB_R_KSTAT(rule, icmp_dropped); 1652 ILB_RULE_REFRELE(rule); 1653 return (ILB_DROPPED); 1654 } 1655 } 1656 1657 /* 1658 * Common routine to check an incoming packet and decide what to do with it. 1659 * called by ilb_check_v4|v6(). 
1660 */ 1661 static int 1662 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src, 1663 in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len, 1664 in6_addr_t *lb_dst) 1665 { 1666 in_port_t sport, dport; 1667 tcpha_t *tcph; 1668 udpha_t *udph; 1669 ilb_rule_t *rule; 1670 ilb_server_t *server; 1671 boolean_t balanced; 1672 struct ilb_sticky_s *s = NULL; 1673 int ret; 1674 uint32_t ip_sum, tp_sum; 1675 ilb_nat_info_t info; 1676 uint16_t nat_src_idx; 1677 boolean_t busy; 1678 1679 /* 1680 * We don't really need to switch here since both protocols's 1681 * ports are at the same offset. Just prepare for future protocol 1682 * specific processing. 1683 */ 1684 switch (l4) { 1685 case IPPROTO_TCP: 1686 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr) 1687 return (ILB_DROPPED); 1688 tcph = (tcpha_t *)tph; 1689 sport = tcph->tha_lport; 1690 dport = tcph->tha_fport; 1691 break; 1692 case IPPROTO_UDP: 1693 if (tph + sizeof (udpha_t) > mp->b_wptr) 1694 return (ILB_DROPPED); 1695 udph = (udpha_t *)tph; 1696 sport = udph->uha_src_port; 1697 dport = udph->uha_dst_port; 1698 break; 1699 default: 1700 return (ILB_PASSED); 1701 } 1702 1703 /* Fast path, there is an existing conn. */ 1704 if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport, 1705 pkt_len, lb_dst)) { 1706 return (ILB_BALANCED); 1707 } 1708 1709 /* 1710 * If there is no existing connection for the incoming packet, check 1711 * to see if the packet matches a rule. If not, just let IP decide 1712 * what to do with it. 1713 * 1714 * Note: a reply from back end server should not match a rule. A 1715 * reply should match one existing conn. 1716 */ 1717 rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid, 1718 pkt_len, &busy); 1719 if (rule == NULL) { 1720 /* If the rule is busy, just drop the packet. 
*/ 1721 if (busy) 1722 return (ILB_DROPPED); 1723 else 1724 return (ILB_PASSED); 1725 } 1726 1727 /* 1728 * The packet matches a rule, use the rule load balance algorithm 1729 * to find a server. 1730 */ 1731 balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport, 1732 rule->ir_alg->ilb_alg_data, &server); 1733 /* 1734 * This can only happen if there is no server in a rule or all 1735 * the servers are currently disabled. 1736 */ 1737 if (!balanced) 1738 goto no_server; 1739 1740 /* 1741 * If the rule is sticky enabled, we need to check the sticky table. 1742 * If there is a sticky entry for the client, use the previous server 1743 * instead of the one found above (note that both can be the same). 1744 * If there is no entry for that client, add an entry to the sticky 1745 * table. Both the find and add are done in ilb_sticky_find_add() 1746 * to avoid checking for duplicate when adding an entry. 1747 */ 1748 if (rule->ir_flags & ILB_RULE_STICKY) { 1749 in6_addr_t addr; 1750 1751 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr); 1752 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server, 1753 &s, &nat_src_idx)) == NULL) { 1754 ILB_R_KSTAT(rule, nomem_pkt_dropped); 1755 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); 1756 goto no_server; 1757 } 1758 } 1759 1760 /* 1761 * We are holding a reference on the rule, so the server 1762 * cannot go away. 1763 */ 1764 *lb_dst = server->iser_addr_v6; 1765 ILB_S_KSTAT(server, pkt_processed); 1766 ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len); 1767 1768 switch (rule->ir_topo) { 1769 case ILB_TOPO_IMPL_NAT: { 1770 ilb_nat_src_entry_t *src_ent; 1771 uint16_t *src_idx; 1772 1773 /* 1774 * We create a cache even if it is not a SYN segment. 1775 * The server should return a RST. When we see the 1776 * RST, we will destroy this cache. But by having 1777 * a cache, we know how to NAT the returned RST. 
1778 */ 1779 info.vip = *dst; 1780 info.dport = dport; 1781 info.src = *src; 1782 info.sport = sport; 1783 1784 /* If stickiness is enabled, use the same source address */ 1785 if (s != NULL) 1786 src_idx = &nat_src_idx; 1787 else 1788 src_idx = NULL; 1789 1790 if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src, 1791 &info.nat_src, &info.nat_sport, src_idx)) == NULL) { 1792 if (s != NULL) 1793 ilb_sticky_refrele(s); 1794 ILB_R_KSTAT(rule, pkt_dropped); 1795 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); 1796 ILB_R_KSTAT(rule, noport_pkt_dropped); 1797 ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len); 1798 ret = ILB_DROPPED; 1799 break; 1800 } 1801 info.src_ent = src_ent; 1802 info.nat_dst = server->iser_addr_v6; 1803 if (rule->ir_port_range && server->iser_port_range) { 1804 info.nat_dport = htons(ntohs(dport) - 1805 rule->ir_min_port + server->iser_min_port); 1806 } else { 1807 info.nat_dport = htons(server->iser_min_port); 1808 } 1809 1810 /* 1811 * If ilb_conn_add() fails, it will release the reference on 1812 * sticky info and de-allocate the NAT source port allocated 1813 * above. 
1814 */ 1815 if (ilb_conn_add(ilbs, rule, server, src, sport, dst, 1816 dport, &info, &ip_sum, &tp_sum, s) != 0) { 1817 ILB_R_KSTAT(rule, pkt_dropped); 1818 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); 1819 ILB_R_KSTAT(rule, nomem_pkt_dropped); 1820 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); 1821 ret = ILB_DROPPED; 1822 break; 1823 } 1824 ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE); 1825 ret = ILB_BALANCED; 1826 break; 1827 } 1828 case ILB_TOPO_IMPL_HALF_NAT: 1829 info.vip = *dst; 1830 info.nat_dst = server->iser_addr_v6; 1831 info.dport = dport; 1832 if (rule->ir_port_range && server->iser_port_range) { 1833 info.nat_dport = htons(ntohs(dport) - 1834 rule->ir_min_port + server->iser_min_port); 1835 } else { 1836 info.nat_dport = htons(server->iser_min_port); 1837 } 1838 1839 if (ilb_conn_add(ilbs, rule, server, src, sport, dst, 1840 dport, &info, &ip_sum, &tp_sum, s) != 0) { 1841 ILB_R_KSTAT(rule, pkt_dropped); 1842 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); 1843 ILB_R_KSTAT(rule, nomem_pkt_dropped); 1844 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); 1845 ret = ILB_DROPPED; 1846 break; 1847 } 1848 ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE); 1849 1850 ret = ILB_BALANCED; 1851 break; 1852 case ILB_TOPO_IMPL_DSR: 1853 /* 1854 * By decrementing the sticky refcnt, the period of 1855 * stickiness (life time of ilb_sticky_t) will be 1856 * from now to (now + default expiry time). 1857 */ 1858 if (s != NULL) 1859 ilb_sticky_refrele(s); 1860 ret = ILB_BALANCED; 1861 break; 1862 default: 1863 cmn_err(CE_PANIC, "data corruption unknown topology: %p", 1864 (void *) rule); 1865 break; 1866 } 1867 ILB_RULE_REFRELE(rule); 1868 return (ret); 1869 1870 no_server: 1871 /* This can only happen if there is no server available. 
*/ 1872 ILB_R_KSTAT(rule, pkt_dropped); 1873 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); 1874 ILB_RULE_REFRELE(rule); 1875 return (ILB_DROPPED); 1876 } 1877 1878 int 1879 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4, 1880 uint8_t *tph, ipaddr_t *lb_dst) 1881 { 1882 in6_addr_t v6_src, v6_dst, v6_lb_dst; 1883 int ret; 1884 1885 ASSERT(DB_REF(mp) == 1); 1886 1887 if (l4 == IPPROTO_ICMP) { 1888 return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph, 1889 lb_dst)); 1890 } 1891 1892 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src); 1893 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst); 1894 ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha, 1895 tph, ntohs(ipha->ipha_length), &v6_lb_dst); 1896 if (ret == ILB_BALANCED) 1897 IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst); 1898 return (ret); 1899 } 1900 1901 int 1902 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4, 1903 uint8_t *tph, in6_addr_t *lb_dst) 1904 { 1905 uint32_t pkt_len; 1906 1907 ASSERT(DB_REF(mp) == 1); 1908 1909 if (l4 == IPPROTO_ICMPV6) { 1910 return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph, 1911 lb_dst)); 1912 } 1913 1914 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 1915 return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst, 1916 IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst)); 1917 } 1918 1919 void 1920 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules) 1921 { 1922 ilb_rule_t *tmp_rule; 1923 1924 mutex_enter(&ilbs->ilbs_g_lock); 1925 *num_rules = 0; 1926 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 1927 tmp_rule = tmp_rule->ir_next) { 1928 if (tmp_rule->ir_zoneid == zoneid) 1929 *num_rules += 1; 1930 } 1931 mutex_exit(&ilbs->ilbs_g_lock); 1932 } 1933 1934 int 1935 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1936 uint32_t *num_servers) 1937 { 1938 ilb_rule_t *rule; 1939 int err; 1940 1941 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == 
NULL) 1942 return (err); 1943 *num_servers = rule->ir_kstat.num_servers.value.ui64; 1944 ILB_RULE_REFRELE(rule); 1945 return (0); 1946 } 1947 1948 int 1949 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, 1950 ilb_server_info_t *servers, uint32_t *num_servers) 1951 { 1952 ilb_rule_t *rule; 1953 ilb_server_t *server; 1954 size_t cnt; 1955 int err; 1956 1957 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL) 1958 return (err); 1959 for (server = rule->ir_servers, cnt = *num_servers; 1960 server != NULL && cnt > 0; 1961 server = server->iser_next, cnt--, servers++) { 1962 (void) memcpy(servers->name, server->iser_name, 1963 ILB_SERVER_NAMESZ); 1964 servers->addr = server->iser_addr_v6; 1965 servers->min_port = htons(server->iser_min_port); 1966 servers->max_port = htons(server->iser_max_port); 1967 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0; 1968 servers->err = 0; 1969 } 1970 ILB_RULE_REFRELE(rule); 1971 *num_servers -= cnt; 1972 1973 return (0); 1974 } 1975 1976 void 1977 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names, 1978 char *buf) 1979 { 1980 ilb_rule_t *tmp_rule; 1981 int cnt; 1982 1983 if (*num_names == 0) 1984 return; 1985 1986 mutex_enter(&ilbs->ilbs_g_lock); 1987 for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; 1988 tmp_rule = tmp_rule->ir_next) { 1989 if (tmp_rule->ir_zoneid != zoneid) 1990 continue; 1991 1992 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ); 1993 buf += ILB_RULE_NAMESZ; 1994 if (++cnt == *num_names) 1995 break; 1996 } 1997 mutex_exit(&ilbs->ilbs_g_lock); 1998 *num_names = cnt; 1999 } 2000 2001 int 2002 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd) 2003 { 2004 ilb_rule_t *rule; 2005 int err; 2006 2007 if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) { 2008 return (err); 2009 } 2010 2011 /* 2012 * Except the enabled flags, none of the following will change 2013 * in the life time of a rule. 
So we don't hold the mutex when 2014 * reading them. The worst is to report a wrong enabled flags. 2015 */ 2016 cmd->ip_ver = rule->ir_ipver; 2017 cmd->proto = rule->ir_proto; 2018 cmd->min_port = htons(rule->ir_min_port); 2019 cmd->max_port = htons(rule->ir_max_port); 2020 2021 cmd->vip = rule->ir_target_v6; 2022 cmd->algo = rule->ir_alg_type; 2023 cmd->topo = rule->ir_topo; 2024 2025 cmd->nat_src_start = rule->ir_nat_src_start; 2026 cmd->nat_src_end = rule->ir_nat_src_end; 2027 2028 cmd->conn_drain_timeout = rule->ir_conn_drain_timeout; 2029 cmd->nat_expiry = rule->ir_nat_expiry; 2030 cmd->sticky_expiry = rule->ir_sticky_expiry; 2031 2032 cmd->flags = 0; 2033 if (rule->ir_flags & ILB_RULE_ENABLED) 2034 cmd->flags |= ILB_RULE_ENABLED; 2035 if (rule->ir_flags & ILB_RULE_STICKY) { 2036 cmd->flags |= ILB_RULE_STICKY; 2037 cmd->sticky_mask = rule->ir_sticky_mask; 2038 } 2039 2040 ILB_RULE_REFRELE(rule); 2041 return (0); 2042 } 2043 2044 static void * 2045 ilb_stack_init(netstackid_t stackid, netstack_t *ns) 2046 { 2047 ilb_stack_t *ilbs; 2048 char tq_name[TASKQ_NAMELEN]; 2049 2050 ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP); 2051 ilbs->ilbs_netstack = ns; 2052 2053 ilbs->ilbs_rule_head = NULL; 2054 ilbs->ilbs_g_hash = NULL; 2055 mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL); 2056 2057 ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP); 2058 if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) { 2059 kmem_free(ilbs, sizeof (ilb_stack_t)); 2060 return (NULL); 2061 } 2062 2063 /* 2064 * ilbs_conn/sticky_hash related info is initialized in 2065 * ilb_conn/sticky_hash_init(). 
2066 */ 2067 ilbs->ilbs_conn_taskq = NULL; 2068 ilbs->ilbs_rule_hash_size = ilb_rule_hash_size; 2069 ilbs->ilbs_conn_hash_size = ilb_conn_hash_size; 2070 ilbs->ilbs_c2s_conn_hash = NULL; 2071 ilbs->ilbs_s2c_conn_hash = NULL; 2072 ilbs->ilbs_conn_timer_list = NULL; 2073 2074 ilbs->ilbs_sticky_hash = NULL; 2075 ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size; 2076 ilbs->ilbs_sticky_timer_list = NULL; 2077 ilbs->ilbs_sticky_taskq = NULL; 2078 2079 /* The allocation is done later when there is a rule using NAT mode. */ 2080 ilbs->ilbs_nat_src = NULL; 2081 ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size; 2082 mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL); 2083 ilbs->ilbs_nat_src_tid = 0; 2084 2085 /* For listing the conn hash table */ 2086 mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL); 2087 cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL); 2088 ilbs->ilbs_conn_list_busy = B_FALSE; 2089 ilbs->ilbs_conn_list_cur = 0; 2090 ilbs->ilbs_conn_list_connp = NULL; 2091 2092 /* For listing the sticky hash table */ 2093 mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL); 2094 cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL); 2095 ilbs->ilbs_sticky_list_busy = B_FALSE; 2096 ilbs->ilbs_sticky_list_cur = 0; 2097 ilbs->ilbs_sticky_list_curp = NULL; 2098 2099 (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p", ns); 2100 ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR, 2101 minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 2102 2103 return (ilbs); 2104 } 2105 2106 /* ARGSUSED */ 2107 static void 2108 ilb_stack_shutdown(netstackid_t stackid, void *arg) 2109 { 2110 ilb_stack_t *ilbs = (ilb_stack_t *)arg; 2111 ilb_rule_t *tmp_rule; 2112 2113 ilb_sticky_hash_fini(ilbs); 2114 ilb_conn_hash_fini(ilbs); 2115 mutex_enter(&ilbs->ilbs_g_lock); 2116 while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) { 2117 ilb_rule_hash_del(tmp_rule); 2118 ilb_rule_g_del(ilbs, tmp_rule); 2119 
mutex_exit(&ilbs->ilbs_g_lock); 2120 ilb_rule_del_common(ilbs, tmp_rule); 2121 mutex_enter(&ilbs->ilbs_g_lock); 2122 } 2123 mutex_exit(&ilbs->ilbs_g_lock); 2124 if (ilbs->ilbs_nat_src != NULL) 2125 ilb_nat_src_fini(ilbs); 2126 } 2127 2128 static void 2129 ilb_stack_fini(netstackid_t stackid, void * arg) 2130 { 2131 ilb_stack_t *ilbs = (ilb_stack_t *)arg; 2132 2133 ilb_rule_hash_fini(ilbs); 2134 taskq_destroy(ilbs->ilbs_rule_taskq); 2135 ilb_kstat_g_fini(stackid, ilbs); 2136 kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t)); 2137 kmem_free(ilbs, sizeof (ilb_stack_t)); 2138 } 2139 2140 void 2141 ilb_ddi_g_init(void) 2142 { 2143 netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown, 2144 ilb_stack_fini); 2145 } 2146 2147 void 2148 ilb_ddi_g_destroy(void) 2149 { 2150 netstack_unregister(NS_ILB); 2151 ilb_conn_cache_fini(); 2152 ilb_sticky_cache_fini(); 2153 } 2154