/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/kstat.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ilb.h>
#include <inet/ilb_ip.h>

#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"

/* ILB kmem cache flag */
int ilb_kmem_flags = 0;

/*
 * The default sizes for the different hash tables.  These are global for
 * all stacks; each stack has its own tables, but their sizes are the same.
 */
static size_t ilb_rule_hash_size = 2048;
static size_t ilb_conn_hash_size = 262144;
static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/* Default NAT cache entry expiry time (in seconds). */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/* Default sticky entry expiry time (in seconds). */
static uint32_t ilb_sticky_expiry = 60;

/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define	ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + \
	*((addr) + 1) * 31 + *(addr)) & ((hash_size) - 1))
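/*
 * Illustrative example of ILB_RULE_HASH() above: for the v4-mapped VIP
 * 192.168.10.5 stored in network byte order, the four address bytes are
 * {192, 168, 10, 5}.  The multipliers are powers of 31 (31^3 = 29791,
 * 31^2 = 961), so with the default table size of 2048 the bucket is
 *
 *	(5 * 29791 + 10 * 961 + 168 * 31 + 192) & (2048 - 1)
 *	    = 163965 & 2047 = 125
 *
 * i.e. a simple polynomial hash over the four address bytes.
 */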
/*
 * Note on ILB delayed processing
 *
 * To avoid in-line removal of some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
 * There are three types of ILB taskq:
 *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
 * 2. conn hash handling: created at conn hash initialization time,
 *    ilb_conn_hash_init()
 * 3. sticky hash handling: created at sticky hash initialization time,
 *    ilb_sticky_hash_init()
 *
 * The rule taskq is for processing rule and server removal.  When a user
 * land rule/server removal request comes in, a taskq is dispatched after
 * removing the rule/server from all related hashes.  This taskq will wait
 * until all references to the rule/server are gone before removing it,
 * so the user land thread requesting the removal does not need to wait
 * for the removal to complete.
 *
 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size
 * timers and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively.  Each timer is responsible for one
 * portion (of the same size) of the hash table.  When a timer fires, it
 * dispatches a conn hash taskq to clean up its portion of the table.  This
 * avoids in-line processing of the removal.
 *
 * There is one more piece of delayed processing, the cleanup of the NAT
 * source address table.  We just use a timer to handle it directly instead
 * of using a taskq.  The reason is that the table is small, so using a
 * timer is OK.
 */

/* ILB rule taskq constants. */
#define	ILB_RULE_TASKQ_NUM_THR	20

/* Argument passed to ILB rule taskq routines. */
typedef struct {
	ilb_stack_t	*ilbs;
	ilb_rule_t	*rule;
} ilb_rule_tq_t;

/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);

/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *,
    const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);

static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t,
    const char *, int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);

/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);

/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);

/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))

/*
 * Global kstat instance counter.  When a rule is created, its kstat
 * instance number is assigned by ilb_kstat_instance and then
 * ilb_kstat_instance is incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name
 * ILB_G_KS_CNAME.  A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
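/*
 * For example (illustrative): a rule named "web1" assigned kstat instance
 * 5 appears as <ILB_KSTAT_MOD_NAME>:5:web1:rulestat, and each of its back
 * end servers as <ILB_KSTAT_MOD_NAME>:5:<server-name>:web1-sstat (see
 * ilb_server_kstat_init() below).
 */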
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"

static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ksp;
	ilb_g_kstat_t template = {
		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
	    KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);
	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
	ksp->ks_data = ilbs->ilbs_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)
		    ilbs->ilbs_ksp->ks_private);
		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
		ilbs->ilbs_ksp = NULL;
	}
}

static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
	kstat_t *ksp;
	ilb_rule_kstat_t template = {
		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &rule->ir_kstat, sizeof (template));
	ksp->ks_data = &rule->ir_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
	kstat_t *ksp;
	ilb_server_kstat_t template = {
		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
		{ "ip_address",		KSTAT_DATA_STRING, 0 }
	};
	char cname_buf[KSTAT_STRLEN];

	/* 7 is strlen("-sstat") plus the terminating NUL. */
	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &server->iser_kstat, sizeof (template));
	ksp->ks_data = &server->iser_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_named_setstr(&server->iser_kstat.ip_address,
	    server->iser_ip_addr);
	/* We never change the IP address. */
	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

	kstat_install(ksp);
	return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
	int i;

	/*
	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
	 * the next power of 2.
	 */
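	/*
	 * For example (illustrative): a size of 1000 is not a power of 2
	 * (1000 & 999 != 0), the loop below exits at i = 10 since
	 * 1000 < (1 << 10), and the size is rounded up to 1024.  The
	 * default size of 2048 is already a power of 2 and is left as is.
	 */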
	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_rule_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_rule_hash_size = 1 << i;
	}
	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_g_hash == NULL)
		return;
	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule,
    const in6_addr_t *addr)
{
	int i;

	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
	rule->ir_hash_prev = NULL;
	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

	rule->ir_hash = &ilbs->ilbs_g_hash[i];
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table.  Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	mutex_enter(&rule->ir_hash->ilb_hash_lock);
	if (rule->ir_hash->ilb_hash_rule == rule) {
		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next =
			    rule->ir_hash_next;
		if (rule->ir_hash_next != NULL) {
			rule->ir_hash_next->ir_hash_prev =
			    rule->ir_hash_prev;
		}
	}
	mutex_exit(&rule->ir_hash->ilb_hash_lock);

	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing.  But this requires a
			 * mechanism in IP to re-start the packet
			 * processing.  So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}
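/*
 * Note: on a successful match, ilb_rule_hash() returns the rule with a
 * reference held (ir_refcnt is incremented under ir_lock), and the caller
 * must drop that reference with ILB_RULE_REFRELE() when it is done.  A
 * NULL return with *busy set to B_TRUE means the packet matched a rule
 * that is currently being modified, and the caller should drop the packet.
 */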
/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_t *prev_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	prev_rule = NULL;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule == rule)
			break;
	}
	if (tmp_rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return;
	}
	if (prev_rule == NULL)
		ilbs->ilbs_rule_head = tmp_rule->ir_next;
	else
		prev_rule->ir_next = tmp_rule->ir_next;
	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}

/*
 * Helper routine to calculate how many source addresses are in a given
 * range.
 */
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
	int64_t ret;
	uint32_t addr1, addr2;

	/*
	 * Here we assume that the max number of NAT source addresses is
	 * small enough that the most significant 2 s6_addr32 fields of
	 * the start and end addresses must be equal.
	 */
	addr1 = ntohl(a1->s6_addr32[3]);
	addr2 = ntohl(a2->s6_addr32[3]);
	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
		return (-1);
	}
	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
		return (addr2 - addr1 + 1);
	} else {
		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
		ret <<= 32;
		ret = ret + addr2 - addr1;
		return (ret + 1);
	}
}
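/*
 * For example (illustrative): for the v4-mapped range 192.168.1.10 -
 * 192.168.1.19 only s6_addr32[3] differs, so num_nat_src_v6() returns
 * 19 - 10 + 1 = 10.  A start address greater than the end address
 * returns -1.
 */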
/*
 * Add an ILB rule.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	netstackid_t stackid;
	int ret;
	in_port_t min_port, max_port;
	int64_t num_src;

	/* Sanity checks. */
	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
		return (EINVAL);

	/* Need to support SCTP... */
	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
		return (EINVAL);

	/* For full NAT, the NAT source must be supplied. */
	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
			return (EINVAL);
		}
	}

	/* Check for an invalid sticky mask. */
	if ((cmd->flags & ILB_RULE_STICKY) &&
	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
		return (EINVAL);
	}

	/* Port is passed in network byte order. */
	min_port = ntohs(cmd->min_port);
	max_port = ntohs(cmd->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports".  Make it so. */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}

	/*
	 * Reject loopback, multicast/class D, broadcast and unspecified
	 * addresses for the VIP and the NAT source range.
	 */
	if (cmd->ip_ver == IPPROTO_IP) {
		in_addr_t v4_addr1, v4_addr2;

		v4_addr1 = cmd->vip.s6_addr32[3];
		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
		    v4_addr1 == INADDR_ANY ||
		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
			    v4_addr1 == INADDR_BROADCAST ||
			    v4_addr2 == INADDR_BROADCAST ||
			    v4_addr1 == INADDR_ANY ||
			    v4_addr2 == INADDR_ANY ||
			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			num_src = v4_addr2 - v4_addr1 + 1;
			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
				return (EINVAL);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
			    &cmd->nat_src_end)) < 0 ||
			    num_src > ILB_MAX_NAT_SRC) {
				return (EINVAL);
			}
		}
	}

	mutex_enter(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_g_hash == NULL)
		ilb_rule_hash_init(ilbs);
	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		ilb_conn_hash_init(ilbs);
		ilb_nat_src_init(ilbs);
	}

	/* Make sure that the new rule does not duplicate an existing one. */
	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
	    min_port, max_port, &cmd->vip)) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (EEXIST);
	}

	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (ENOMEM);
	}

	/* ir_name is all 0 to begin with. */
	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

	rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1);
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
		ret = ENOMEM;
		goto error;
	}

	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		rule->ir_nat_src_start = cmd->nat_src_start;
		rule->ir_nat_src_end = cmd->nat_src_end;
	}

	rule->ir_ipver = cmd->ip_ver;
	rule->ir_proto = cmd->proto;
	rule->ir_topo = cmd->topo;

	rule->ir_min_port = min_port;
	rule->ir_max_port = max_port;
	if (rule->ir_min_port != rule->ir_max_port)
		rule->ir_port_range = B_TRUE;
	else
		rule->ir_port_range = B_FALSE;

	rule->ir_zoneid = zoneid;

	rule->ir_target_v6 = cmd->vip;
	rule->ir_servers = NULL;

	/*
	 * The default connection drain timeout is indefinite (value 0),
	 * meaning we will wait for all connections to finish.  So we
	 * can assign cmd->conn_drain_timeout to it directly.
	 */
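	/*
	 * For example (illustrative): a conn_drain_timeout of 60 gives
	 * the existing connections of a removed server 60 seconds to
	 * finish (see the iser_die_time handling in ilb_server_del()),
	 * while 0 lets them linger until they terminate on their own.
	 */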
	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
	if (cmd->nat_expiry != 0) {
		rule->ir_nat_expiry = cmd->nat_expiry;
	} else {
		switch (rule->ir_proto) {
		case IPPROTO_TCP:
			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
			break;
		case IPPROTO_UDP:
			rule->ir_nat_expiry = ilb_conn_udp_expiry;
			break;
		default:
			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: "
			    "%p", (void *)rule);
			break;
		}
	}
	if (cmd->sticky_expiry != 0)
		rule->ir_sticky_expiry = cmd->sticky_expiry;
	else
		rule->ir_sticky_expiry = ilb_sticky_expiry;

	if (cmd->flags & ILB_RULE_STICKY) {
		rule->ir_flags |= ILB_RULE_STICKY;
		rule->ir_sticky_mask = cmd->sticky_mask;
		if (ilbs->ilbs_sticky_hash == NULL)
			ilb_sticky_hash_init(ilbs);
	}
	if (cmd->flags & ILB_RULE_ENABLED)
		rule->ir_flags |= ILB_RULE_ENABLED;

	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

	rule->ir_refcnt = 1;

	switch (cmd->algo) {
	case ILB_ALG_IMPL_ROUNDROBIN:
		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
		break;
	case ILB_ALG_IMPL_HASH_IP:
	case ILB_ALG_IMPL_HASH_IP_SPORT:
	case ILB_ALG_IMPL_HASH_IP_VIP:
		if ((rule->ir_alg = ilb_alg_hash_init(rule,
		    &cmd->algo)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = cmd->algo;
		break;
	default:
		ret = EINVAL;
		goto error;
	}

	/* Add it to the global list and hash array at the end. */
	ilb_rule_g_add(ilbs, rule);
	ilb_rule_hash_add(ilbs, rule, &cmd->vip);

	mutex_exit(&ilbs->ilbs_g_lock);

	return (0);

error:
	mutex_exit(&ilbs->ilbs_g_lock);
	if (rule->ir_ksp != NULL) {
		/* stackid must be initialized if ir_ksp != NULL. */
		kstat_delete_netstack(rule->ir_ksp, stackid);
	}
	kmem_free(rule, sizeof (ilb_rule_t));
	return (ret);
}
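/*
 * Illustrative sketch of how a caller (e.g. the ILB ioctl path) might fill
 * in an ilb_rule_cmd_t for ilb_rule_add().  The ILB_EXAMPLE guard, the
 * helper name and the concrete values are assumptions for the example
 * only, not part of this module.
 */
#ifdef ILB_EXAMPLE
static int
ilb_example_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_cmd_t cmd;

	bzero(&cmd, sizeof (cmd));
	(void) strlcpy(cmd.name, "http-rule", sizeof (cmd.name));
	cmd.ip_ver = IPPROTO_IP;		/* v4-mapped VIP */
	cmd.proto = IPPROTO_TCP;
	cmd.topo = ILB_TOPO_IMPL_DSR;
	cmd.algo = ILB_ALG_IMPL_ROUNDROBIN;
	/* 192.168.10.5 */
	IN6_IPADDR_TO_V4MAPPED(htonl(0xc0a80a05), &cmd.vip);
	cmd.min_port = htons(80);		/* network byte order */
	cmd.max_port = htons(80);
	cmd.flags = ILB_RULE_ENABLED;
	return (ilb_rule_add(ilbs, zoneid, &cmd));
}
#endif /* ILB_EXAMPLE */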
/*
 * The final part in deleting a rule.  Either called directly or by the
 * taskq dispatched.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
	netstackid_t stackid;
	ilb_server_t *server;

	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

	/*
	 * Let the algorithm know that the rule is going away.  The
	 * algorithm fini routine will free all its resources with this
	 * rule.
	 */
	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

	while ((server = tmp_rule->ir_servers) != NULL) {
		mutex_enter(&server->iser_lock);
		ilb_destroy_nat_src(&server->iser_nat_src);
		if (tmp_rule->ir_conn_drain_timeout != 0) {
			/*
			 * The garbage collection thread checks this value
			 * without grabbing a lock.  So we need to use
			 * atomic_swap_64() to make sure that the value seen
			 * by the gc thread is intact.
			 */
			(void) atomic_swap_64(
			    (uint64_t *)&server->iser_die_time,
			    lbolt64 +
			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
		}
		while (server->iser_refcnt > 1)
			cv_wait(&server->iser_cv, &server->iser_lock);
		tmp_rule->ir_servers = server->iser_next;
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

	ASSERT(tmp_rule->ir_ksp != NULL);
	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

	kmem_free(tmp_rule, sizeof (ilb_rule_t));
}

/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

	mutex_enter(&rule->ir_lock);
	while (rule->ir_refcnt > 1)
		cv_wait(&rule->ir_cv, &rule->ir_lock);
	ilb_rule_del_common(ilbs, rule);
	kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
	    &err)) == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * First remove the rule from the hash array and the global list so
	 * that no one can find this rule any more.
	 */
	ilb_rule_hash_del(tmp_rule);
	ilb_rule_g_del(ilbs, tmp_rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	ILB_RULE_REFRELE(tmp_rule);

	/*
	 * Now no one can find this rule.  We can remove it once all
	 * references to it are dropped and all references to the list of
	 * servers are dropped.  So dispatch a task to finish the deletion.
	 * We do this instead of letting the last one referencing the rule
	 * do it, because the last one may be the interrupt thread and we
	 * want to minimize the work it needs to do.  Rule deletion is not
	 * a critical task so it can be delayed.
	 */
	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = tmp_rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);

	return (0);
}
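/*
 * Note: ilb_rule_del() returns to the caller immediately; the dispatched
 * ilb_rule_del_tq() thread blocks until ir_refcnt drops back to 1 (the
 * reference taken at creation time) before ilb_rule_del_common() frees
 * the rule and its servers.
 */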
/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip,
    ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr,
    ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (rule->ir_target_v6.s6_addr32[3] == addr) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;
		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
			mutex_enter(&tmp_rule->ir_lock);
			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&tmp_rule->ir_lock);
				*err = EINPROGRESS;
				return (NULL);
			}
			tmp_rule->ir_refcnt++;
			mutex_exit(&tmp_rule->ir_lock);
			*err = 0;
			return (tmp_rule);
		}
	}
	*err = ENOENT;
	return (NULL);
}

/* Find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (tmp_rule);
}

/* Check whether the given packet info and zone ID match an existing rule. */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		/*
		 * We don't allow the same name in different rules even if
		 * all the other rule components are different.
		 */
		if (strcasecmp(tmp_rule->ir_name, name) == 0)
			return (B_TRUE);

		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
			continue;

		/*
		 * ir_min_port and ir_max_port are the same if ir_port_range
		 * is false.  In this case, if the ir_min|max_port (same) is
		 * outside of the given port range, it is OK.  In other
		 * cases, check if min and max port are outside a rule's
		 * range.
		 */
		if (tmp_rule->ir_max_port < min_port ||
		    tmp_rule->ir_min_port > max_port) {
			continue;
		}

		/*
		 * If l3 is IPv4, the addr passed in is assumed to be a
		 * v4-mapped address.
		 */
		if (V6_OR_V4_INADDR_ANY(*addr) ||
		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}
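/*
 * For example (illustrative): with an existing TCP rule on VIP
 * 192.168.10.5 ports 80-90, adding a second TCP rule on the same VIP with
 * ports 85-100 fails the overlap test above (90 is not below 85 and 80 is
 * not above 100), so ilb_rule_add() returns EEXIST; ports 91-100 would be
 * accepted.
 */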
int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags |= ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up by name. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up by name. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules.  For
 * now, just add a simple loop for enable/disable/del.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL;
	    rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		/*
		 * No need to hold the rule as we are holding the global
		 * lock so it won't go away.  Ignore the return value here
		 * as the rule is provided so the call cannot fail.
		 */
		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL;
	    rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;
	ilb_rule_tq_t *arg;

	mutex_enter(&ilbs->ilbs_g_lock);
	rule = ilbs->ilbs_rule_head;
	while (rule != NULL) {
		/*
		 * Step past rules of other zones; restarting from the head
		 * on a non-matching rule would loop forever.
		 */
		if (rule->ir_zoneid != zoneid) {
			rule = rule->ir_next;
			continue;
		}
		ilb_rule_hash_del(rule);
		ilb_rule_g_del(ilbs, rule);
		mutex_exit(&ilbs->ilbs_g_lock);

		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
		arg->ilbs = ilbs;
		arg->rule = rule;
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
		    arg, TQ_SLEEP);

		/*
		 * The list may have changed while the lock was dropped,
		 * so restart the scan from the head.
		 */
		mutex_enter(&ilbs->ilbs_g_lock);
		rule = ilbs->ilbs_rule_head;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

/*
 * This is just an optimization, so don't grab the global lock.  The
 * worst case is that we miss a couple of packets.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
	return (ilbs->ilbs_rule_head != NULL);
}

static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
	ilb_server_t *tmp_server;
	int ret;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));

	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Once we get a hold on the rule, no server can be added/deleted. */
	for (tmp_server = rule->ir_servers; tmp_server != NULL;
	    tmp_server = tmp_server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
			break;
	}
	if (tmp_server == NULL) {
		ret = ENOENT;
		goto done;
	}

	if (enable) {
		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_TRUE;
			tmp_server->iser_die_time = 0;
		}
	} else {
		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_FALSE;
			if (rule->ir_conn_drain_timeout != 0) {
				(void) atomic_swap_64(
				    (uint64_t *)&tmp_server->iser_die_time,
				    lbolt64 + SEC_TO_TICK(
				    rule->ir_conn_drain_timeout));
			}
		}
	}

done:
	if (rule_name != NULL)
		ILB_RULE_REFRELE(rule);
	return (ret);
}

int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}
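/*
 * Note: disabling a server only stops new connections from being load
 * balanced to it.  When ir_conn_drain_timeout is non-zero, iser_die_time
 * gives the existing connections that long (in clock ticks from now)
 * before the conn hash cleanup reaps them; re-enabling the server clears
 * iser_die_time.
 */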
/*
 * Add a back end server to a rule.  If the address is IPv4, it is assumed
 * to be passed in as a v4-mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
	ilb_server_t *server;
	netstackid_t stackid;
	int ret = 0;
	in_port_t min_port, max_port;
	in_port_t range;

	/* Port is passed in network byte order. */
	min_port = ntohs(info->min_port);
	max_port = ntohs(info->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports".  Make it so. */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}
	range = max_port - min_port;

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing a server add/del, sleep and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}

	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other threads to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Sanity checks... */
	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IP) ||
	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IPV6)) {
		ret = EINVAL;
		goto end;
	}

	/*
	 * Check for a valid port range.
	 *
	 * For DSR, there can be no port shifting.  Hence the server
	 * specification must be the same as the rule's.
	 *
	 * For half-NAT/NAT, the range must either be 0 (port collapsing)
	 * or it must be equal to the rule's port range.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
		if (rule->ir_max_port != max_port ||
		    rule->ir_min_port != min_port) {
			ret = EINVAL;
			goto end;
		}
	} else {
		if ((range != rule->ir_max_port - rule->ir_min_port) &&
		    range != 0) {
			ret = EINVAL;
			goto end;
		}
	}

	/* Check for duplicate. */
	for (server = rule->ir_servers; server != NULL;
	    server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
		    strcasecmp(server->iser_name, info->name) == 0) {
			break;
		}
	}
	if (server != NULL) {
		ret = EEXIST;
		goto end;
	}

	if ((server = kmem_zalloc(sizeof (ilb_server_t),
	    KM_NOSLEEP)) == NULL) {
		ret = ENOMEM;
		goto end;
	}

	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
	    sizeof (server->iser_ip_addr));
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
	if (server->iser_ksp == NULL) {
		kmem_free(server, sizeof (ilb_server_t));
		ret = EINVAL;
		goto end;
	}

	server->iser_stackid = stackid;
	server->iser_addr_v6 = info->addr;
	server->iser_min_port = min_port;
	server->iser_max_port = max_port;
	if (min_port != max_port)
		server->iser_port_range = B_TRUE;
	else
		server->iser_port_range = B_FALSE;

	/*
	 * If the rule uses NAT, find/create the NAT source entry to use
	 * for this server.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/*
		 * If the server uses a port range, our port allocation
		 * scheme needs to treat it as a wildcard.  Refer to the
		 * comments in ilb_nat.c about the scheme.
		 */
		if (server->iser_port_range)
			port = 0;
		else
			port = server->iser_min_port;

		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
		    num_nat_src_v6(&rule->ir_nat_src_start,
		    &rule->ir_nat_src_end))) != 0) {
			kstat_delete_netstack(server->iser_ksp, stackid);
			kmem_free(server, sizeof (ilb_server_t));
			goto end;
		}
	}

	/*
	 * The iser_lock is only used to protect iser_refcnt.  All the
	 * other fields in ilb_server_t should not change, except for
	 * iser_enabled.  The worst thing that can happen if iser_enabled
	 * is messed up is that one or two packets may not be load balanced
	 * to a server correctly.
	 */
	server->iser_refcnt = 1;
	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ?
	    B_TRUE : B_FALSE;
	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

	/* Let the load balancing algorithm know about the addition. */
	ASSERT(rule->ir_alg != NULL);
	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
		goto end;
	}

	/*
	 * No need to hold ir_lock since no other thread should manipulate
	 * the following fields until ILB_RULE_BUSY is cleared.
	 */
	if (rule->ir_servers == NULL) {
		server->iser_next = NULL;
	} else {
		server->iser_next = rule->ir_servers;
	}
	rule->ir_servers = server;
	ILB_R_KSTAT(rule, num_servers);

end:
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}
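/*
 * Illustrative sketch of adding a back end server to a rule.  As above,
 * the ILB_EXAMPLE guard, the helper name and the values are assumptions
 * for the example only.
 */
#ifdef ILB_EXAMPLE
static int
ilb_example_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ilb_server_info_t info;

	bzero(&info, sizeof (info));
	(void) strlcpy(info.name, "websrv-1", sizeof (info.name));
	/* 192.168.10.101, passed as a v4-mapped address */
	IN6_IPADDR_TO_V4MAPPED(htonl(0xc0a80a65), &info.addr);
	info.min_port = htons(80);	/* must match the rule's for DSR */
	info.max_port = htons(80);
	info.flags = ILB_SERVER_ENABLED;
	return (ilb_server_add(ilbs, rule, &info));
}
#endif /* ILB_EXAMPLE */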
/* The routine executed by the delayed server deletion taskq. */
static void
ilb_server_del_tq(void *arg)
{
	ilb_server_t *server = (ilb_server_t *)arg;

	mutex_enter(&server->iser_lock);
	while (server->iser_refcnt > 1)
		cv_wait(&server->iser_cv, &server->iser_lock);
	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
	kmem_free(server, sizeof (ilb_server_t));
}

/*
 * Delete a back end server from a rule.  If the address is IPv4, it is
 * assumed to be passed in as a v4-mapped address.
 */
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	ilb_server_t *server;
	ilb_server_t *prev_server;
	int ret = 0;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing a server add/del, sleep and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			if (rule_name != NULL) {
				if (--rule->ir_refcnt <= 2)
					cv_signal(&rule->ir_cv);
			}
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}

	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other threads to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	prev_server = NULL;
	for (server = rule->ir_servers; server != NULL;
	    prev_server = server, server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
			break;
	}
	if (server == NULL) {
		ret = ENOENT;
		goto end;
	}

	/*
	 * Let the load balancing algorithm know about the removal.
	 * The algorithm may disallow the removal...
	 */
	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		goto end;
	}

	if (prev_server == NULL)
		rule->ir_servers = server->iser_next;
	else
		prev_server->iser_next = server->iser_next;
	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);

	/*
	 * Mark the server as disabled so that if there is any sticky cache
	 * using this server around, it won't be used.
	 */
	server->iser_enabled = B_FALSE;

	mutex_enter(&server->iser_lock);

	/*
	 * De-allocate the NAT source array.  The individual
	 * ilb_nat_src_entry_t may not go away if there is still a conn
	 * using it.  The NAT source timer will do the garbage collection.
	 */
	ilb_destroy_nat_src(&server->iser_nat_src);

	/* If there is a hard limit on when a server should die, set it. */
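	/*
	 * For example (illustrative): with ir_conn_drain_timeout set to
	 * 60, iser_die_time below becomes "now + 60 seconds" expressed in
	 * clock ticks, after which the remaining connections through this
	 * server are cleaned up.
	 */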
	if (rule->ir_conn_drain_timeout != 0) {
		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
		    lbolt64 + SEC_TO_TICK(rule->ir_conn_drain_timeout));
	}
	if (server->iser_refcnt > 1) {
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq,
		    ilb_server_del_tq, server, TQ_SLEEP);
		mutex_exit(&server->iser_lock);
	} else {
		kstat_delete_netstack(server->iser_ksp,
		    server->iser_stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

end:
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	if (rule_name != NULL)
		rule->ir_refcnt--;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}

/*
 * First check if the destination of the ICMP message matches a VIP of
 * a rule.  If it does not, just return ILB_PASSED.
 *
 * If the destination matches a VIP:
 *
 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
 * server.
 *
 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the
 * payload and see which back end server we should send this message to.
 * And we need to do NAT on both the payload message and the outside IP
 * packet.
 *
 * For other ICMP messages, drop them.
 */
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
	ipaddr_t vip;
	ilb_rule_t *rule;
	in6_addr_t addr6;

	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
		return (ILB_PASSED);

	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmph->icmph_type) {
	case ICMP_ECHO_REQUEST:
		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		icmph->icmph_type = ICMP_ECHO_REPLY;
		icmph->icmph_checksum = 0;
		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
		ipha->ipha_ttl =
		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
		*lb_dst = ipha->ipha_src;
		vip = ipha->ipha_dst;
		ipha->ipha_dst = ipha->ipha_src;
		ipha->ipha_src = vip;
		return (ILB_BALANCED);
	case ICMP_DEST_UNREACHABLE: {
		int ret;

		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
			ILB_R_KSTAT(rule, icmp_dropped);
			ILB_RULE_REFRELE(rule);
			return (ILB_DROPPED);
		}
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
		    &addr6)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}
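/*
 * Note: the ICMP "fragmentation needed" case (and the v6 "packet too big"
 * case below) matters for path MTU discovery.  The error refers to a
 * connection that was load balanced earlier, so ilb_check_icmp_conn()
 * looks up the embedded payload in the conn hash to find the back end
 * server that must see the error, and the message is forwarded there
 * rather than answered locally.
 */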
/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
	ilb_rule_t *rule;

	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
		return (ILB_PASSED);

	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_ECHO_REQUEST: {
		int hdr_len;

		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp6->icmp6_cksum = ip6h->ip6_plen;
		hdr_len = (char *)icmp6 - (char *)ip6h;
		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
		ip6h->ip6_hops =
		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
		*lb_dst = ip6h->ip6_src;
		ip6h->ip6_src = ip6h->ip6_dst;
		ip6h->ip6_dst = *lb_dst;
		return (ILB_BALANCED);
	}
	case ICMP6_PACKET_TOO_BIG: {
		int ret;

		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
		    lb_dst)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * Common routine to check an incoming packet and decide what to do with
 * it.  Called by ilb_check_v4|v6().
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph,
    uint32_t pkt_len, in6_addr_t *lb_dst)
{
	in_port_t sport, dport;
	tcpha_t *tcph;
	udpha_t *udph;
	ilb_rule_t *rule;
	ilb_server_t *server;
	boolean_t balanced;
	struct ilb_sticky_s *s = NULL;
	int ret;
	uint32_t ip_sum, tp_sum;
	ilb_nat_info_t info;
	uint16_t nat_src_idx;
	boolean_t busy;

	/*
	 * We don't really need to switch here since both protocols'
	 * ports are at the same offset.  Just prepare for future protocol
	 * specific processing.
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		return (ILB_PASSED);
	}

	/* Fast path, there is an existing conn. */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * If there is no existing connection for the incoming packet,
	 * check to see if the packet matches a rule.  If not, just let IP
	 * decide what to do with it.
	 *
	 * Note: a reply from a back end server should not match a rule.
	 * A reply should match one existing conn.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		/* If the rule is busy, just drop the packet. */
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/*
	 * The packet matches a rule; use the rule's load balancing
	 * algorithm to find a server.
	 */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);

	/*
	 * This can only happen if there is no server in a rule or all
	 * the servers are currently disabled.
	 */
	if (!balanced)
		goto no_server;

	/*
	 * If the rule is sticky enabled, we need to check the sticky
	 * table.  If there is a sticky entry for the client, use the
	 * previous server instead of the one found above (note that both
	 * can be the same).  If there is no entry for that client, add an
	 * entry to the sticky table.  Both the find and add are done in
	 * ilb_sticky_find_add() to avoid checking for duplicates when
	 * adding an entry.
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped,
			    pkt_len);
			goto no_server;
		}
	}

	/*
	 * We are holding a reference on the rule, so the server
	 * cannot go away.
	 */
	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
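	/*
	 * Topology recap (see the cases below): DSR forwards the packet
	 * unmodified and the server answers the client directly; half-NAT
	 * rewrites only the destination (VIP -> server); full NAT rewrites
	 * both source and destination, drawing the new source from the
	 * rule's NAT source address range.
	 */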
	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t *src_ent;
		uint16_t *src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address. */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped,
			    pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * the sticky info and de-allocate the NAT source port
		 * allocated above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped,
			    pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum,
		    B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped,
			    pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum,
		    B_TRUE);
		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (life time of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		cmn_err(CE_PANIC, "data corruption: unknown topology: %p",
		    (void *)rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* This can only happen if there is no server available. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}
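/*
 * Note: ilb_check() returns ILB_PASSED when the packet matches no rule
 * (IP continues normal input processing), ILB_BALANCED when *lb_dst has
 * been set to the chosen destination, and ILB_DROPPED when the caller
 * should free the packet.
 */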
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    int l4, uint8_t *tph, ipaddr_t *lb_dst)
{
	in6_addr_t v6_src, v6_dst, v6_lb_dst;
	int ret;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMP) {
		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
		    lb_dst));
	}

	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4,
	    ipha, tph, ntohs(ipha->ipha_length), &v6_lb_dst);
	if (ret == ILB_BALANCED)
		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
	return (ret);
}

int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    int l4, uint8_t *tph, in6_addr_t *lb_dst)
{
	uint32_t pkt_len;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMPV6) {
		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
		    lb_dst));
	}

	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}

void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	*num_rules = 0;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid == zoneid)
			*num_rules += 1;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
		return (err);
	*num_servers = rule->ir_kstat.num_servers.value.ui64;
	ILB_RULE_REFRELE(rule);
	return (0);
}

int
ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_server_info_t *servers, uint32_t *num_servers)
{
	ilb_rule_t *rule;
	ilb_server_t *server;
	size_t cnt;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
		return (err);
	for (server = rule->ir_servers, cnt = *num_servers;
	    server != NULL && cnt > 0;
	    server = server->iser_next, cnt--, servers++) {
		(void) memcpy(servers->name, server->iser_name,
		    ILB_SERVER_NAMESZ);
		servers->addr = server->iser_addr_v6;
		servers->min_port = htons(server->iser_min_port);
		servers->max_port = htons(server->iser_max_port);
		servers->flags = server->iser_enabled ?
		    ILB_SERVER_ENABLED : 0;
		servers->err = 0;
	}
	ILB_RULE_REFRELE(rule);
	*num_servers -= cnt;

	return (0);
}
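/*
 * Note: ilb_get_servers() above and ilb_get_rulenames() below treat
 * *num_servers / *num_names as in/out parameters: on input, the number
 * of entries the caller's buffer can hold; on output, the number of
 * entries actually copied out.
 */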
void
ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
    char *buf)
{
	ilb_rule_t *tmp_rule;
	int cnt;

	if (*num_names == 0)
		return;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
		buf += ILB_RULE_NAMESZ;
		if (++cnt == *num_names)
			break;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	*num_names = cnt;
}

int
ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL)
		return (err);

	/*
	 * Except for the enabled flag, none of the following fields will
	 * change during the lifetime of a rule, so we don't hold the mutex
	 * while reading them.  The worst that can happen is that we report
	 * a stale enabled flag.
	 */
	cmd->ip_ver = rule->ir_ipver;
	cmd->proto = rule->ir_proto;
	cmd->min_port = htons(rule->ir_min_port);
	cmd->max_port = htons(rule->ir_max_port);

	cmd->vip = rule->ir_target_v6;
	cmd->algo = rule->ir_alg_type;
	cmd->topo = rule->ir_topo;

	cmd->nat_src_start = rule->ir_nat_src_start;
	cmd->nat_src_end = rule->ir_nat_src_end;

	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
	cmd->nat_expiry = rule->ir_nat_expiry;
	cmd->sticky_expiry = rule->ir_sticky_expiry;

	cmd->flags = 0;
	if (rule->ir_flags & ILB_RULE_ENABLED)
		cmd->flags |= ILB_RULE_ENABLED;
	if (rule->ir_flags & ILB_RULE_STICKY) {
		cmd->flags |= ILB_RULE_STICKY;
		cmd->sticky_mask = rule->ir_sticky_mask;
	}

	ILB_RULE_REFRELE(rule);
	return (0);
}

static void *
ilb_stack_init(netstackid_t stackid, netstack_t *ns)
{
	ilb_stack_t *ilbs;
	char tq_name[TASKQ_NAMELEN];

	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
	ilbs->ilbs_netstack = ns;

	ilbs->ilbs_rule_head = NULL;
	ilbs->ilbs_g_hash = NULL;
	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);

	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
		/* Don't leak the kstat buffer allocated just above. */
		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
		kmem_free(ilbs, sizeof (ilb_stack_t));
		return (NULL);
	}

	/*
	 * ilbs_conn/sticky_hash related info is initialized in
	 * ilb_conn/sticky_hash_init().
	 */
	ilbs->ilbs_conn_taskq = NULL;
	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
	ilbs->ilbs_c2s_conn_hash = NULL;
	ilbs->ilbs_s2c_conn_hash = NULL;
	ilbs->ilbs_conn_timer_list = NULL;

	ilbs->ilbs_sticky_hash = NULL;
	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
	ilbs->ilbs_sticky_timer_list = NULL;
	ilbs->ilbs_sticky_taskq = NULL;

	/* The allocation is done later when there is a rule using NAT mode. */
	ilbs->ilbs_nat_src = NULL;
	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
	ilbs->ilbs_nat_src_tid = 0;

	/* For listing the conn hash table */
	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	ilbs->ilbs_conn_list_cur = 0;
	ilbs->ilbs_conn_list_connp = NULL;

	/* For listing the sticky hash table */
	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	ilbs->ilbs_sticky_list_cur = 0;
	ilbs->ilbs_sticky_list_curp = NULL;

	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
	    (void *)ns);
	ilbs->ilbs_rule_taskq = taskq_create(tq_name,
	    ILB_RULE_TASKQ_NUM_THR, minclsyspri, 1, INT_MAX,
	    TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	return (ilbs);
}

/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_rule_t *tmp_rule;

	ilb_sticky_hash_fini(ilbs);
	ilb_conn_hash_fini(ilbs);
	mutex_enter(&ilbs->ilbs_g_lock);
	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
		ilb_rule_hash_del(tmp_rule);
		ilb_rule_g_del(ilbs, tmp_rule);
		mutex_exit(&ilbs->ilbs_g_lock);
		ilb_rule_del_common(ilbs, tmp_rule);
		mutex_enter(&ilbs->ilbs_g_lock);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_nat_src != NULL)
		ilb_nat_src_fini(ilbs);
}

static void
ilb_stack_fini(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;

	ilb_rule_hash_fini(ilbs);
	taskq_destroy(ilbs->ilbs_rule_taskq);
	ilb_kstat_g_fini(stackid, ilbs);
	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
	kmem_free(ilbs, sizeof (ilb_stack_t));
}

void
ilb_ddi_g_init(void)
{
	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
	    ilb_stack_fini);
}
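/*
 * Note: netstack_register() splits teardown in two.  ilb_stack_shutdown()
 * runs first and removes all rules synchronously (via
 * ilb_rule_del_common()) while the stack is still partly alive;
 * ilb_stack_fini() runs once all references to the netstack are gone and
 * frees what remains.
 */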
void
ilb_ddi_g_destroy(void)
{
	netstack_unregister(NS_ILB);
	ilb_conn_cache_fini();
	ilb_sticky_cache_fini();
}