xref: /illumos-gate/usr/src/uts/common/inet/ilb/ilb.c (revision 986b458dd38036ac346e3cedf55812c5fad90cde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/kmem.h>
28 #include <sys/ksynch.h>
29 #include <sys/systm.h>
30 #include <sys/socket.h>
31 #include <sys/disp.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/strsun.h>
35 #include <sys/sdt.h>
36 #include <sys/atomic.h>
37 #include <netinet/in.h>
38 #include <inet/ip.h>
39 #include <inet/ip6.h>
40 #include <inet/tcp.h>
41 #include <inet/udp_impl.h>
42 #include <inet/kstatcom.h>
43 
44 #include <inet/ilb_ip.h>
45 #include "ilb_alg.h"
46 #include "ilb_nat.h"
47 #include "ilb_conn.h"
48 
/* ILB kmem cache flag */
int ilb_kmem_flags = 0;

/*
 * The default size for the different hash tables.  Global for all stacks.
 * But each stack has its own table, just that their sizes are the same.
 */
/*
 * Rule hash: buckets are selected by ILB_RULE_HASH() with a
 * (hash_size - 1) mask, so the size in use needs to be a power of 2.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/*
 * Default NAT cache entry expiry time.  Used by ilb_rule_add() when the
 * rule command does not supply its own nat_expiry (presumably in seconds --
 * TODO confirm).
 */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/*
 * Default sticky entry expiry time.  Used by ilb_rule_add() when the rule
 * command does not supply its own sticky_expiry.
 */
static uint32_t ilb_sticky_expiry = 60;

/*
 * addr is assumed to be a uint8_t * to an ipaddr_t.
 *
 * The four bytes are combined as a base-31 polynomial (31^3, 31^2, 31, 1)
 * and the result is masked with (hash_size - 1).  The mask only reaches
 * every bucket when hash_size is a power of 2.  Note that addr is
 * evaluated several times, so it must not have side effects.
 */
#define	ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
	*(addr)) & ((hash_size) - 1))
76 
77 /*
78  * Note on ILB delayed processing
79  *
80  * To avoid in line removal on some of the data structures, such as rules,
81  * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
82  * There are three types of ILB taskq:
83  *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
85  * 2. conn hash handling: created at conn hash initialization time,
86  *                        ilb_conn_hash_init()
87  * 3. sticky hash handling: created at sticky hash initialization time,
88  *                          ilb_sticky_hash_init()
89  *
90  * The rule taskq is for processing rule and server removal.  When a user
91  * land rule/server removal request comes in, a taskq is dispatched after
92  * removing the rule/server from all related hashes.  This taskq will wait
93  * until all references to the rule/server are gone before removing it.
94  * So the user land thread requesting the removal does not need to wait
95  * for the removal completion.
96  *
97  * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
98  * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size timers
99  * and ilb_sticky_timer_size timers running for ilb_conn_hash and
100  * ilb_sticky_hash cleanup respectively.   Each timer is responsible for one
101  * portion (same size) of the hash table.  When a timer fires, it dispatches
102  * a conn hash taskq to clean up its portion of the table.  This avoids in
103  * line processing of the removal.
104  *
105  * There is another delayed processing, the clean up of NAT source address
106  * table.  We just use the timer to directly handle it instead of using
107  * a taskq.  The reason is that the table is small so it is OK to use the
108  * timer.
109  */
110 
/* ILB rule taskq constants. */
#define	ILB_RULE_TASKQ_NUM_THR	20

/*
 * Argument passed to ILB rule taskq routines.  Allocated by ilb_rule_del()
 * and freed by ilb_rule_del_tq() once the rule has been torn down.
 */
typedef	struct {
	ilb_stack_t	*ilbs;	/* stack the rule belongs to */
	ilb_rule_t	*rule;	/* rule to be deleted */
} ilb_rule_tq_t;
119 
120 /* kstat handling routines. */
121 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
122 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
123 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
124 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
125     ilb_server_t *);
126 
127 /* Rule hash handling routines. */
128 static void ilb_rule_hash_init(ilb_stack_t *);
129 static void ilb_rule_hash_fini(ilb_stack_t *);
130 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
131 static void ilb_rule_hash_del(ilb_rule_t *);
132 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
133     in_port_t, zoneid_t, uint32_t, boolean_t *);
134 
135 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
136 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
137 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
138 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
139     int *);
140 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
141     int, in_port_t, in_port_t, const in6_addr_t *);
142 
143 /* Back end server handling routines. */
144 static void ilb_server_free(ilb_server_t *);
145 
146 /* Network stack handling routines. */
147 static void *ilb_stack_init(netstackid_t, netstack_t *);
148 static void ilb_stack_shutdown(netstackid_t, void *);
149 static void ilb_stack_fini(netstackid_t, void *);
150 
151 /* Sticky connection handling routines. */
152 static void ilb_rule_sticky_init(ilb_rule_t *);
153 static void ilb_rule_sticky_fini(ilb_rule_t *);
154 
/*
 * Handy macro to check for unspecified address.  addr is a pointer to an
 * in6_addr_t; a v4-mapped address is tested against the v4-mapped "any"
 * address, anything else against the IPv6 unspecified address.  addr is
 * evaluated more than once.
 */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))

/*
 * Global kstat instance counter.  When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.  The increment is done atomically in ilb_rule_add().
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"
174 
175 static kstat_t *
176 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
177 {
178 	kstat_t *ksp;
179 	ilb_g_kstat_t template = {
180 		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
181 		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
182 		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
183 	};
184 
185 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
186 	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
187 	    KSTAT_FLAG_VIRTUAL, stackid);
188 	if (ksp == NULL)
189 		return (NULL);
190 	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
191 	ksp->ks_data = ilbs->ilbs_kstat;
192 	ksp->ks_private = (void *)(uintptr_t)stackid;
193 
194 	kstat_install(ksp);
195 	return (ksp);
196 }
197 
198 static void
199 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
200 {
201 	if (ilbs->ilbs_ksp != NULL) {
202 		ASSERT(stackid == (netstackid_t)(uintptr_t)
203 		    ilbs->ilbs_ksp->ks_private);
204 		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
205 		ilbs->ilbs_ksp = NULL;
206 	}
207 }
208 
209 static kstat_t *
210 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
211 {
212 	kstat_t *ksp;
213 	ilb_rule_kstat_t template = {
214 		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
215 		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
216 		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
217 		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
218 		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
219 		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
220 		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
221 		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
222 		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
223 		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
224 		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
225 		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
226 		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
227 	};
228 
229 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
230 	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
231 	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
232 	if (ksp == NULL)
233 		return (NULL);
234 
235 	bcopy(&template, &rule->ir_kstat, sizeof (template));
236 	ksp->ks_data = &rule->ir_kstat;
237 	ksp->ks_private = (void *)(uintptr_t)stackid;
238 
239 	kstat_install(ksp);
240 	return (ksp);
241 }
242 
243 static kstat_t *
244 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
245     ilb_server_t *server)
246 {
247 	kstat_t *ksp;
248 	ilb_server_kstat_t template = {
249 		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
250 		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
251 		{ "ip_address",		KSTAT_DATA_STRING, 0 }
252 	};
253 	char cname_buf[KSTAT_STRLEN];
254 
255 	/* 7 is "-sstat" */
256 	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
257 	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
258 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
259 	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
260 	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
261 	if (ksp == NULL)
262 		return (NULL);
263 
264 	bcopy(&template, &server->iser_kstat, sizeof (template));
265 	ksp->ks_data = &server->iser_kstat;
266 	ksp->ks_private = (void *)(uintptr_t)stackid;
267 
268 	kstat_named_setstr(&server->iser_kstat.ip_address,
269 	    server->iser_ip_addr);
270 	/* We never change the IP address */
271 	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
272 
273 	kstat_install(ksp);
274 	return (ksp);
275 }
276 
277 /* Initialize the rule hash table. */
278 static void
279 ilb_rule_hash_init(ilb_stack_t *ilbs)
280 {
281 	int i;
282 
283 	/*
284 	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
285 	 * the next power of 2.
286 	 */
287 	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
288 		for (i = 0; i < 31; i++) {
289 			if (ilbs->ilbs_rule_hash_size < (1 << i))
290 				break;
291 		}
292 		ilbs->ilbs_rule_hash_size = 1 << i;
293 	}
294 	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
295 	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
296 	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
297 		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
298 		    MUTEX_DEFAULT, NULL);
299 	}
300 }
301 
302 /* Clean up the rule hash table. */
303 static void
304 ilb_rule_hash_fini(ilb_stack_t *ilbs)
305 {
306 	if (ilbs->ilbs_g_hash == NULL)
307 		return;
308 	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
309 	    ilbs->ilbs_rule_hash_size);
310 }
311 
312 /* Add a rule to the rule hash table. */
313 static void
314 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
315 {
316 	int i;
317 
318 	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
319 	    ilbs->ilbs_rule_hash_size);
320 	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
321 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
322 	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
323 	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
324 		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
325 	rule->ir_hash_prev = NULL;
326 	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
327 
328 	rule->ir_hash = &ilbs->ilbs_g_hash[i];
329 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
330 }
331 
332 /*
333  * Remove a rule from the rule hash table.  Note that the rule is not freed
334  * in this routine.
335  */
336 static void
337 ilb_rule_hash_del(ilb_rule_t *rule)
338 {
339 	mutex_enter(&rule->ir_hash->ilb_hash_lock);
340 	if (rule->ir_hash->ilb_hash_rule == rule) {
341 		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
342 		if (rule->ir_hash_next != NULL)
343 			rule->ir_hash_next->ir_hash_prev = NULL;
344 	} else {
345 		if (rule->ir_hash_prev != NULL)
346 			rule->ir_hash_prev->ir_hash_next =
347 			    rule->ir_hash_next;
348 		if (rule->ir_hash_next != NULL) {
349 			rule->ir_hash_next->ir_hash_prev =
350 			    rule->ir_hash_prev;
351 		}
352 	}
353 	mutex_exit(&rule->ir_hash->ilb_hash_lock);
354 
355 	rule->ir_hash_next = NULL;
356 	rule->ir_hash_prev = NULL;
357 	rule->ir_hash = NULL;
358 }
359 
/*
 * Given the info of a packet, look for a match in the rule hash table.
 *
 * l3 and l4 are compared with the rule's ir_ipver and ir_proto, addr/port
 * are the packet's destination as seen by the rule (port is converted with
 * ntohs() before it is compared), zoneid must equal the rule's zone and len
 * is the byte count added to the rule's kstats when the packet is not
 * handled.
 *
 * Returns the first matching rule with its ir_refcnt incremented, or NULL
 * if there is no match, if the matching rule is disabled, or if it is busy.
 * *busy is set to B_TRUE only in the last case; the caller is told to drop
 * the packet that way (see the XXX comment below).
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	/*
	 * The bucket is derived from v4_addr for both IP versions; this is
	 * presumably the last 32 bits of addr, matching what
	 * ilb_rule_hash_add() hashes -- TODO confirm against the macro.
	 */
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		/* Port: exact match for a single port rule, else a range. */
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		/* A rule whose target is the "any" address matches all. */
		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing.  But this requires a
			 * mechanism in IP to re-start the packet
			 * processing.  So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			/*
			 * Take a reference for the caller.  A rule on the
			 * hash already holds the initial reference set in
			 * ilb_rule_add(), hence the count cannot be 1 here.
			 */
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}
440 
441 /*
442  * Add a rule to the global rule list.  This list is for finding all rules
443  * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
444  */
445 static void
446 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
447 {
448 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
449 	rule->ir_next = ilbs->ilbs_rule_head;
450 	ilbs->ilbs_rule_head = rule;
451 	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
452 }
453 
454 /* The call is assumed to hold the ilbs_g_lock. */
455 static void
456 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
457 {
458 	ilb_rule_t *tmp_rule;
459 	ilb_rule_t *prev_rule;
460 
461 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
462 	prev_rule = NULL;
463 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
464 	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
465 		if (tmp_rule == rule)
466 			break;
467 	}
468 	if (tmp_rule == NULL) {
469 		mutex_exit(&ilbs->ilbs_g_lock);
470 		return;
471 	}
472 	if (prev_rule == NULL)
473 		ilbs->ilbs_rule_head = tmp_rule->ir_next;
474 	else
475 		prev_rule->ir_next = tmp_rule->ir_next;
476 	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
477 }
478 
479 /*
480  * Helper routine to calculate how many source addresses are in a given
481  * range.
482  */
483 static int64_t
484 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
485 {
486 	int64_t ret;
487 	uint32_t addr1, addr2;
488 
489 	/*
490 	 * Here we assume that the max number of NAT source cannot be
491 	 * large such that the most significant 2 s6_addr32 must be
492 	 * equal.
493 	 */
494 	addr1 = ntohl(a1->s6_addr32[3]);
495 	addr2 = ntohl(a2->s6_addr32[3]);
496 	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
497 	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
498 	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
499 	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
500 		return (-1);
501 	}
502 	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
503 		return (addr2 - addr1 + 1);
504 	} else {
505 		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
506 		ret <<= 32;
507 		ret = ret + addr1 - addr2;
508 		return (ret + 1);
509 	}
510 }
511 
/*
 * Add an ILB rule.
 *
 * Validates the rule command, lazily creates the stack's hash tables,
 * makes sure the rule does not clash with an existing one in the same
 * zone, then allocates and fills in the rule and puts it on the global
 * rule list and the rule hash.
 *
 * Returns 0 on success, EINVAL if the command is not acceptable, EEXIST if
 * it duplicates an existing rule and ENOMEM if the rule, its kstat or its
 * algorithm state cannot be allocated.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	netstackid_t stackid;
	int ret;
	in_port_t min_port, max_port;
	int64_t num_src;

	/* Sanity checks. */
	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
		return (EINVAL);

	/* Need to support SCTP... */
	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
		return (EINVAL);

	/* For full NAT, the NAT source must be supplied. */
	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
			return (EINVAL);
		}
	}

	/* Check invalid mask */
	if ((cmd->flags & ILB_RULE_STICKY) &&
	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
		return (EINVAL);
	}

	/* Port is passed in network byte order. */
	min_port = ntohs(cmd->min_port);
	max_port = ntohs(cmd->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports". Make it so */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}

	/* Funny address checking. */
	if (cmd->ip_ver == IPPROTO_IP) {
		in_addr_t v4_addr1, v4_addr2;

		/* The VIP is checked as is, without ntohl(). */
		v4_addr1 = cmd->vip.s6_addr32[3];
		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
		    v4_addr1 == INADDR_ANY ||
		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			/*
			 * NOTE(review): unlike the VIP above, the NAT source
			 * addresses are converted with ntohl() before the
			 * same first-byte loopback test and CLASSD() test
			 * are applied.  Where ntohl() swaps bytes, the two
			 * sets of tests look at different octets -- confirm
			 * which byte order these tests expect.
			 */
			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
			    v4_addr1 == INADDR_BROADCAST ||
			    v4_addr2 == INADDR_BROADCAST ||
			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			/* The range is inclusive, hence the + 1. */
			num_src = v4_addr2 - v4_addr1 + 1;
			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
				return (EINVAL);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
			    &cmd->nat_src_end)) < 0 ||
			    num_src > ILB_MAX_NAT_SRC) {
				return (EINVAL);
			}
		}
	}

	/*
	 * The tables are created on first use, under ilbs_g_lock, rather
	 * than when the stack is set up.
	 */
	mutex_enter(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_g_hash == NULL)
		ilb_rule_hash_init(ilbs);
	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		ilb_conn_hash_init(ilbs);
		ilb_nat_src_init(ilbs);
	}

	/* Make sure that the new rule does not duplicate an existing one. */
	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
	    min_port, max_port, &cmd->vip)) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (EEXIST);
	}

	/* ilbs_g_lock is held here, so do not sleep for memory. */
	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (ENOMEM);
	}

	/*
	 * ir_name is all 0 to begin with, so copying one byte less than
	 * ILB_RULE_NAMESZ leaves a zero byte at offset ILB_RULE_NAMESZ - 1.
	 */
	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

	rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1);
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
		ret = ENOMEM;
		goto error;
	}

	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		rule->ir_nat_src_start = cmd->nat_src_start;
		rule->ir_nat_src_end = cmd->nat_src_end;
	}

	rule->ir_ipver = cmd->ip_ver;
	rule->ir_proto = cmd->proto;
	rule->ir_topo = cmd->topo;

	/* Ports are kept in host byte order in the rule. */
	rule->ir_min_port = min_port;
	rule->ir_max_port = max_port;
	if (rule->ir_min_port != rule->ir_max_port)
		rule->ir_port_range = B_TRUE;
	else
		rule->ir_port_range = B_FALSE;

	rule->ir_zoneid = zoneid;

	rule->ir_target_v6 = cmd->vip;
	rule->ir_servers = NULL;

	/*
	 * The default connection drain timeout is indefinite (value 0),
	 * meaning we will wait for all connections to finish.  So we
	 * can assign cmd->conn_drain_timeout to it directly.
	 */
	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
	/* A value of 0 in the command selects the per protocol default. */
	if (cmd->nat_expiry != 0) {
		rule->ir_nat_expiry = cmd->nat_expiry;
	} else {
		switch (rule->ir_proto) {
		case IPPROTO_TCP:
			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
			break;
		case IPPROTO_UDP:
			rule->ir_nat_expiry = ilb_conn_udp_expiry;
			break;
		default:
			/* Other protocols were rejected at the top. */
			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
			    (void *)rule);
			break;
		}
	}
	if (cmd->sticky_expiry != 0)
		rule->ir_sticky_expiry = cmd->sticky_expiry;
	else
		rule->ir_sticky_expiry = ilb_sticky_expiry;

	if (cmd->flags & ILB_RULE_STICKY) {
		rule->ir_flags |= ILB_RULE_STICKY;
		rule->ir_sticky_mask = cmd->sticky_mask;
		/* The sticky hash is also created on first use. */
		if (ilbs->ilbs_sticky_hash == NULL)
			ilb_sticky_hash_init(ilbs);
	}
	if (cmd->flags & ILB_RULE_ENABLED)
		rule->ir_flags |= ILB_RULE_ENABLED;

	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * The initial reference.  The deletion code waits for the count to
	 * drop back to 1 before it frees the rule.
	 */
	rule->ir_refcnt = 1;

	switch (cmd->algo) {
	case ILB_ALG_IMPL_ROUNDROBIN:
		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
		break;
	case ILB_ALG_IMPL_HASH_IP:
	case ILB_ALG_IMPL_HASH_IP_SPORT:
	case ILB_ALG_IMPL_HASH_IP_VIP:
		if ((rule->ir_alg = ilb_alg_hash_init(rule,
		    &cmd->algo)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = cmd->algo;
		break;
	default:
		ret = EINVAL;
		goto error;
	}

	/* Add it to the global list and hash array at the end. */
	ilb_rule_g_add(ilbs, rule);
	ilb_rule_hash_add(ilbs, rule, &cmd->vip);

	mutex_exit(&ilbs->ilbs_g_lock);

	return (0);

error:
	/*
	 * NOTE(review): when this label is reached from the algorithm
	 * switch above, ir_lock and ir_cv have already been initialized
	 * but are freed here without mutex_destroy()/cv_destroy() --
	 * confirm whether that is acceptable.
	 */
	mutex_exit(&ilbs->ilbs_g_lock);
	if (rule->ir_ksp != NULL) {
		/* stackid must be initialized if ir_ksp != NULL */
		kstat_delete_netstack(rule->ir_ksp, stackid);
	}
	kmem_free(rule, sizeof (ilb_rule_t));
	return (ret);
}
750 
/*
 * The final part in deleting a rule.  Either called directly or by the
 * taskq dispatched.
 *
 * Shuts down the rule's algorithm, then for every server of the rule
 * destroys its NAT source state, waits for the extra references to the
 * server to go away and frees it together with its kstat.  Finally the
 * rule's kstat and the rule itself are freed, so tmp_rule must not be
 * used after this routine returns.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
	netstackid_t stackid;
	ilb_server_t *server;

	/* The stack ID was stored in the global kstat when it was created. */
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

	/*
	 * Let the algorithm know that the rule is going away.  The
	 * algorithm fini routine will free all its resources with this
	 * rule.
	 */
	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

	while ((server = tmp_rule->ir_servers) != NULL) {
		mutex_enter(&server->iser_lock);
		ilb_destroy_nat_src(&server->iser_nat_src);
		if (tmp_rule->ir_conn_drain_timeout != 0) {
			/*
			 * The garbage collection thread checks this value
			 * without grabbing a lock.  So we need to use
			 * atomic_swap_64() to make sure that the value seen
			 * by gc thread is intact.
			 */
			(void) atomic_swap_64(
			    (uint64_t *)&server->iser_die_time,
			    ddi_get_lbolt64() +
			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
		}
		/* Wait until only one reference to the server is left. */
		while (server->iser_refcnt > 1)
			cv_wait(&server->iser_cv, &server->iser_lock);
		tmp_rule->ir_servers = server->iser_next;
		kstat_delete_netstack(server->iser_ksp, stackid);
		/*
		 * NOTE(review): iser_lock is still held, and neither it nor
		 * iser_cv is destroyed, when the server is freed here --
		 * confirm that this is intended.
		 */
		kmem_free(server, sizeof (ilb_server_t));
	}

	ASSERT(tmp_rule->ir_ksp != NULL);
	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

	kmem_free(tmp_rule, sizeof (ilb_rule_t));
}
797 
798 /* The routine executed by the delayed rule taskq. */
799 static void
800 ilb_rule_del_tq(void *arg)
801 {
802 	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
803 	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
804 
805 	mutex_enter(&rule->ir_lock);
806 	while (rule->ir_refcnt > 1)
807 		cv_wait(&rule->ir_cv, &rule->ir_lock);
808 	ilb_rule_del_common(ilbs, rule);
809 	kmem_free(arg, sizeof (ilb_rule_tq_t));
810 }
811 
812 /* Routine to delete a rule. */
813 int
814 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
815 {
816 	ilb_rule_t *tmp_rule;
817 	ilb_rule_tq_t *arg;
818 	int err;
819 
820 	mutex_enter(&ilbs->ilbs_g_lock);
821 	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
822 	    &err)) == NULL) {
823 		mutex_exit(&ilbs->ilbs_g_lock);
824 		return (err);
825 	}
826 
827 	/*
828 	 * First remove the rule from the hash array and the global list so
829 	 * that no one can find this rule any more.
830 	 */
831 	ilb_rule_hash_del(tmp_rule);
832 	ilb_rule_g_del(ilbs, tmp_rule);
833 	mutex_exit(&ilbs->ilbs_g_lock);
834 	ILB_RULE_REFRELE(tmp_rule);
835 
836 	/*
837 	 * Now no one can find this rule, we can remove it once all
838 	 * references to it are dropped and all references to the list
839 	 * of servers are dropped.  So dispatch a task to finish the deletion.
840 	 * We do this instead of letting the last one referencing the
841 	 * rule do it.  The reason is that the last one may be the
842 	 * interrupt thread.  We want to minimize the work it needs to
843 	 * do.  Rule deletion is not a critical task so it can be delayed.
844 	 */
845 	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
846 	arg->ilbs = ilbs;
847 	arg->rule = tmp_rule;
848 	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
849 	    TQ_SLEEP);
850 
851 	return (0);
852 }
853 
854 /*
855  * Given an IP address, check to see if there is a rule using this
856  * as the VIP.  It can be used to check if we need to drop a fragment.
857  */
858 boolean_t
859 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
860 {
861 	int i;
862 	ilb_rule_t *rule;
863 	boolean_t ret = B_FALSE;
864 
865 	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
866 	    ilbs->ilbs_rule_hash_size);
867 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
868 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
869 	    rule = rule->ir_hash_next) {
870 		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
871 			mutex_enter(&rule->ir_lock);
872 			if (rule->ir_flags & ILB_RULE_BUSY) {
873 				mutex_exit(&rule->ir_lock);
874 				break;
875 			}
876 			if (ret_rule != NULL) {
877 				rule->ir_refcnt++;
878 				mutex_exit(&rule->ir_lock);
879 				*ret_rule = rule;
880 			} else {
881 				mutex_exit(&rule->ir_lock);
882 			}
883 			ret = B_TRUE;
884 			break;
885 		}
886 	}
887 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
888 	return (ret);
889 }
890 
891 boolean_t
892 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
893 {
894 	int i;
895 	ilb_rule_t *rule;
896 	boolean_t ret = B_FALSE;
897 
898 	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
899 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
900 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
901 	    rule = rule->ir_hash_next) {
902 		if (rule->ir_target_v6.s6_addr32[3] == addr) {
903 			mutex_enter(&rule->ir_lock);
904 			if (rule->ir_flags & ILB_RULE_BUSY) {
905 				mutex_exit(&rule->ir_lock);
906 				break;
907 			}
908 			if (ret_rule != NULL) {
909 				rule->ir_refcnt++;
910 				mutex_exit(&rule->ir_lock);
911 				*ret_rule = rule;
912 			} else {
913 				mutex_exit(&rule->ir_lock);
914 			}
915 			ret = B_TRUE;
916 			break;
917 		}
918 	}
919 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
920 	return (ret);
921 }
922 
923 static ilb_rule_t *
924 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
925     int *err)
926 {
927 	ilb_rule_t *tmp_rule;
928 
929 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
930 
931 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
932 	    tmp_rule = tmp_rule->ir_next) {
933 		if (tmp_rule->ir_zoneid != zoneid)
934 			continue;
935 		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
936 			mutex_enter(&tmp_rule->ir_lock);
937 			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
938 				mutex_exit(&tmp_rule->ir_lock);
939 				*err = EINPROGRESS;
940 				return (NULL);
941 			}
942 			tmp_rule->ir_refcnt++;
943 			mutex_exit(&tmp_rule->ir_lock);
944 			*err = 0;
945 			return (tmp_rule);
946 		}
947 	}
948 	*err = ENOENT;
949 	return (NULL);
950 }
951 
952 /* To find a rule with a given name and zone in the global rule list. */
953 ilb_rule_t *
954 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
955     int *err)
956 {
957 	ilb_rule_t *tmp_rule;
958 
959 	mutex_enter(&ilbs->ilbs_g_lock);
960 	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
961 	mutex_exit(&ilbs->ilbs_g_lock);
962 	return (tmp_rule);
963 }
964 
965 /* Try to match the given packet info and zone ID with a rule. */
966 static boolean_t
967 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
968     int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
969 {
970 	ilb_rule_t *tmp_rule;
971 
972 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
973 
974 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
975 	    tmp_rule = tmp_rule->ir_next) {
976 		if (tmp_rule->ir_zoneid != zoneid)
977 			continue;
978 
979 		/*
980 		 * We don't allow the same name in different rules even if all
981 		 * the other rule components are different.
982 		 */
983 		if (strcasecmp(tmp_rule->ir_name, name) == 0)
984 			return (B_TRUE);
985 
986 		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
987 			continue;
988 
989 		/*
990 		 * ir_min_port and ir_max_port are the same if ir_port_range
991 		 * is false.  In this case, if the ir_min|max_port (same) is
992 		 * outside of the given port range, it is OK.  In other cases,
993 		 * check if min and max port are outside a rule's range.
994 		 */
995 		if (tmp_rule->ir_max_port < min_port ||
996 		    tmp_rule->ir_min_port > max_port) {
997 			continue;
998 		}
999 
1000 		/*
1001 		 * If l3 is IPv4, the addr passed in is assumed to be
1002 		 * mapped address.
1003 		 */
1004 		if (V6_OR_V4_INADDR_ANY(*addr) ||
1005 		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
1006 		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
1007 			return (B_TRUE);
1008 		}
1009 	}
1010 	return (B_FALSE);
1011 }
1012 
1013 int
1014 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
1015     const char *rule_name, ilb_rule_t *in_rule)
1016 {
1017 	ilb_rule_t *rule;
1018 	int err;
1019 
1020 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1021 	    (in_rule != NULL && rule_name == NULL));
1022 	if ((rule = in_rule) == NULL) {
1023 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1024 		    &err)) == NULL) {
1025 			return (err);
1026 		}
1027 	}
1028 	mutex_enter(&rule->ir_lock);
1029 	rule->ir_flags |= ILB_RULE_ENABLED;
1030 	mutex_exit(&rule->ir_lock);
1031 
1032 	/* Only refrele if the rule is passed in. */
1033 	if (in_rule == NULL)
1034 		ILB_RULE_REFRELE(rule);
1035 	return (0);
1036 }
1037 
1038 int
1039 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
1040     const char *rule_name, ilb_rule_t *in_rule)
1041 {
1042 	ilb_rule_t *rule;
1043 	int err;
1044 
1045 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1046 	    (in_rule != NULL && rule_name == NULL));
1047 	if ((rule = in_rule) == NULL) {
1048 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1049 		    &err)) == NULL) {
1050 			return (err);
1051 		}
1052 	}
1053 	mutex_enter(&rule->ir_lock);
1054 	rule->ir_flags &= ~ILB_RULE_ENABLED;
1055 	mutex_exit(&rule->ir_lock);
1056 
1057 	/* Only refrele if the rule is passed in. */
1058 	if (in_rule == NULL)
1059 		ILB_RULE_REFRELE(rule);
1060 	return (0);
1061 }
1062 
1063 /*
1064  * XXX We should probably have a walker function to walk all rules.  For
1065  * now, just add a simple loop for enable/disable/del.
1066  */
1067 void
1068 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1069 {
1070 	ilb_rule_t *rule;
1071 
1072 	mutex_enter(&ilbs->ilbs_g_lock);
1073 	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
1074 		if (rule->ir_zoneid != zoneid)
1075 			continue;
1076 		/*
1077 		 * No need to hold the rule as we are holding the global
1078 		 * lock so it won't go away.  Ignore the return value here
1079 		 * as the rule is provided so the call cannot fail.
1080 		 */
1081 		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
1082 	}
1083 	mutex_exit(&ilbs->ilbs_g_lock);
1084 }
1085 
1086 void
1087 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1088 {
1089 	ilb_rule_t *rule;
1090 
1091 	mutex_enter(&ilbs->ilbs_g_lock);
1092 	for (rule = ilbs->ilbs_rule_head; rule != NULL;
1093 	    rule = rule->ir_next) {
1094 		if (rule->ir_zoneid != zoneid)
1095 			continue;
1096 		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
1097 	}
1098 	mutex_exit(&ilbs->ilbs_g_lock);
1099 }
1100 
1101 void
1102 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1103 {
1104 	ilb_rule_t *rule;
1105 	ilb_rule_tq_t *arg;
1106 
1107 	mutex_enter(&ilbs->ilbs_g_lock);
1108 	while ((rule = ilbs->ilbs_rule_head) != NULL) {
1109 		if (rule->ir_zoneid != zoneid)
1110 			continue;
1111 		ilb_rule_hash_del(rule);
1112 		ilb_rule_g_del(ilbs, rule);
1113 		mutex_exit(&ilbs->ilbs_g_lock);
1114 
1115 		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
1116 		arg->ilbs = ilbs;
1117 		arg->rule = rule;
1118 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
1119 		    arg, TQ_SLEEP);
1120 
1121 		mutex_enter(&ilbs->ilbs_g_lock);
1122 	}
1123 	mutex_exit(&ilbs->ilbs_g_lock);
1124 }
1125 
1126 /*
1127  * This is just an optimization, so don't grab the global lock.  The
1128  * worst case is that we missed a couple packets.
1129  */
1130 boolean_t
1131 ilb_has_rules(ilb_stack_t *ilbs)
1132 {
1133 	return (ilbs->ilbs_rule_head != NULL);
1134 }
1135 
1136 
1137 static int
1138 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1139     ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
1140 {
1141 	ilb_server_t *tmp_server;
1142 	int ret;
1143 
1144 	ASSERT((rule == NULL && rule_name != NULL) ||
1145 	    (rule != NULL && rule_name == NULL));
1146 
1147 	if (rule == NULL) {
1148 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1149 		    &ret)) == NULL) {
1150 			return (ret);
1151 		}
1152 	}
1153 
1154 	/* Once we get a hold on the rule, no server can be added/deleted. */
1155 	for (tmp_server = rule->ir_servers; tmp_server != NULL;
1156 	    tmp_server = tmp_server->iser_next) {
1157 		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
1158 			break;
1159 	}
1160 	if (tmp_server == NULL) {
1161 		ret = ENOENT;
1162 		goto done;
1163 	}
1164 
1165 	if (enable) {
1166 		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
1167 		    rule->ir_alg->ilb_alg_data);
1168 		if (ret == 0) {
1169 			tmp_server->iser_enabled = B_TRUE;
1170 			tmp_server->iser_die_time = 0;
1171 		}
1172 	} else {
1173 		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
1174 		    rule->ir_alg->ilb_alg_data);
1175 		if (ret == 0) {
1176 			tmp_server->iser_enabled = B_FALSE;
1177 			if (rule->ir_conn_drain_timeout != 0) {
1178 				(void) atomic_swap_64(
1179 				    (uint64_t *)&tmp_server->iser_die_time,
1180 				    ddi_get_lbolt64() + SEC_TO_TICK(
1181 				    rule->ir_conn_drain_timeout));
1182 			}
1183 		}
1184 	}
1185 
1186 done:
1187 	if (rule_name != NULL)
1188 		ILB_RULE_REFRELE(rule);
1189 	return (ret);
1190 }
1191 int
1192 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1193     ilb_rule_t *rule, in6_addr_t *addr)
1194 {
1195 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
1196 }
1197 
1198 int
1199 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1200     ilb_rule_t *rule, in6_addr_t *addr)
1201 {
1202 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
1203 }
1204 
1205 /*
1206  * Add a back end server to a rule.  If the address is IPv4, it is assumed
1207  * to be passed in as a mapped address.
1208  */
1209 int
1210 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
1211 {
1212 	ilb_server_t	*server;
1213 	netstackid_t	stackid;
1214 	int		ret = 0;
1215 	in_port_t	min_port, max_port;
1216 	in_port_t	range;
1217 
1218 	/* Port is passed in network byte order. */
1219 	min_port = ntohs(info->min_port);
1220 	max_port = ntohs(info->max_port);
1221 	if (min_port > max_port)
1222 		return (EINVAL);
1223 
1224 	/* min_port == 0 means "all ports". Make it so */
1225 	if (min_port == 0) {
1226 		min_port = 1;
1227 		max_port = 65535;
1228 	}
1229 	range = max_port - min_port;
1230 
1231 	mutex_enter(&rule->ir_lock);
1232 	/* If someone is already doing server add/del, sleeps and wait. */
1233 	while (rule->ir_flags & ILB_RULE_BUSY) {
1234 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1235 			mutex_exit(&rule->ir_lock);
1236 			return (EINTR);
1237 		}
1238 	}
1239 
1240 	/*
1241 	 * Set the rule to be busy to make sure that no new packet can
1242 	 * use this rule.
1243 	 */
1244 	rule->ir_flags |= ILB_RULE_BUSY;
1245 
1246 	/* Now wait for all other guys to finish their work. */
1247 	while (rule->ir_refcnt > 2) {
1248 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1249 			mutex_exit(&rule->ir_lock);
1250 			ret = EINTR;
1251 			goto end;
1252 		}
1253 	}
1254 	mutex_exit(&rule->ir_lock);
1255 
1256 	/* Sanity checks... */
1257 	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1258 	    rule->ir_ipver != IPPROTO_IP) ||
1259 	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1260 	    rule->ir_ipver != IPPROTO_IPV6)) {
1261 		ret = EINVAL;
1262 		goto end;
1263 	}
1264 
1265 	/*
1266 	 * Check for valid port range.
1267 	 *
1268 	 * For DSR, there can be no port shifting.  Hence the server
1269 	 * specification must be the same as the rule's.
1270 	 *
1271 	 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
1272 	 * it must be equal to the same value as the rule port range.
1273 	 *
1274 	 */
1275 	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1276 		if (rule->ir_max_port != max_port ||
1277 		    rule->ir_min_port != min_port) {
1278 			ret = EINVAL;
1279 			goto end;
1280 		}
1281 	} else {
1282 		if ((range != rule->ir_max_port - rule->ir_min_port) &&
1283 		    range != 0) {
1284 			ret = EINVAL;
1285 			goto end;
1286 		}
1287 	}
1288 
1289 	/* Check for duplicate. */
1290 	for (server = rule->ir_servers; server != NULL;
1291 	    server = server->iser_next) {
1292 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1293 		    strcasecmp(server->iser_name, info->name) == 0) {
1294 			break;
1295 		}
1296 	}
1297 	if (server != NULL) {
1298 		ret = EEXIST;
1299 		goto end;
1300 	}
1301 
1302 	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1303 		ret = ENOMEM;
1304 		goto end;
1305 	}
1306 
1307 	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1308 	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1309 	    sizeof (server->iser_ip_addr));
1310 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1311 	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1312 	if (server->iser_ksp == NULL) {
1313 		kmem_free(server, sizeof (ilb_server_t));
1314 		ret = EINVAL;
1315 		goto end;
1316 	}
1317 
1318 	server->iser_stackid = stackid;
1319 	server->iser_addr_v6 = info->addr;
1320 	server->iser_min_port = min_port;
1321 	server->iser_max_port = max_port;
1322 	if (min_port != max_port)
1323 		server->iser_port_range = B_TRUE;
1324 	else
1325 		server->iser_port_range = B_FALSE;
1326 
1327 	/*
1328 	 * If the rule uses NAT, find/create the NAT source entry to use
1329 	 * for this server.
1330 	 */
1331 	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1332 		in_port_t port;
1333 
1334 		/*
1335 		 * If the server uses a port range, our port allocation
1336 		 * scheme needs to treat it as a wildcard.  Refer to the
1337 		 * comments in ilb_nat.c about the scheme.
1338 		 */
1339 		if (server->iser_port_range)
1340 			port = 0;
1341 		else
1342 			port = server->iser_min_port;
1343 
1344 		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1345 		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1346 		    num_nat_src_v6(&rule->ir_nat_src_start,
1347 		    &rule->ir_nat_src_end))) != 0) {
1348 			kstat_delete_netstack(server->iser_ksp, stackid);
1349 			kmem_free(server, sizeof (ilb_server_t));
1350 			goto end;
1351 		}
1352 	}
1353 
1354 	/*
1355 	 * The iser_lock is only used to protect iser_refcnt.  All the other
1356 	 * fields in ilb_server_t should not change, except for iser_enabled.
1357 	 * The worst thing that can happen if iser_enabled is messed up is
1358 	 * that one or two packets may not be load balanced to a server
1359 	 * correctly.
1360 	 */
1361 	server->iser_refcnt = 1;
1362 	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1363 	    B_FALSE;
1364 	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1365 	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1366 
1367 	/* Let the load balancing algorithm know about the addition. */
1368 	ASSERT(rule->ir_alg != NULL);
1369 	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1370 	    rule->ir_alg->ilb_alg_data)) != 0) {
1371 		kstat_delete_netstack(server->iser_ksp, stackid);
1372 		kmem_free(server, sizeof (ilb_server_t));
1373 		goto end;
1374 	}
1375 
1376 	/*
1377 	 * No need to hold ir_lock since no other thread should manipulate
1378 	 * the following fields until ILB_RULE_BUSY is cleared.
1379 	 */
1380 	if (rule->ir_servers == NULL) {
1381 		server->iser_next = NULL;
1382 	} else {
1383 		server->iser_next = rule->ir_servers;
1384 	}
1385 	rule->ir_servers = server;
1386 	ILB_R_KSTAT(rule, num_servers);
1387 
1388 end:
1389 	mutex_enter(&rule->ir_lock);
1390 	rule->ir_flags &= ~ILB_RULE_BUSY;
1391 	cv_signal(&rule->ir_cv);
1392 	mutex_exit(&rule->ir_lock);
1393 	return (ret);
1394 }
1395 
1396 /* The routine executed by the delayed rule processing taskq. */
1397 static void
1398 ilb_server_del_tq(void *arg)
1399 {
1400 	ilb_server_t *server = (ilb_server_t *)arg;
1401 
1402 	mutex_enter(&server->iser_lock);
1403 	while (server->iser_refcnt > 1)
1404 		cv_wait(&server->iser_cv, &server->iser_lock);
1405 	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1406 	kmem_free(server, sizeof (ilb_server_t));
1407 }
1408 
1409 /*
1410  * Delete a back end server from a rule.  If the address is IPv4, it is assumed
1411  * to be passed in as a mapped address.
1412  */
1413 int
1414 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1415     ilb_rule_t *rule, in6_addr_t *addr)
1416 {
1417 	ilb_server_t	*server;
1418 	ilb_server_t	*prev_server;
1419 	int		ret = 0;
1420 
1421 	ASSERT((rule == NULL && rule_name != NULL) ||
1422 	    (rule != NULL && rule_name == NULL));
1423 	if (rule == NULL) {
1424 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1425 		    &ret)) == NULL) {
1426 			return (ret);
1427 		}
1428 	}
1429 
1430 	mutex_enter(&rule->ir_lock);
1431 	/* If someone is already doing server add/del, sleeps and wait. */
1432 	while (rule->ir_flags & ILB_RULE_BUSY) {
1433 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1434 			if (rule_name != NULL) {
1435 				if (--rule->ir_refcnt <= 2)
1436 					cv_signal(&rule->ir_cv);
1437 			}
1438 			mutex_exit(&rule->ir_lock);
1439 			return (EINTR);
1440 		}
1441 	}
1442 	/*
1443 	 * Set the rule to be busy to make sure that no new packet can
1444 	 * use this rule.
1445 	 */
1446 	rule->ir_flags |= ILB_RULE_BUSY;
1447 
1448 	/* Now wait for all other guys to finish their work. */
1449 	while (rule->ir_refcnt > 2) {
1450 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1451 			mutex_exit(&rule->ir_lock);
1452 			ret = EINTR;
1453 			goto end;
1454 		}
1455 	}
1456 	mutex_exit(&rule->ir_lock);
1457 
1458 	prev_server = NULL;
1459 	for (server = rule->ir_servers; server != NULL;
1460 	    prev_server = server, server = server->iser_next) {
1461 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1462 			break;
1463 	}
1464 	if (server == NULL) {
1465 		ret = ENOENT;
1466 		goto end;
1467 	}
1468 
1469 	/*
1470 	 * Let the load balancing algorithm know about the removal.
1471 	 * The algorithm may disallow the removal...
1472 	 */
1473 	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1474 	    rule->ir_alg->ilb_alg_data)) != 0) {
1475 		goto end;
1476 	}
1477 
1478 	if (prev_server == NULL)
1479 		rule->ir_servers = server->iser_next;
1480 	else
1481 		prev_server->iser_next = server->iser_next;
1482 
1483 	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1484 
1485 	/*
1486 	 * Mark the server as disabled so that if there is any sticky cache
1487 	 * using this server around, it won't be used.
1488 	 */
1489 	server->iser_enabled = B_FALSE;
1490 
1491 	mutex_enter(&server->iser_lock);
1492 
1493 	/*
1494 	 * De-allocate the NAT source array.  The indiviual ilb_nat_src_entry_t
1495 	 * may not go away if there is still a conn using it.  The NAT source
1496 	 * timer will do the garbage collection.
1497 	 */
1498 	ilb_destroy_nat_src(&server->iser_nat_src);
1499 
1500 	/* If there is a hard limit on when a server should die, set it. */
1501 	if (rule->ir_conn_drain_timeout != 0) {
1502 		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1503 		    ddi_get_lbolt64() +
1504 		    SEC_TO_TICK(rule->ir_conn_drain_timeout));
1505 	}
1506 
1507 	if (server->iser_refcnt > 1) {
1508 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1509 		    server, TQ_SLEEP);
1510 		mutex_exit(&server->iser_lock);
1511 	} else {
1512 		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1513 		kmem_free(server, sizeof (ilb_server_t));
1514 	}
1515 
1516 end:
1517 	mutex_enter(&rule->ir_lock);
1518 	rule->ir_flags &= ~ILB_RULE_BUSY;
1519 	if (rule_name != NULL)
1520 		rule->ir_refcnt--;
1521 	cv_signal(&rule->ir_cv);
1522 	mutex_exit(&rule->ir_lock);
1523 	return (ret);
1524 }
1525 
1526 /*
1527  * First check if the destination of the ICMP message matches a VIP of
1528  * a rule.  If it does not, just return ILB_PASSED.
1529  *
1530  * If the destination matches a VIP:
1531  *
1532  * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1533  * server.
1534  *
1535  * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
1536  * and see which back end server we should send this message to.  And we
1537  * need to do NAT on both the payload message and the outside IP packet.
1538  *
1539  * For other ICMP messages, drop them.
1540  */
1541 /* ARGSUSED */
1542 static int
1543 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1544     icmph_t *icmph, ipaddr_t *lb_dst)
1545 {
1546 	ipaddr_t vip;
1547 	ilb_rule_t *rule;
1548 	in6_addr_t addr6;
1549 
1550 	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1551 		return (ILB_PASSED);
1552 
1553 
1554 	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1555 		ILB_R_KSTAT(rule, icmp_dropped);
1556 		ILB_RULE_REFRELE(rule);
1557 		return (ILB_DROPPED);
1558 	}
1559 
1560 	switch (icmph->icmph_type) {
1561 	case ICMP_ECHO_REQUEST:
1562 		ILB_R_KSTAT(rule, icmp_echo_processed);
1563 		ILB_RULE_REFRELE(rule);
1564 
1565 		icmph->icmph_type = ICMP_ECHO_REPLY;
1566 		icmph->icmph_checksum = 0;
1567 		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1568 		ipha->ipha_ttl =
1569 		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1570 		*lb_dst = ipha->ipha_src;
1571 		vip = ipha->ipha_dst;
1572 		ipha->ipha_dst = ipha->ipha_src;
1573 		ipha->ipha_src = vip;
1574 		return (ILB_BALANCED);
1575 	case ICMP_DEST_UNREACHABLE: {
1576 		int ret;
1577 
1578 		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1579 			ILB_R_KSTAT(rule, icmp_dropped);
1580 			ILB_RULE_REFRELE(rule);
1581 			return (ILB_DROPPED);
1582 		}
1583 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1584 		    &addr6)) {
1585 			ILB_R_KSTAT(rule, icmp_2big_processed);
1586 			ret = ILB_BALANCED;
1587 		} else {
1588 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1589 			ret = ILB_DROPPED;
1590 		}
1591 		ILB_RULE_REFRELE(rule);
1592 		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1593 		return (ret);
1594 	}
1595 	default:
1596 		ILB_R_KSTAT(rule, icmp_dropped);
1597 		ILB_RULE_REFRELE(rule);
1598 		return (ILB_DROPPED);
1599 	}
1600 }
1601 
1602 /* ARGSUSED */
1603 static int
1604 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1605     icmp6_t *icmp6, in6_addr_t *lb_dst)
1606 {
1607 	ilb_rule_t *rule;
1608 
1609 	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1610 		return (ILB_PASSED);
1611 
1612 	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1613 		ILB_R_KSTAT(rule, icmp_dropped);
1614 		ILB_RULE_REFRELE(rule);
1615 		return (ILB_DROPPED);
1616 	}
1617 
1618 	switch (icmp6->icmp6_type) {
1619 	case ICMP6_ECHO_REQUEST: {
1620 		int hdr_len;
1621 
1622 		ILB_R_KSTAT(rule, icmp_echo_processed);
1623 		ILB_RULE_REFRELE(rule);
1624 
1625 		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1626 		icmp6->icmp6_cksum = ip6h->ip6_plen;
1627 		hdr_len = (char *)icmp6 - (char *)ip6h;
1628 		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1629 		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1630 		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1631 		ip6h->ip6_hops =
1632 		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1633 		*lb_dst = ip6h->ip6_src;
1634 		ip6h->ip6_src = ip6h->ip6_dst;
1635 		ip6h->ip6_dst = *lb_dst;
1636 		return (ILB_BALANCED);
1637 	}
1638 	case ICMP6_PACKET_TOO_BIG: {
1639 		int ret;
1640 
1641 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1642 		    lb_dst)) {
1643 			ILB_R_KSTAT(rule, icmp_2big_processed);
1644 			ret = ILB_BALANCED;
1645 		} else {
1646 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1647 			ret = ILB_DROPPED;
1648 		}
1649 		ILB_RULE_REFRELE(rule);
1650 		return (ret);
1651 	}
1652 	default:
1653 		ILB_R_KSTAT(rule, icmp_dropped);
1654 		ILB_RULE_REFRELE(rule);
1655 		return (ILB_DROPPED);
1656 	}
1657 }
1658 
1659 /*
1660  * Common routine to check an incoming packet and decide what to do with it.
1661  * called by ilb_check_v4|v6().
1662  */
1663 static int
1664 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
1665     in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
1666     in6_addr_t *lb_dst)
1667 {
1668 	in_port_t		sport, dport;
1669 	tcpha_t			*tcph;
1670 	udpha_t			*udph;
1671 	ilb_rule_t		*rule;
1672 	ilb_server_t		*server;
1673 	boolean_t		balanced;
1674 	struct ilb_sticky_s	*s = NULL;
1675 	int			ret;
1676 	uint32_t		ip_sum, tp_sum;
1677 	ilb_nat_info_t		info;
1678 	uint16_t		nat_src_idx;
1679 	boolean_t		busy;
1680 
1681 	/*
1682 	 * We don't really need to switch here since both protocols's
1683 	 * ports are at the same offset.  Just prepare for future protocol
1684 	 * specific processing.
1685 	 */
1686 	switch (l4) {
1687 	case IPPROTO_TCP:
1688 		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1689 			return (ILB_DROPPED);
1690 		tcph = (tcpha_t *)tph;
1691 		sport = tcph->tha_lport;
1692 		dport = tcph->tha_fport;
1693 		break;
1694 	case IPPROTO_UDP:
1695 		if (tph + sizeof (udpha_t) > mp->b_wptr)
1696 			return (ILB_DROPPED);
1697 		udph = (udpha_t *)tph;
1698 		sport = udph->uha_src_port;
1699 		dport = udph->uha_dst_port;
1700 		break;
1701 	default:
1702 		return (ILB_PASSED);
1703 	}
1704 
1705 	/* Fast path, there is an existing conn. */
1706 	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1707 	    pkt_len, lb_dst)) {
1708 		return (ILB_BALANCED);
1709 	}
1710 
1711 	/*
1712 	 * If there is no existing connection for the incoming packet, check
1713 	 * to see if the packet matches a rule.  If not, just let IP decide
1714 	 * what to do with it.
1715 	 *
1716 	 * Note: a reply from back end server should not match a rule.  A
1717 	 * reply should match one existing conn.
1718 	 */
1719 	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1720 	    pkt_len, &busy);
1721 	if (rule == NULL) {
1722 		/* If the rule is busy, just drop the packet. */
1723 		if (busy)
1724 			return (ILB_DROPPED);
1725 		else
1726 			return (ILB_PASSED);
1727 	}
1728 
1729 	/*
1730 	 * The packet matches a rule, use the rule load balance algorithm
1731 	 * to find a server.
1732 	 */
1733 	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1734 	    rule->ir_alg->ilb_alg_data, &server);
1735 	/*
1736 	 * This can only happen if there is no server in a rule or all
1737 	 * the servers are currently disabled.
1738 	 */
1739 	if (!balanced)
1740 		goto no_server;
1741 
1742 	/*
1743 	 * If the rule is sticky enabled, we need to check the sticky table.
1744 	 * If there is a sticky entry for the client, use the previous server
1745 	 * instead of the one found above (note that both can be the same).
1746 	 * If there is no entry for that client, add an entry to the sticky
1747 	 * table.  Both the find and add are done in ilb_sticky_find_add()
1748 	 * to avoid checking for duplicate when adding an entry.
1749 	 */
1750 	if (rule->ir_flags & ILB_RULE_STICKY) {
1751 		in6_addr_t addr;
1752 
1753 		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1754 		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1755 		    &s, &nat_src_idx)) == NULL) {
1756 			ILB_R_KSTAT(rule, nomem_pkt_dropped);
1757 			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1758 			goto no_server;
1759 		}
1760 	}
1761 
1762 	/*
1763 	 * We are holding a reference on the rule, so the server
1764 	 * cannot go away.
1765 	 */
1766 	*lb_dst = server->iser_addr_v6;
1767 	ILB_S_KSTAT(server, pkt_processed);
1768 	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1769 
1770 	switch (rule->ir_topo) {
1771 	case ILB_TOPO_IMPL_NAT: {
1772 		ilb_nat_src_entry_t	*src_ent;
1773 		uint16_t		*src_idx;
1774 
1775 		/*
1776 		 * We create a cache even if it is not a SYN segment.
1777 		 * The server should return a RST.  When we see the
1778 		 * RST, we will destroy this cache.  But by having
1779 		 * a cache, we know how to NAT the returned RST.
1780 		 */
1781 		info.vip = *dst;
1782 		info.dport = dport;
1783 		info.src = *src;
1784 		info.sport = sport;
1785 
1786 		/* If stickiness is enabled, use the same source address */
1787 		if (s != NULL)
1788 			src_idx = &nat_src_idx;
1789 		else
1790 			src_idx = NULL;
1791 
1792 		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
1793 		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
1794 			if (s != NULL)
1795 				ilb_sticky_refrele(s);
1796 			ILB_R_KSTAT(rule, pkt_dropped);
1797 			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1798 			ILB_R_KSTAT(rule, noport_pkt_dropped);
1799 			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
1800 			ret = ILB_DROPPED;
1801 			break;
1802 		}
1803 		info.src_ent = src_ent;
1804 		info.nat_dst = server->iser_addr_v6;
1805 		if (rule->ir_port_range && server->iser_port_range) {
1806 			info.nat_dport = htons(ntohs(dport) -
1807 			    rule->ir_min_port + server->iser_min_port);
1808 		} else {
1809 			info.nat_dport = htons(server->iser_min_port);
1810 		}
1811 
1812 		/*
1813 		 * If ilb_conn_add() fails, it will release the reference on
1814 		 * sticky info and de-allocate the NAT source port allocated
1815 		 * above.
1816 		 */
1817 		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1818 		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
1819 			ILB_R_KSTAT(rule, pkt_dropped);
1820 			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1821 			ILB_R_KSTAT(rule, nomem_pkt_dropped);
1822 			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1823 			ret = ILB_DROPPED;
1824 			break;
1825 		}
1826 		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1827 		ret = ILB_BALANCED;
1828 		break;
1829 	}
1830 	case ILB_TOPO_IMPL_HALF_NAT:
1831 		info.vip = *dst;
1832 		info.nat_dst = server->iser_addr_v6;
1833 		info.dport = dport;
1834 		if (rule->ir_port_range && server->iser_port_range) {
1835 			info.nat_dport = htons(ntohs(dport) -
1836 			    rule->ir_min_port + server->iser_min_port);
1837 		} else {
1838 			info.nat_dport = htons(server->iser_min_port);
1839 		}
1840 
1841 		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1842 		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
1843 			ILB_R_KSTAT(rule, pkt_dropped);
1844 			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1845 			ILB_R_KSTAT(rule, nomem_pkt_dropped);
1846 			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1847 			ret = ILB_DROPPED;
1848 			break;
1849 		}
1850 		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1851 
1852 		ret = ILB_BALANCED;
1853 		break;
1854 	case ILB_TOPO_IMPL_DSR:
1855 		/*
1856 		 * By decrementing the sticky refcnt, the period of
1857 		 * stickiness (life time of ilb_sticky_t) will be
1858 		 * from now to (now + default expiry time).
1859 		 */
1860 		if (s != NULL)
1861 			ilb_sticky_refrele(s);
1862 		ret = ILB_BALANCED;
1863 		break;
1864 	default:
1865 		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
1866 		    (void *) rule);
1867 		break;
1868 	}
1869 	ILB_RULE_REFRELE(rule);
1870 	return (ret);
1871 
1872 no_server:
1873 	/* This can only happen if there is no server available. */
1874 	ILB_R_KSTAT(rule, pkt_dropped);
1875 	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1876 	ILB_RULE_REFRELE(rule);
1877 	return (ILB_DROPPED);
1878 }
1879 
1880 int
1881 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1882     uint8_t *tph, ipaddr_t *lb_dst)
1883 {
1884 	in6_addr_t v6_src, v6_dst, v6_lb_dst;
1885 	int ret;
1886 
1887 	ASSERT(DB_REF(mp) == 1);
1888 
1889 	if (l4 == IPPROTO_ICMP) {
1890 		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1891 		    lb_dst));
1892 	}
1893 
1894 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1895 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1896 	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1897 	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1898 	if (ret == ILB_BALANCED)
1899 		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1900 	return (ret);
1901 }
1902 
1903 int
1904 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1905     uint8_t *tph, in6_addr_t *lb_dst)
1906 {
1907 	uint32_t pkt_len;
1908 
1909 	ASSERT(DB_REF(mp) == 1);
1910 
1911 	if (l4 == IPPROTO_ICMPV6) {
1912 		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1913 		    lb_dst));
1914 	}
1915 
1916 	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1917 	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1918 	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1919 }
1920 
1921 void
1922 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1923 {
1924 	ilb_rule_t *tmp_rule;
1925 
1926 	mutex_enter(&ilbs->ilbs_g_lock);
1927 	*num_rules = 0;
1928 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1929 	    tmp_rule = tmp_rule->ir_next) {
1930 		if (tmp_rule->ir_zoneid == zoneid)
1931 			*num_rules += 1;
1932 	}
1933 	mutex_exit(&ilbs->ilbs_g_lock);
1934 }
1935 
1936 int
1937 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1938     uint32_t *num_servers)
1939 {
1940 	ilb_rule_t *rule;
1941 	int err;
1942 
1943 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1944 		return (err);
1945 	*num_servers = rule->ir_kstat.num_servers.value.ui64;
1946 	ILB_RULE_REFRELE(rule);
1947 	return (0);
1948 }
1949 
1950 int
1951 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1952     ilb_server_info_t *servers, uint32_t *num_servers)
1953 {
1954 	ilb_rule_t *rule;
1955 	ilb_server_t *server;
1956 	size_t cnt;
1957 	int err;
1958 
1959 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1960 		return (err);
1961 	for (server = rule->ir_servers, cnt = *num_servers;
1962 	    server != NULL && cnt > 0;
1963 	    server = server->iser_next, cnt--, servers++) {
1964 		(void) memcpy(servers->name, server->iser_name,
1965 		    ILB_SERVER_NAMESZ);
1966 		servers->addr = server->iser_addr_v6;
1967 		servers->min_port = htons(server->iser_min_port);
1968 		servers->max_port = htons(server->iser_max_port);
1969 		servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1970 		servers->err = 0;
1971 	}
1972 	ILB_RULE_REFRELE(rule);
1973 	*num_servers -= cnt;
1974 
1975 	return (0);
1976 }
1977 
1978 void
1979 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1980     char *buf)
1981 {
1982 	ilb_rule_t *tmp_rule;
1983 	int cnt;
1984 
1985 	if (*num_names == 0)
1986 		return;
1987 
1988 	mutex_enter(&ilbs->ilbs_g_lock);
1989 	for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1990 	    tmp_rule = tmp_rule->ir_next) {
1991 		if (tmp_rule->ir_zoneid != zoneid)
1992 			continue;
1993 
1994 		(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1995 		buf += ILB_RULE_NAMESZ;
1996 		if (++cnt == *num_names)
1997 			break;
1998 	}
1999 	mutex_exit(&ilbs->ilbs_g_lock);
2000 	*num_names = cnt;
2001 }
2002 
2003 int
2004 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2005 {
2006 	ilb_rule_t *rule;
2007 	int err;
2008 
2009 	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2010 		return (err);
2011 	}
2012 
2013 	/*
2014 	 * Except the enabled flags, none of the following will change
2015 	 * in the life time of a rule.  So we don't hold the mutex when
2016 	 * reading them.  The worst is to report a wrong enabled flags.
2017 	 */
2018 	cmd->ip_ver = rule->ir_ipver;
2019 	cmd->proto = rule->ir_proto;
2020 	cmd->min_port = htons(rule->ir_min_port);
2021 	cmd->max_port = htons(rule->ir_max_port);
2022 
2023 	cmd->vip = rule->ir_target_v6;
2024 	cmd->algo = rule->ir_alg_type;
2025 	cmd->topo = rule->ir_topo;
2026 
2027 	cmd->nat_src_start = rule->ir_nat_src_start;
2028 	cmd->nat_src_end = rule->ir_nat_src_end;
2029 
2030 	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2031 	cmd->nat_expiry = rule->ir_nat_expiry;
2032 	cmd->sticky_expiry = rule->ir_sticky_expiry;
2033 
2034 	cmd->flags = 0;
2035 	if (rule->ir_flags & ILB_RULE_ENABLED)
2036 		cmd->flags |= ILB_RULE_ENABLED;
2037 	if (rule->ir_flags & ILB_RULE_STICKY) {
2038 		cmd->flags |= ILB_RULE_STICKY;
2039 		cmd->sticky_mask = rule->ir_sticky_mask;
2040 	}
2041 
2042 	ILB_RULE_REFRELE(rule);
2043 	return (0);
2044 }
2045 
2046 static void *
2047 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2048 {
2049 	ilb_stack_t *ilbs;
2050 	char tq_name[TASKQ_NAMELEN];
2051 
2052 	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2053 	ilbs->ilbs_netstack = ns;
2054 
2055 	ilbs->ilbs_rule_head = NULL;
2056 	ilbs->ilbs_g_hash = NULL;
2057 	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2058 
2059 	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
2060 	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
2061 		kmem_free(ilbs, sizeof (ilb_stack_t));
2062 		return (NULL);
2063 	}
2064 
2065 	/*
2066 	 * ilbs_conn/sticky_hash related info is initialized in
2067 	 * ilb_conn/sticky_hash_init().
2068 	 */
2069 	ilbs->ilbs_conn_taskq = NULL;
2070 	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2071 	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2072 	ilbs->ilbs_c2s_conn_hash = NULL;
2073 	ilbs->ilbs_s2c_conn_hash = NULL;
2074 	ilbs->ilbs_conn_timer_list = NULL;
2075 
2076 	ilbs->ilbs_sticky_hash = NULL;
2077 	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2078 	ilbs->ilbs_sticky_timer_list = NULL;
2079 	ilbs->ilbs_sticky_taskq = NULL;
2080 
2081 	/* The allocation is done later when there is a rule using NAT mode. */
2082 	ilbs->ilbs_nat_src = NULL;
2083 	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2084 	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2085 	ilbs->ilbs_nat_src_tid = 0;
2086 
2087 	/* For listing the conn hash table */
2088 	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2089 	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2090 	ilbs->ilbs_conn_list_busy = B_FALSE;
2091 	ilbs->ilbs_conn_list_cur = 0;
2092 	ilbs->ilbs_conn_list_connp = NULL;
2093 
2094 	/* For listing the sticky hash table */
2095 	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2096 	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2097 	ilbs->ilbs_sticky_list_busy = B_FALSE;
2098 	ilbs->ilbs_sticky_list_cur = 0;
2099 	ilbs->ilbs_sticky_list_curp = NULL;
2100 
2101 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2102 	    (void *)ns);
2103 	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2104 	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2105 
2106 	return (ilbs);
2107 }
2108 
2109 /* ARGSUSED */
2110 static void
2111 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2112 {
2113 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2114 	ilb_rule_t *tmp_rule;
2115 
2116 	ilb_sticky_hash_fini(ilbs);
2117 	ilb_conn_hash_fini(ilbs);
2118 	mutex_enter(&ilbs->ilbs_g_lock);
2119 	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2120 		ilb_rule_hash_del(tmp_rule);
2121 		ilb_rule_g_del(ilbs, tmp_rule);
2122 		mutex_exit(&ilbs->ilbs_g_lock);
2123 		ilb_rule_del_common(ilbs, tmp_rule);
2124 		mutex_enter(&ilbs->ilbs_g_lock);
2125 	}
2126 	mutex_exit(&ilbs->ilbs_g_lock);
2127 	if (ilbs->ilbs_nat_src != NULL)
2128 		ilb_nat_src_fini(ilbs);
2129 }
2130 
2131 static void
2132 ilb_stack_fini(netstackid_t stackid, void * arg)
2133 {
2134 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2135 
2136 	ilb_rule_hash_fini(ilbs);
2137 	taskq_destroy(ilbs->ilbs_rule_taskq);
2138 	ilb_kstat_g_fini(stackid, ilbs);
2139 	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2140 	kmem_free(ilbs, sizeof (ilb_stack_t));
2141 }
2142 
2143 void
2144 ilb_ddi_g_init(void)
2145 {
2146 	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2147 	    ilb_stack_fini);
2148 }
2149 
2150 void
2151 ilb_ddi_g_destroy(void)
2152 {
2153 	netstack_unregister(NS_ILB);
2154 	ilb_conn_cache_fini();
2155 	ilb_sticky_cache_fini();
2156 }
2157