xref: /titanic_51/usr/src/uts/common/inet/ilb/ilb.c (revision 565679070e884800f5d041d42d226813c0bbf6d8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/kmem.h>
28 #include <sys/ksynch.h>
29 #include <sys/systm.h>
30 #include <sys/socket.h>
31 #include <sys/disp.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/strsun.h>
35 #include <sys/sdt.h>
36 #include <sys/atomic.h>
37 #include <netinet/in.h>
38 #include <inet/ip.h>
39 #include <inet/ip6.h>
40 #include <inet/tcp.h>
41 #include <inet/udp_impl.h>
42 #include <inet/kstatcom.h>
43 
44 #include <inet/ilb_ip.h>
45 #include "ilb_alg.h"
46 #include "ilb_nat.h"
47 #include "ilb_conn.h"
48 
/*
 * ILB kmem cache flag.
 * NOTE(review): presumably handed to kmem cache creation elsewhere in ILB;
 * no consumer is visible in this part of the file -- confirm.
 */
int ilb_kmem_flags = 0;

/*
 * The default size for the different hash tables.  Global for all stacks.
 * But each stack has its own table, just that their sizes are the same.
 *
 * The rule hash index is computed by masking with (size - 1), and
 * ilb_rule_hash_init() rounds the per-stack rule hash size up to a power
 * of 2 if it is not one already.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/*
 * Default NAT cache entry expiry time.  ilb_rule_add() copies the value
 * matching the rule's protocol into ir_nat_expiry when the request does
 * not supply its own nat_expiry.
 * NOTE(review): units are presumably seconds -- confirm against the
 * connection hash code.
 */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/*
 * Default sticky entry expiry time.  Used by ilb_rule_add() for
 * ir_sticky_expiry when the request's sticky_expiry is 0.
 */
static uint32_t ilb_sticky_expiry = 60;
71 
/*
 * Map an address to a rule hash bucket index.
 *
 * addr is assumed to be a uint8_t * to an ipaddr_t.  Callers in this file
 * pass either an IPv4 address or the last 32 bits of an in6_addr_t.  The
 * four bytes are combined using the multipliers 1, 31, 31^2 (961) and
 * 31^3 (29791), and the sum is masked with (hash_size - 1), so hash_size
 * has to be a power of 2.
 *
 * addr is evaluated four times; do not pass an expression with side
 * effects.
 */
#define	ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
	*(addr)) & ((hash_size) - 1))
76 
77 /*
78  * Note on ILB delayed processing
79  *
80  * To avoid in line removal on some of the data structures, such as rules,
81  * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
82  * There are three types of ILB taskq:
83  *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
85  * 2. conn hash handling: created at conn hash initialization time,
86  *                        ilb_conn_hash_init()
87  * 3. sticky hash handling: created at sticky hash initialization time,
88  *                          ilb_sticky_hash_init()
89  *
90  * The rule taskq is for processing rule and server removal.  When a user
91  * land rule/server removal request comes in, a taskq is dispatched after
92  * removing the rule/server from all related hashes.  This taskq will wait
93  * until all references to the rule/server are gone before removing it.
94  * So the user land thread requesting the removal does not need to wait
95  * for the removal completion.
96  *
97  * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
98  * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size timers
99  * and ilb_sticky_timer_size timers running for ilb_conn_hash and
100  * ilb_sticky_hash cleanup respectively.   Each timer is responsible for one
101  * portion (same size) of the hash table.  When a timer fires, it dispatches
102  * a conn hash taskq to clean up its portion of the table.  This avoids in
103  * line processing of the removal.
104  *
105  * There is another delayed processing, the clean up of NAT source address
106  * table.  We just use the timer to directly handle it instead of using
107  * a taskq.  The reason is that the table is small so it is OK to use the
108  * timer.
109  */
110 
/*
 * ILB rule taskq constants.
 * NOTE(review): presumably the number of threads given to the rule taskq
 * when it is created; the creation site is not in this part of the file.
 */
#define	ILB_RULE_TASKQ_NUM_THR	20

/*
 * Argument passed to ILB rule taskq routines.  Allocated by the
 * dispatcher (see ilb_rule_del()) and freed by the taskq routine
 * (see ilb_rule_del_tq()).
 */
typedef	struct {
	ilb_stack_t	*ilbs;	/* stack the rule belongs to */
	ilb_rule_t	*rule;	/* rule to be processed */
} ilb_rule_tq_t;
119 
120 /* kstat handling routines. */
121 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
122 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
123 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
124 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
125     ilb_server_t *);
126 
127 /* Rule hash handling routines. */
128 static void ilb_rule_hash_init(ilb_stack_t *);
129 static void ilb_rule_hash_fini(ilb_stack_t *);
130 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
131 static void ilb_rule_hash_del(ilb_rule_t *);
132 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
133     in_port_t, zoneid_t, uint32_t, boolean_t *);
134 
135 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
136 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
137 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
138 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
139     int *);
140 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
141     int, in_port_t, in_port_t, const in6_addr_t *);
142 
143 /* Back end server handling routines. */
144 static void ilb_server_free(ilb_server_t *);
145 
146 /* Network stack handling routines. */
147 static void *ilb_stack_init(netstackid_t, netstack_t *);
148 static void ilb_stack_shutdown(netstackid_t, void *);
149 static void ilb_stack_fini(netstackid_t, void *);
150 
151 /* Sticky connection handling routines. */
152 static void ilb_rule_sticky_init(ilb_rule_t *);
153 static void ilb_rule_sticky_fini(ilb_rule_t *);
154 
/*
 * Handy macro to check for unspecified address.  addr is a pointer to an
 * in6_addr_t.  A v4-mapped address is tested against the v4-mapped "any"
 * address; anything else is tested against the IPv6 unspecified address.
 * addr is evaluated twice.
 */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))
159 
/*
 * Global kstat instance counter.  When a rule is created, ilb_rule_add()
 * bumps this counter with atomic_add_int_nv() and uses the returned (new)
 * value as the rule's kstat instance number, so the first rule created
 * gets instance 1.  The same instance number is used for the kstats of
 * the rule's servers.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.  A server's kstat class
 * name is built from the rule name; see ilb_server_kstat_init().
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"
174 
175 static kstat_t *
176 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
177 {
178 	kstat_t *ksp;
179 	ilb_g_kstat_t template = {
180 		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
181 		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
182 		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
183 	};
184 
185 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
186 	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
187 	    KSTAT_FLAG_VIRTUAL, stackid);
188 	if (ksp == NULL)
189 		return (NULL);
190 	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
191 	ksp->ks_data = ilbs->ilbs_kstat;
192 	ksp->ks_private = (void *)(uintptr_t)stackid;
193 
194 	kstat_install(ksp);
195 	return (ksp);
196 }
197 
198 static void
199 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
200 {
201 	if (ilbs->ilbs_ksp != NULL) {
202 		ASSERT(stackid == (netstackid_t)(uintptr_t)
203 		    ilbs->ilbs_ksp->ks_private);
204 		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
205 		ilbs->ilbs_ksp = NULL;
206 	}
207 }
208 
209 static kstat_t *
210 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
211 {
212 	kstat_t *ksp;
213 	ilb_rule_kstat_t template = {
214 		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
215 		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
216 		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
217 		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
218 		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
219 		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
220 		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
221 		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
222 		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
223 		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
224 		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
225 		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
226 		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
227 	};
228 
229 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
230 	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
231 	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
232 	if (ksp == NULL)
233 		return (NULL);
234 
235 	bcopy(&template, &rule->ir_kstat, sizeof (template));
236 	ksp->ks_data = &rule->ir_kstat;
237 	ksp->ks_private = (void *)(uintptr_t)stackid;
238 
239 	kstat_install(ksp);
240 	return (ksp);
241 }
242 
243 static kstat_t *
244 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
245     ilb_server_t *server)
246 {
247 	kstat_t *ksp;
248 	ilb_server_kstat_t template = {
249 		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
250 		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
251 		{ "ip_address",		KSTAT_DATA_STRING, 0 }
252 	};
253 	char cname_buf[KSTAT_STRLEN];
254 
255 	/* 7 is "-sstat" */
256 	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
257 	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
258 	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
259 	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
260 	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
261 	if (ksp == NULL)
262 		return (NULL);
263 
264 	bcopy(&template, &server->iser_kstat, sizeof (template));
265 	ksp->ks_data = &server->iser_kstat;
266 	ksp->ks_private = (void *)(uintptr_t)stackid;
267 
268 	kstat_named_setstr(&server->iser_kstat.ip_address,
269 	    server->iser_ip_addr);
270 	/* We never change the IP address */
271 	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
272 
273 	kstat_install(ksp);
274 	return (ksp);
275 }
276 
277 /* Initialize the rule hash table. */
278 static void
279 ilb_rule_hash_init(ilb_stack_t *ilbs)
280 {
281 	int i;
282 
283 	/*
284 	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
285 	 * the next power of 2.
286 	 */
287 	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
288 		for (i = 0; i < 31; i++) {
289 			if (ilbs->ilbs_rule_hash_size < (1 << i))
290 				break;
291 		}
292 		ilbs->ilbs_rule_hash_size = 1 << i;
293 	}
294 	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
295 	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
296 	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
297 		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
298 		    MUTEX_DEFAULT, NULL);
299 	}
300 }
301 
302 /* Clean up the rule hash table. */
303 static void
304 ilb_rule_hash_fini(ilb_stack_t *ilbs)
305 {
306 	if (ilbs->ilbs_g_hash == NULL)
307 		return;
308 	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
309 	    ilbs->ilbs_rule_hash_size);
310 }
311 
312 /* Add a rule to the rule hash table. */
313 static void
314 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
315 {
316 	int i;
317 
318 	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
319 	    ilbs->ilbs_rule_hash_size);
320 	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
321 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
322 	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
323 	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
324 		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
325 	rule->ir_hash_prev = NULL;
326 	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
327 
328 	rule->ir_hash = &ilbs->ilbs_g_hash[i];
329 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
330 }
331 
332 /*
333  * Remove a rule from the rule hash table.  Note that the rule is not freed
334  * in this routine.
335  */
336 static void
337 ilb_rule_hash_del(ilb_rule_t *rule)
338 {
339 	mutex_enter(&rule->ir_hash->ilb_hash_lock);
340 	if (rule->ir_hash->ilb_hash_rule == rule) {
341 		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
342 		if (rule->ir_hash_next != NULL)
343 			rule->ir_hash_next->ir_hash_prev = NULL;
344 	} else {
345 		if (rule->ir_hash_prev != NULL)
346 			rule->ir_hash_prev->ir_hash_next =
347 			    rule->ir_hash_next;
348 		if (rule->ir_hash_next != NULL) {
349 			rule->ir_hash_next->ir_hash_prev =
350 			    rule->ir_hash_prev;
351 		}
352 	}
353 	mutex_exit(&rule->ir_hash->ilb_hash_lock);
354 
355 	rule->ir_hash_next = NULL;
356 	rule->ir_hash_prev = NULL;
357 	rule->ir_hash = NULL;
358 }
359 
360 /*
361  * Given the info of a packet, look for a match in the rule hash table.
362  */
363 static ilb_rule_t *
364 ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
365     in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
366 {
367 	int i;
368 	ilb_rule_t *rule;
369 	ipaddr_t v4_addr;
370 
371 	*busy = B_FALSE;
372 	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
373 	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
374 	port = ntohs(port);
375 
376 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
377 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
378 	    rule = rule->ir_hash_next) {
379 		if (!rule->ir_port_range) {
380 			if (rule->ir_min_port != port)
381 				continue;
382 		} else {
383 			if (port < rule->ir_min_port ||
384 			    port > rule->ir_max_port) {
385 				continue;
386 			}
387 		}
388 		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
389 		    rule->ir_zoneid != zoneid) {
390 			continue;
391 		}
392 
393 		if (l3 == IPPROTO_IP) {
394 			if (rule->ir_target_v4 != INADDR_ANY &&
395 			    rule->ir_target_v4 != v4_addr) {
396 				continue;
397 			}
398 		} else {
399 			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
400 			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
401 				continue;
402 			}
403 		}
404 
405 		/*
406 		 * Just update the stats if the rule is disabled.
407 		 */
408 		mutex_enter(&rule->ir_lock);
409 		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
410 			ILB_R_KSTAT(rule, pkt_not_processed);
411 			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
412 			mutex_exit(&rule->ir_lock);
413 			rule = NULL;
414 			break;
415 		} else if (rule->ir_flags & ILB_RULE_BUSY) {
416 			/*
417 			 * If we are busy...
418 			 *
419 			 * XXX we should have a queue to postpone the
420 			 * packet processing.  But this requires a
421 			 * mechanism in IP to re-start the packet
422 			 * processing.  So for now, just drop the packet.
423 			 */
424 			ILB_R_KSTAT(rule, pkt_dropped);
425 			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
426 			mutex_exit(&rule->ir_lock);
427 			*busy = B_TRUE;
428 			rule = NULL;
429 			break;
430 		} else {
431 			rule->ir_refcnt++;
432 			ASSERT(rule->ir_refcnt != 1);
433 			mutex_exit(&rule->ir_lock);
434 			break;
435 		}
436 	}
437 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
438 	return (rule);
439 }
440 
441 /*
442  * Add a rule to the global rule list.  This list is for finding all rules
443  * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
444  */
445 static void
446 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
447 {
448 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
449 	rule->ir_next = ilbs->ilbs_rule_head;
450 	ilbs->ilbs_rule_head = rule;
451 	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
452 }
453 
454 /* The call is assumed to hold the ilbs_g_lock. */
455 static void
456 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
457 {
458 	ilb_rule_t *tmp_rule;
459 	ilb_rule_t *prev_rule;
460 
461 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
462 	prev_rule = NULL;
463 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
464 	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
465 		if (tmp_rule == rule)
466 			break;
467 	}
468 	if (tmp_rule == NULL) {
469 		mutex_exit(&ilbs->ilbs_g_lock);
470 		return;
471 	}
472 	if (prev_rule == NULL)
473 		ilbs->ilbs_rule_head = tmp_rule->ir_next;
474 	else
475 		prev_rule->ir_next = tmp_rule->ir_next;
476 	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
477 }
478 
479 /*
480  * Helper routine to calculate how many source addresses are in a given
481  * range.
482  */
483 static int64_t
484 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
485 {
486 	int64_t ret;
487 	uint32_t addr1, addr2;
488 
489 	/*
490 	 * Here we assume that the max number of NAT source cannot be
491 	 * large such that the most significant 2 s6_addr32 must be
492 	 * equal.
493 	 */
494 	addr1 = ntohl(a1->s6_addr32[3]);
495 	addr2 = ntohl(a2->s6_addr32[3]);
496 	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
497 	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
498 	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
499 	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
500 		return (-1);
501 	}
502 	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
503 		return (addr2 - addr1 + 1);
504 	} else {
505 		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
506 		ret <<= 32;
507 		ret = ret + addr1 - addr2;
508 		return (ret + 1);
509 	}
510 }
511 
512 /*
513  * Add an ILB rule.
514  */
515 int
516 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
517 {
518 	ilb_rule_t *rule;
519 	netstackid_t stackid;
520 	int ret;
521 	in_port_t min_port, max_port;
522 	int64_t num_src;
523 
524 	/* Sanity checks. */
525 	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
526 		return (EINVAL);
527 
528 	/* Need to support SCTP... */
529 	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
530 		return (EINVAL);
531 
532 	/* For full NAT, the NAT source must be supplied. */
533 	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
534 		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
535 		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
536 			return (EINVAL);
537 		}
538 	}
539 
540 	/* Check invalid mask */
541 	if ((cmd->flags & ILB_RULE_STICKY) &&
542 	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
543 		return (EINVAL);
544 	}
545 
546 	/* Port is passed in network byte order. */
547 	min_port = ntohs(cmd->min_port);
548 	max_port = ntohs(cmd->max_port);
549 	if (min_port > max_port)
550 		return (EINVAL);
551 
552 	/* min_port == 0 means "all ports". Make it so */
553 	if (min_port == 0) {
554 		min_port = 1;
555 		max_port = 65535;
556 	}
557 
558 	/* Funny address checking. */
559 	if (cmd->ip_ver == IPPROTO_IP) {
560 		in_addr_t v4_addr1, v4_addr2;
561 
562 		v4_addr1 = cmd->vip.s6_addr32[3];
563 		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
564 		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
565 		    v4_addr1 == INADDR_ANY ||
566 		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
567 			return (EINVAL);
568 		}
569 
570 		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
571 			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
572 			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
573 			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
574 			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
575 			    v4_addr1 == INADDR_BROADCAST ||
576 			    v4_addr2 == INADDR_BROADCAST ||
577 			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
578 			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
579 			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
580 			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
581 				return (EINVAL);
582 			}
583 
584 			num_src = v4_addr2 - v4_addr1 + 1;
585 			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
586 				return (EINVAL);
587 		}
588 	} else {
589 		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
590 		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
591 		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
592 		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
593 			return (EINVAL);
594 		}
595 
596 		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
597 			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
598 			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
599 			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
600 			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
601 			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
602 			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
603 			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
604 			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
605 				return (EINVAL);
606 			}
607 
608 			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
609 			    &cmd->nat_src_end)) < 0 ||
610 			    num_src > ILB_MAX_NAT_SRC) {
611 				return (EINVAL);
612 			}
613 		}
614 	}
615 
616 	mutex_enter(&ilbs->ilbs_g_lock);
617 	if (ilbs->ilbs_g_hash == NULL)
618 		ilb_rule_hash_init(ilbs);
619 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
620 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
621 		ilb_conn_hash_init(ilbs);
622 		ilb_nat_src_init(ilbs);
623 	}
624 
625 	/* Make sure that the new rule does not duplicate an existing one. */
626 	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
627 	    min_port, max_port, &cmd->vip)) {
628 		mutex_exit(&ilbs->ilbs_g_lock);
629 		return (EEXIST);
630 	}
631 
632 	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
633 	if (rule == NULL) {
634 		mutex_exit(&ilbs->ilbs_g_lock);
635 		return (ENOMEM);
636 	}
637 
638 	/* ir_name is all 0 to begin with */
639 	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);
640 
641 	rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1);
642 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
643 	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
644 		ret = ENOMEM;
645 		goto error;
646 	}
647 
648 	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
649 		rule->ir_nat_src_start = cmd->nat_src_start;
650 		rule->ir_nat_src_end = cmd->nat_src_end;
651 	}
652 
653 	rule->ir_ipver = cmd->ip_ver;
654 	rule->ir_proto = cmd->proto;
655 	rule->ir_topo = cmd->topo;
656 
657 	rule->ir_min_port = min_port;
658 	rule->ir_max_port = max_port;
659 	if (rule->ir_min_port != rule->ir_max_port)
660 		rule->ir_port_range = B_TRUE;
661 	else
662 		rule->ir_port_range = B_FALSE;
663 
664 	rule->ir_zoneid = zoneid;
665 
666 	rule->ir_target_v6 = cmd->vip;
667 	rule->ir_servers = NULL;
668 
669 	/*
670 	 * The default connection drain timeout is indefinite (value 0),
671 	 * meaning we will wait for all connections to finish.  So we
672 	 * can assign cmd->conn_drain_timeout to it directly.
673 	 */
674 	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
675 	if (cmd->nat_expiry != 0) {
676 		rule->ir_nat_expiry = cmd->nat_expiry;
677 	} else {
678 		switch (rule->ir_proto) {
679 		case IPPROTO_TCP:
680 			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
681 			break;
682 		case IPPROTO_UDP:
683 			rule->ir_nat_expiry = ilb_conn_udp_expiry;
684 			break;
685 		default:
686 			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
687 			    (void *)rule);
688 			break;
689 		}
690 	}
691 	if (cmd->sticky_expiry != 0)
692 		rule->ir_sticky_expiry = cmd->sticky_expiry;
693 	else
694 		rule->ir_sticky_expiry = ilb_sticky_expiry;
695 
696 	if (cmd->flags & ILB_RULE_STICKY) {
697 		rule->ir_flags |= ILB_RULE_STICKY;
698 		rule->ir_sticky_mask = cmd->sticky_mask;
699 		if (ilbs->ilbs_sticky_hash == NULL)
700 			ilb_sticky_hash_init(ilbs);
701 	}
702 	if (cmd->flags & ILB_RULE_ENABLED)
703 		rule->ir_flags |= ILB_RULE_ENABLED;
704 
705 	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
706 	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
707 
708 	rule->ir_refcnt = 1;
709 
710 	switch (cmd->algo) {
711 	case ILB_ALG_IMPL_ROUNDROBIN:
712 		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
713 			ret = ENOMEM;
714 			goto error;
715 		}
716 		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
717 		break;
718 	case ILB_ALG_IMPL_HASH_IP:
719 	case ILB_ALG_IMPL_HASH_IP_SPORT:
720 	case ILB_ALG_IMPL_HASH_IP_VIP:
721 		if ((rule->ir_alg = ilb_alg_hash_init(rule,
722 		    &cmd->algo)) == NULL) {
723 			ret = ENOMEM;
724 			goto error;
725 		}
726 		rule->ir_alg_type = cmd->algo;
727 		break;
728 	default:
729 		ret = EINVAL;
730 		goto error;
731 	}
732 
733 	/* Add it to the global list and hash array at the end. */
734 	ilb_rule_g_add(ilbs, rule);
735 	ilb_rule_hash_add(ilbs, rule, &cmd->vip);
736 
737 	mutex_exit(&ilbs->ilbs_g_lock);
738 
739 	return (0);
740 
741 error:
742 	mutex_exit(&ilbs->ilbs_g_lock);
743 	if (rule->ir_ksp != NULL) {
744 		/* stackid must be initialized if ir_ksp != NULL */
745 		kstat_delete_netstack(rule->ir_ksp, stackid);
746 	}
747 	kmem_free(rule, sizeof (ilb_rule_t));
748 	return (ret);
749 }
750 
751 /*
752  * The final part in deleting a rule.  Either called directly or by the
753  * taskq dispatched.
754  */
755 static void
756 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
757 {
758 	netstackid_t stackid;
759 	ilb_server_t *server;
760 
761 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
762 
763 	/*
764 	 * Let the algorithm know that the rule is going away.  The
765 	 * algorithm fini routine will free all its resources with this
766 	 * rule.
767 	 */
768 	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);
769 
770 	while ((server = tmp_rule->ir_servers) != NULL) {
771 		mutex_enter(&server->iser_lock);
772 		ilb_destroy_nat_src(&server->iser_nat_src);
773 		if (tmp_rule->ir_conn_drain_timeout != 0) {
774 			/*
775 			 * The garbage collection thread checks this value
776 			 * without grabing a lock.  So we need to use
777 			 * atomic_swap_64() to make sure that the value seen
778 			 * by gc thread is intact.
779 			 */
780 			(void) atomic_swap_64(
781 			    (uint64_t *)&server->iser_die_time, lbolt64 +
782 			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
783 		}
784 		while (server->iser_refcnt > 1)
785 			cv_wait(&server->iser_cv, &server->iser_lock);
786 		tmp_rule->ir_servers = server->iser_next;
787 		kstat_delete_netstack(server->iser_ksp, stackid);
788 		kmem_free(server, sizeof (ilb_server_t));
789 	}
790 
791 	ASSERT(tmp_rule->ir_ksp != NULL);
792 	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
793 
794 	kmem_free(tmp_rule, sizeof (ilb_rule_t));
795 }
796 
797 /* The routine executed by the delayed rule taskq. */
798 static void
799 ilb_rule_del_tq(void *arg)
800 {
801 	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
802 	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
803 
804 	mutex_enter(&rule->ir_lock);
805 	while (rule->ir_refcnt > 1)
806 		cv_wait(&rule->ir_cv, &rule->ir_lock);
807 	ilb_rule_del_common(ilbs, rule);
808 	kmem_free(arg, sizeof (ilb_rule_tq_t));
809 }
810 
811 /* Routine to delete a rule. */
812 int
813 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
814 {
815 	ilb_rule_t *tmp_rule;
816 	ilb_rule_tq_t *arg;
817 	int err;
818 
819 	mutex_enter(&ilbs->ilbs_g_lock);
820 	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
821 	    &err)) == NULL) {
822 		mutex_exit(&ilbs->ilbs_g_lock);
823 		return (err);
824 	}
825 
826 	/*
827 	 * First remove the rule from the hash array and the global list so
828 	 * that no one can find this rule any more.
829 	 */
830 	ilb_rule_hash_del(tmp_rule);
831 	ilb_rule_g_del(ilbs, tmp_rule);
832 	mutex_exit(&ilbs->ilbs_g_lock);
833 	ILB_RULE_REFRELE(tmp_rule);
834 
835 	/*
836 	 * Now no one can find this rule, we can remove it once all
837 	 * references to it are dropped and all references to the list
838 	 * of servers are dropped.  So dispatch a task to finish the deletion.
839 	 * We do this instead of letting the last one referencing the
840 	 * rule do it.  The reason is that the last one may be the
841 	 * interrupt thread.  We want to minimize the work it needs to
842 	 * do.  Rule deletion is not a critical task so it can be delayed.
843 	 */
844 	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
845 	arg->ilbs = ilbs;
846 	arg->rule = tmp_rule;
847 	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
848 	    TQ_SLEEP);
849 
850 	return (0);
851 }
852 
853 /*
854  * Given an IP address, check to see if there is a rule using this
855  * as the VIP.  It can be used to check if we need to drop a fragment.
856  */
857 boolean_t
858 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
859 {
860 	int i;
861 	ilb_rule_t *rule;
862 	boolean_t ret = B_FALSE;
863 
864 	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
865 	    ilbs->ilbs_rule_hash_size);
866 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
867 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
868 	    rule = rule->ir_hash_next) {
869 		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
870 			mutex_enter(&rule->ir_lock);
871 			if (rule->ir_flags & ILB_RULE_BUSY) {
872 				mutex_exit(&rule->ir_lock);
873 				break;
874 			}
875 			if (ret_rule != NULL) {
876 				rule->ir_refcnt++;
877 				mutex_exit(&rule->ir_lock);
878 				*ret_rule = rule;
879 			} else {
880 				mutex_exit(&rule->ir_lock);
881 			}
882 			ret = B_TRUE;
883 			break;
884 		}
885 	}
886 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
887 	return (ret);
888 }
889 
890 boolean_t
891 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
892 {
893 	int i;
894 	ilb_rule_t *rule;
895 	boolean_t ret = B_FALSE;
896 
897 	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
898 	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
899 	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
900 	    rule = rule->ir_hash_next) {
901 		if (rule->ir_target_v6.s6_addr32[3] == addr) {
902 			mutex_enter(&rule->ir_lock);
903 			if (rule->ir_flags & ILB_RULE_BUSY) {
904 				mutex_exit(&rule->ir_lock);
905 				break;
906 			}
907 			if (ret_rule != NULL) {
908 				rule->ir_refcnt++;
909 				mutex_exit(&rule->ir_lock);
910 				*ret_rule = rule;
911 			} else {
912 				mutex_exit(&rule->ir_lock);
913 			}
914 			ret = B_TRUE;
915 			break;
916 		}
917 	}
918 	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
919 	return (ret);
920 }
921 
922 static ilb_rule_t *
923 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
924     int *err)
925 {
926 	ilb_rule_t *tmp_rule;
927 
928 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
929 
930 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
931 	    tmp_rule = tmp_rule->ir_next) {
932 		if (tmp_rule->ir_zoneid != zoneid)
933 			continue;
934 		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
935 			mutex_enter(&tmp_rule->ir_lock);
936 			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
937 				mutex_exit(&tmp_rule->ir_lock);
938 				*err = EINPROGRESS;
939 				return (NULL);
940 			}
941 			tmp_rule->ir_refcnt++;
942 			mutex_exit(&tmp_rule->ir_lock);
943 			*err = 0;
944 			return (tmp_rule);
945 		}
946 	}
947 	*err = ENOENT;
948 	return (NULL);
949 }
950 
951 /* To find a rule with a given name and zone in the global rule list. */
952 ilb_rule_t *
953 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
954     int *err)
955 {
956 	ilb_rule_t *tmp_rule;
957 
958 	mutex_enter(&ilbs->ilbs_g_lock);
959 	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
960 	mutex_exit(&ilbs->ilbs_g_lock);
961 	return (tmp_rule);
962 }
963 
964 /* Try to match the given packet info and zone ID with a rule. */
965 static boolean_t
966 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
967     int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
968 {
969 	ilb_rule_t *tmp_rule;
970 
971 	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
972 
973 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
974 	    tmp_rule = tmp_rule->ir_next) {
975 		if (tmp_rule->ir_zoneid != zoneid)
976 			continue;
977 
978 		/*
979 		 * We don't allow the same name in different rules even if all
980 		 * the other rule components are different.
981 		 */
982 		if (strcasecmp(tmp_rule->ir_name, name) == 0)
983 			return (B_TRUE);
984 
985 		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
986 			continue;
987 
988 		/*
989 		 * ir_min_port and ir_max_port are the same if ir_port_range
990 		 * is false.  In this case, if the ir_min|max_port (same) is
991 		 * outside of the given port range, it is OK.  In other cases,
992 		 * check if min and max port are outside a rule's range.
993 		 */
994 		if (tmp_rule->ir_max_port < min_port ||
995 		    tmp_rule->ir_min_port > max_port) {
996 			continue;
997 		}
998 
999 		/*
1000 		 * If l3 is IPv4, the addr passed in is assumed to be
1001 		 * mapped address.
1002 		 */
1003 		if (V6_OR_V4_INADDR_ANY(*addr) ||
1004 		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
1005 		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
1006 			return (B_TRUE);
1007 		}
1008 	}
1009 	return (B_FALSE);
1010 }
1011 
1012 int
1013 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
1014     const char *rule_name, ilb_rule_t *in_rule)
1015 {
1016 	ilb_rule_t *rule;
1017 	int err;
1018 
1019 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1020 	    (in_rule != NULL && rule_name == NULL));
1021 	if ((rule = in_rule) == NULL) {
1022 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1023 		    &err)) == NULL) {
1024 			return (err);
1025 		}
1026 	}
1027 	mutex_enter(&rule->ir_lock);
1028 	rule->ir_flags |= ILB_RULE_ENABLED;
1029 	mutex_exit(&rule->ir_lock);
1030 
1031 	/* Only refrele if the rule is passed in. */
1032 	if (in_rule == NULL)
1033 		ILB_RULE_REFRELE(rule);
1034 	return (0);
1035 }
1036 
1037 int
1038 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
1039     const char *rule_name, ilb_rule_t *in_rule)
1040 {
1041 	ilb_rule_t *rule;
1042 	int err;
1043 
1044 	ASSERT((in_rule == NULL && rule_name != NULL) ||
1045 	    (in_rule != NULL && rule_name == NULL));
1046 	if ((rule = in_rule) == NULL) {
1047 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1048 		    &err)) == NULL) {
1049 			return (err);
1050 		}
1051 	}
1052 	mutex_enter(&rule->ir_lock);
1053 	rule->ir_flags &= ~ILB_RULE_ENABLED;
1054 	mutex_exit(&rule->ir_lock);
1055 
1056 	/* Only refrele if the rule is passed in. */
1057 	if (in_rule == NULL)
1058 		ILB_RULE_REFRELE(rule);
1059 	return (0);
1060 }
1061 
1062 /*
1063  * XXX We should probably have a walker function to walk all rules.  For
1064  * now, just add a simple loop for enable/disable/del.
1065  */
1066 void
1067 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1068 {
1069 	ilb_rule_t *rule;
1070 
1071 	mutex_enter(&ilbs->ilbs_g_lock);
1072 	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
1073 		if (rule->ir_zoneid != zoneid)
1074 			continue;
1075 		/*
1076 		 * No need to hold the rule as we are holding the global
1077 		 * lock so it won't go away.  Ignore the return value here
1078 		 * as the rule is provided so the call cannot fail.
1079 		 */
1080 		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
1081 	}
1082 	mutex_exit(&ilbs->ilbs_g_lock);
1083 }
1084 
1085 void
1086 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1087 {
1088 	ilb_rule_t *rule;
1089 
1090 	mutex_enter(&ilbs->ilbs_g_lock);
1091 	for (rule = ilbs->ilbs_rule_head; rule != NULL;
1092 	    rule = rule->ir_next) {
1093 		if (rule->ir_zoneid != zoneid)
1094 			continue;
1095 		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
1096 	}
1097 	mutex_exit(&ilbs->ilbs_g_lock);
1098 }
1099 
1100 void
1101 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1102 {
1103 	ilb_rule_t *rule;
1104 	ilb_rule_tq_t *arg;
1105 
1106 	mutex_enter(&ilbs->ilbs_g_lock);
1107 	while ((rule = ilbs->ilbs_rule_head) != NULL) {
1108 		if (rule->ir_zoneid != zoneid)
1109 			continue;
1110 		ilb_rule_hash_del(rule);
1111 		ilb_rule_g_del(ilbs, rule);
1112 		mutex_exit(&ilbs->ilbs_g_lock);
1113 
1114 		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
1115 		arg->ilbs = ilbs;
1116 		arg->rule = rule;
1117 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
1118 		    arg, TQ_SLEEP);
1119 
1120 		mutex_enter(&ilbs->ilbs_g_lock);
1121 	}
1122 	mutex_exit(&ilbs->ilbs_g_lock);
1123 }
1124 
1125 /*
1126  * This is just an optimization, so don't grab the global lock.  The
1127  * worst case is that we missed a couple packets.
1128  */
1129 boolean_t
1130 ilb_has_rules(ilb_stack_t *ilbs)
1131 {
1132 	return (ilbs->ilbs_rule_head != NULL);
1133 }
1134 
1135 
1136 static int
1137 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1138     ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
1139 {
1140 	ilb_server_t *tmp_server;
1141 	int ret;
1142 
1143 	ASSERT((rule == NULL && rule_name != NULL) ||
1144 	    (rule != NULL && rule_name == NULL));
1145 
1146 	if (rule == NULL) {
1147 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1148 		    &ret)) == NULL) {
1149 			return (ret);
1150 		}
1151 	}
1152 
1153 	/* Once we get a hold on the rule, no server can be added/deleted. */
1154 	for (tmp_server = rule->ir_servers; tmp_server != NULL;
1155 	    tmp_server = tmp_server->iser_next) {
1156 		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
1157 			break;
1158 	}
1159 	if (tmp_server == NULL) {
1160 		ret = ENOENT;
1161 		goto done;
1162 	}
1163 
1164 	if (enable) {
1165 		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
1166 		    rule->ir_alg->ilb_alg_data);
1167 		if (ret == 0) {
1168 			tmp_server->iser_enabled = B_TRUE;
1169 			tmp_server->iser_die_time = 0;
1170 		}
1171 	} else {
1172 		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
1173 		    rule->ir_alg->ilb_alg_data);
1174 		if (ret == 0) {
1175 			tmp_server->iser_enabled = B_FALSE;
1176 			if (rule->ir_conn_drain_timeout != 0) {
1177 				(void) atomic_swap_64(
1178 				    (uint64_t *)&tmp_server->iser_die_time,
1179 				    lbolt64 + SEC_TO_TICK(
1180 				    rule->ir_conn_drain_timeout));
1181 			}
1182 		}
1183 	}
1184 
1185 done:
1186 	if (rule_name != NULL)
1187 		ILB_RULE_REFRELE(rule);
1188 	return (ret);
1189 }
1190 int
1191 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1192     ilb_rule_t *rule, in6_addr_t *addr)
1193 {
1194 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
1195 }
1196 
1197 int
1198 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1199     ilb_rule_t *rule, in6_addr_t *addr)
1200 {
1201 	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
1202 }
1203 
1204 /*
1205  * Add a back end server to a rule.  If the address is IPv4, it is assumed
1206  * to be passed in as a mapped address.
1207  */
1208 int
1209 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
1210 {
1211 	ilb_server_t	*server;
1212 	netstackid_t	stackid;
1213 	int		ret = 0;
1214 	in_port_t	min_port, max_port;
1215 	in_port_t	range;
1216 
1217 	/* Port is passed in network byte order. */
1218 	min_port = ntohs(info->min_port);
1219 	max_port = ntohs(info->max_port);
1220 	if (min_port > max_port)
1221 		return (EINVAL);
1222 
1223 	/* min_port == 0 means "all ports". Make it so */
1224 	if (min_port == 0) {
1225 		min_port = 1;
1226 		max_port = 65535;
1227 	}
1228 	range = max_port - min_port;
1229 
1230 	mutex_enter(&rule->ir_lock);
1231 	/* If someone is already doing server add/del, sleeps and wait. */
1232 	while (rule->ir_flags & ILB_RULE_BUSY) {
1233 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1234 			mutex_exit(&rule->ir_lock);
1235 			return (EINTR);
1236 		}
1237 	}
1238 
1239 	/*
1240 	 * Set the rule to be busy to make sure that no new packet can
1241 	 * use this rule.
1242 	 */
1243 	rule->ir_flags |= ILB_RULE_BUSY;
1244 
1245 	/* Now wait for all other guys to finish their work. */
1246 	while (rule->ir_refcnt > 2) {
1247 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1248 			mutex_exit(&rule->ir_lock);
1249 			ret = EINTR;
1250 			goto end;
1251 		}
1252 	}
1253 	mutex_exit(&rule->ir_lock);
1254 
1255 	/* Sanity checks... */
1256 	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1257 	    rule->ir_ipver != IPPROTO_IP) ||
1258 	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1259 	    rule->ir_ipver != IPPROTO_IPV6)) {
1260 		ret = EINVAL;
1261 		goto end;
1262 	}
1263 
1264 	/*
1265 	 * Check for valid port range.
1266 	 *
1267 	 * For DSR, there can be no port shifting.  Hence the server
1268 	 * specification must be the same as the rule's.
1269 	 *
1270 	 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
1271 	 * it must be equal to the same value as the rule port range.
1272 	 *
1273 	 */
1274 	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1275 		if (rule->ir_max_port != max_port ||
1276 		    rule->ir_min_port != min_port) {
1277 			ret = EINVAL;
1278 			goto end;
1279 		}
1280 	} else {
1281 		if ((range != rule->ir_max_port - rule->ir_min_port) &&
1282 		    range != 0) {
1283 			ret = EINVAL;
1284 			goto end;
1285 		}
1286 	}
1287 
1288 	/* Check for duplicate. */
1289 	for (server = rule->ir_servers; server != NULL;
1290 	    server = server->iser_next) {
1291 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1292 		    strcasecmp(server->iser_name, info->name) == 0) {
1293 			break;
1294 		}
1295 	}
1296 	if (server != NULL) {
1297 		ret = EEXIST;
1298 		goto end;
1299 	}
1300 
1301 	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1302 		ret = ENOMEM;
1303 		goto end;
1304 	}
1305 
1306 	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1307 	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1308 	    sizeof (server->iser_ip_addr));
1309 	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1310 	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1311 	if (server->iser_ksp == NULL) {
1312 		kmem_free(server, sizeof (ilb_server_t));
1313 		ret = EINVAL;
1314 		goto end;
1315 	}
1316 
1317 	server->iser_stackid = stackid;
1318 	server->iser_addr_v6 = info->addr;
1319 	server->iser_min_port = min_port;
1320 	server->iser_max_port = max_port;
1321 	if (min_port != max_port)
1322 		server->iser_port_range = B_TRUE;
1323 	else
1324 		server->iser_port_range = B_FALSE;
1325 
1326 	/*
1327 	 * If the rule uses NAT, find/create the NAT source entry to use
1328 	 * for this server.
1329 	 */
1330 	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1331 		in_port_t port;
1332 
1333 		/*
1334 		 * If the server uses a port range, our port allocation
1335 		 * scheme needs to treat it as a wildcard.  Refer to the
1336 		 * comments in ilb_nat.c about the scheme.
1337 		 */
1338 		if (server->iser_port_range)
1339 			port = 0;
1340 		else
1341 			port = server->iser_min_port;
1342 
1343 		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1344 		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1345 		    num_nat_src_v6(&rule->ir_nat_src_start,
1346 		    &rule->ir_nat_src_end))) != 0) {
1347 			kstat_delete_netstack(server->iser_ksp, stackid);
1348 			kmem_free(server, sizeof (ilb_server_t));
1349 			goto end;
1350 		}
1351 	}
1352 
1353 	/*
1354 	 * The iser_lock is only used to protect iser_refcnt.  All the other
1355 	 * fields in ilb_server_t should not change, except for iser_enabled.
1356 	 * The worst thing that can happen if iser_enabled is messed up is
1357 	 * that one or two packets may not be load balanced to a server
1358 	 * correctly.
1359 	 */
1360 	server->iser_refcnt = 1;
1361 	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1362 	    B_FALSE;
1363 	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1364 	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1365 
1366 	/* Let the load balancing algorithm know about the addition. */
1367 	ASSERT(rule->ir_alg != NULL);
1368 	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1369 	    rule->ir_alg->ilb_alg_data)) != 0) {
1370 		kstat_delete_netstack(server->iser_ksp, stackid);
1371 		kmem_free(server, sizeof (ilb_server_t));
1372 		goto end;
1373 	}
1374 
1375 	/*
1376 	 * No need to hold ir_lock since no other thread should manipulate
1377 	 * the following fields until ILB_RULE_BUSY is cleared.
1378 	 */
1379 	if (rule->ir_servers == NULL) {
1380 		server->iser_next = NULL;
1381 	} else {
1382 		server->iser_next = rule->ir_servers;
1383 	}
1384 	rule->ir_servers = server;
1385 	ILB_R_KSTAT(rule, num_servers);
1386 
1387 end:
1388 	mutex_enter(&rule->ir_lock);
1389 	rule->ir_flags &= ~ILB_RULE_BUSY;
1390 	cv_signal(&rule->ir_cv);
1391 	mutex_exit(&rule->ir_lock);
1392 	return (ret);
1393 }
1394 
1395 /* The routine executed by the delayed rule processing taskq. */
1396 static void
1397 ilb_server_del_tq(void *arg)
1398 {
1399 	ilb_server_t *server = (ilb_server_t *)arg;
1400 
1401 	mutex_enter(&server->iser_lock);
1402 	while (server->iser_refcnt > 1)
1403 		cv_wait(&server->iser_cv, &server->iser_lock);
1404 	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1405 	kmem_free(server, sizeof (ilb_server_t));
1406 }
1407 
1408 /*
1409  * Delete a back end server from a rule.  If the address is IPv4, it is assumed
1410  * to be passed in as a mapped address.
1411  */
1412 int
1413 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1414     ilb_rule_t *rule, in6_addr_t *addr)
1415 {
1416 	ilb_server_t	*server;
1417 	ilb_server_t	*prev_server;
1418 	int		ret = 0;
1419 
1420 	ASSERT((rule == NULL && rule_name != NULL) ||
1421 	    (rule != NULL && rule_name == NULL));
1422 	if (rule == NULL) {
1423 		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1424 		    &ret)) == NULL) {
1425 			return (ret);
1426 		}
1427 	}
1428 
1429 	mutex_enter(&rule->ir_lock);
1430 	/* If someone is already doing server add/del, sleeps and wait. */
1431 	while (rule->ir_flags & ILB_RULE_BUSY) {
1432 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1433 			if (rule_name != NULL) {
1434 				if (--rule->ir_refcnt <= 2)
1435 					cv_signal(&rule->ir_cv);
1436 			}
1437 			mutex_exit(&rule->ir_lock);
1438 			return (EINTR);
1439 		}
1440 	}
1441 	/*
1442 	 * Set the rule to be busy to make sure that no new packet can
1443 	 * use this rule.
1444 	 */
1445 	rule->ir_flags |= ILB_RULE_BUSY;
1446 
1447 	/* Now wait for all other guys to finish their work. */
1448 	while (rule->ir_refcnt > 2) {
1449 		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1450 			mutex_exit(&rule->ir_lock);
1451 			ret = EINTR;
1452 			goto end;
1453 		}
1454 	}
1455 	mutex_exit(&rule->ir_lock);
1456 
1457 	prev_server = NULL;
1458 	for (server = rule->ir_servers; server != NULL;
1459 	    prev_server = server, server = server->iser_next) {
1460 		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1461 			break;
1462 	}
1463 	if (server == NULL) {
1464 		ret = ENOENT;
1465 		goto end;
1466 	}
1467 
1468 	/*
1469 	 * Let the load balancing algorithm know about the removal.
1470 	 * The algorithm may disallow the removal...
1471 	 */
1472 	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1473 	    rule->ir_alg->ilb_alg_data)) != 0) {
1474 		goto end;
1475 	}
1476 
1477 	if (prev_server == NULL)
1478 		rule->ir_servers = server->iser_next;
1479 	else
1480 		prev_server->iser_next = server->iser_next;
1481 
1482 	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1483 
1484 	/*
1485 	 * Mark the server as disabled so that if there is any sticky cache
1486 	 * using this server around, it won't be used.
1487 	 */
1488 	server->iser_enabled = B_FALSE;
1489 
1490 	mutex_enter(&server->iser_lock);
1491 
1492 	/*
1493 	 * De-allocate the NAT source array.  The indiviual ilb_nat_src_entry_t
1494 	 * may not go away if there is still a conn using it.  The NAT source
1495 	 * timer will do the garbage collection.
1496 	 */
1497 	ilb_destroy_nat_src(&server->iser_nat_src);
1498 
1499 	/* If there is a hard limit on when a server should die, set it. */
1500 	if (rule->ir_conn_drain_timeout != 0) {
1501 		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1502 		    lbolt64 + SEC_TO_TICK(rule->ir_conn_drain_timeout));
1503 	}
1504 
1505 	if (server->iser_refcnt > 1) {
1506 		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1507 		    server, TQ_SLEEP);
1508 		mutex_exit(&server->iser_lock);
1509 	} else {
1510 		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1511 		kmem_free(server, sizeof (ilb_server_t));
1512 	}
1513 
1514 end:
1515 	mutex_enter(&rule->ir_lock);
1516 	rule->ir_flags &= ~ILB_RULE_BUSY;
1517 	if (rule_name != NULL)
1518 		rule->ir_refcnt--;
1519 	cv_signal(&rule->ir_cv);
1520 	mutex_exit(&rule->ir_lock);
1521 	return (ret);
1522 }
1523 
1524 /*
1525  * First check if the destination of the ICMP message matches a VIP of
1526  * a rule.  If it does not, just return ILB_PASSED.
1527  *
1528  * If the destination matches a VIP:
1529  *
1530  * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1531  * server.
1532  *
1533  * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
1534  * and see which back end server we should send this message to.  And we
1535  * need to do NAT on both the payload message and the outside IP packet.
1536  *
1537  * For other ICMP messages, drop them.
1538  */
1539 /* ARGSUSED */
1540 static int
1541 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1542     icmph_t *icmph, ipaddr_t *lb_dst)
1543 {
1544 	ipaddr_t vip;
1545 	ilb_rule_t *rule;
1546 	in6_addr_t addr6;
1547 
1548 	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1549 		return (ILB_PASSED);
1550 
1551 
1552 	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1553 		ILB_R_KSTAT(rule, icmp_dropped);
1554 		ILB_RULE_REFRELE(rule);
1555 		return (ILB_DROPPED);
1556 	}
1557 
1558 	switch (icmph->icmph_type) {
1559 	case ICMP_ECHO_REQUEST:
1560 		ILB_R_KSTAT(rule, icmp_echo_processed);
1561 		ILB_RULE_REFRELE(rule);
1562 
1563 		icmph->icmph_type = ICMP_ECHO_REPLY;
1564 		icmph->icmph_checksum = 0;
1565 		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1566 		ipha->ipha_ttl =
1567 		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1568 		*lb_dst = ipha->ipha_src;
1569 		vip = ipha->ipha_dst;
1570 		ipha->ipha_dst = ipha->ipha_src;
1571 		ipha->ipha_src = vip;
1572 		return (ILB_BALANCED);
1573 	case ICMP_DEST_UNREACHABLE: {
1574 		int ret;
1575 
1576 		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1577 			ILB_R_KSTAT(rule, icmp_dropped);
1578 			ILB_RULE_REFRELE(rule);
1579 			return (ILB_DROPPED);
1580 		}
1581 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1582 		    &addr6)) {
1583 			ILB_R_KSTAT(rule, icmp_2big_processed);
1584 			ret = ILB_BALANCED;
1585 		} else {
1586 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1587 			ret = ILB_DROPPED;
1588 		}
1589 		ILB_RULE_REFRELE(rule);
1590 		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1591 		return (ret);
1592 	}
1593 	default:
1594 		ILB_R_KSTAT(rule, icmp_dropped);
1595 		ILB_RULE_REFRELE(rule);
1596 		return (ILB_DROPPED);
1597 	}
1598 }
1599 
1600 /* ARGSUSED */
1601 static int
1602 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1603     icmp6_t *icmp6, in6_addr_t *lb_dst)
1604 {
1605 	ilb_rule_t *rule;
1606 
1607 	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1608 		return (ILB_PASSED);
1609 
1610 	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1611 		ILB_R_KSTAT(rule, icmp_dropped);
1612 		ILB_RULE_REFRELE(rule);
1613 		return (ILB_DROPPED);
1614 	}
1615 
1616 	switch (icmp6->icmp6_type) {
1617 	case ICMP6_ECHO_REQUEST: {
1618 		int hdr_len;
1619 
1620 		ILB_R_KSTAT(rule, icmp_echo_processed);
1621 		ILB_RULE_REFRELE(rule);
1622 
1623 		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1624 		icmp6->icmp6_cksum = ip6h->ip6_plen;
1625 		hdr_len = (char *)icmp6 - (char *)ip6h;
1626 		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1627 		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1628 		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1629 		ip6h->ip6_hops =
1630 		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1631 		*lb_dst = ip6h->ip6_src;
1632 		ip6h->ip6_src = ip6h->ip6_dst;
1633 		ip6h->ip6_dst = *lb_dst;
1634 		return (ILB_BALANCED);
1635 	}
1636 	case ICMP6_PACKET_TOO_BIG: {
1637 		int ret;
1638 
1639 		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1640 		    lb_dst)) {
1641 			ILB_R_KSTAT(rule, icmp_2big_processed);
1642 			ret = ILB_BALANCED;
1643 		} else {
1644 			ILB_R_KSTAT(rule, icmp_2big_dropped);
1645 			ret = ILB_DROPPED;
1646 		}
1647 		ILB_RULE_REFRELE(rule);
1648 		return (ret);
1649 	}
1650 	default:
1651 		ILB_R_KSTAT(rule, icmp_dropped);
1652 		ILB_RULE_REFRELE(rule);
1653 		return (ILB_DROPPED);
1654 	}
1655 }
1656 
/*
 * Common routine to check an incoming packet and decide what to do with it.
 * Called by ilb_check_v4|v6().
 *
 * src and dst are the packet's addresses as in6_addr_t (ilb_check_v4()
 * passes v4-mapped addresses), l3 is IPPROTO_IP or IPPROTO_IPV6, l4 is the
 * transport protocol, iph and tph point to the IP and transport headers,
 * and pkt_len is the length added to the byte counters.
 *
 * Returns:
 *	ILB_PASSED	the protocol is neither TCP nor UDP, or the packet
 *			matches no existing conn and no rule;
 *	ILB_BALANCED	the packet belongs to an existing conn, or a rule
 *			matched and a back end server was chosen;
 *	ILB_DROPPED	the transport header is truncated, the matching rule
 *			is busy, no server is available, or a resource (NAT
 *			source port, conn/sticky entry) could not be obtained.
 *
 * On the rule path *lb_dst is set to the address of the chosen server; on
 * the fast path lb_dst is handed to ilb_check_conn().
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t		sport, dport;
	tcpha_t			*tcph;
	udpha_t			*udph;
	ilb_rule_t		*rule;
	ilb_server_t		*server;
	boolean_t		balanced;
	struct ilb_sticky_s	*s = NULL;
	int			ret;
	uint32_t		ip_sum, tp_sum;
	ilb_nat_info_t		info;
	uint16_t		nat_src_idx;
	boolean_t		busy;

	/*
	 * We don't really need to switch here since both protocols's
	 * ports are at the same offset.  Just prepare for future protocol
	 * specific processing.
	 *
	 * A packet whose minimum transport header does not fit before
	 * b_wptr is dropped.  The ports are kept as found in the header
	 * (network byte order; see the ntohs() conversions further down).
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		/* Only TCP and UDP are load balanced here. */
		return (ILB_PASSED);
	}

	/* Fast path, there is an existing conn. */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * If there is no existing connection for the incoming packet, check
	 * to see if the packet matches a rule.  If not, just let IP decide
	 * what to do with it.
	 *
	 * Note: a reply from back end server should not match a rule.  A
	 * reply should match one existing conn.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		/* If the rule is busy, just drop the packet. */
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/*
	 * The packet matches a rule, use the rule load balance algorithm
	 * to find a server.  From here on every exit path has to release
	 * the rule with ILB_RULE_REFRELE().
	 */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);
	/*
	 * This can only happen if there is no server in a rule or all
	 * the servers are currently disabled.
	 */
	if (!balanced)
		goto no_server;

	/*
	 * If the rule is sticky enabled, we need to check the sticky table.
	 * If there is a sticky entry for the client, use the previous server
	 * instead of the one found above (note that both can be the same).
	 * If there is no entry for that client, add an entry to the sticky
	 * table.  Both the find and add are done in ilb_sticky_find_add()
	 * to avoid checking for duplicate when adding an entry.
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		/* The sticky key is the client address under the mask. */
		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			goto no_server;
		}
	}

	/*
	 * We are holding a reference on the rule, so the server
	 * cannot go away.
	 */
	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t	*src_ent;
		uint16_t		*src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		/*
		 * No NAT source address/port available: release the sticky
		 * reference (if one was taken above) and drop the packet.
		 */
		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		/*
		 * With a port range on both the rule and the server, keep
		 * the packet's offset within the range; otherwise use the
		 * server's single (minimum) port.  dport is in network byte
		 * order while the rule and server ports are in host order.
		 */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * sticky info and de-allocate the NAT source port allocated
		 * above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		/* Half NAT only fills in the destination side of info. */
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (life time of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		/* CE_PANIC: ret is intentionally left unset on this path. */
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *) rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* This can only happen if there is no server available. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}
1877 
1878 int
1879 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1880     uint8_t *tph, ipaddr_t *lb_dst)
1881 {
1882 	in6_addr_t v6_src, v6_dst, v6_lb_dst;
1883 	int ret;
1884 
1885 	ASSERT(DB_REF(mp) == 1);
1886 
1887 	if (l4 == IPPROTO_ICMP) {
1888 		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1889 		    lb_dst));
1890 	}
1891 
1892 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1893 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1894 	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1895 	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1896 	if (ret == ILB_BALANCED)
1897 		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1898 	return (ret);
1899 }
1900 
1901 int
1902 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1903     uint8_t *tph, in6_addr_t *lb_dst)
1904 {
1905 	uint32_t pkt_len;
1906 
1907 	ASSERT(DB_REF(mp) == 1);
1908 
1909 	if (l4 == IPPROTO_ICMPV6) {
1910 		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1911 		    lb_dst));
1912 	}
1913 
1914 	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1915 	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1916 	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1917 }
1918 
1919 void
1920 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1921 {
1922 	ilb_rule_t *tmp_rule;
1923 
1924 	mutex_enter(&ilbs->ilbs_g_lock);
1925 	*num_rules = 0;
1926 	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1927 	    tmp_rule = tmp_rule->ir_next) {
1928 		if (tmp_rule->ir_zoneid == zoneid)
1929 			*num_rules += 1;
1930 	}
1931 	mutex_exit(&ilbs->ilbs_g_lock);
1932 }
1933 
1934 int
1935 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1936     uint32_t *num_servers)
1937 {
1938 	ilb_rule_t *rule;
1939 	int err;
1940 
1941 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1942 		return (err);
1943 	*num_servers = rule->ir_kstat.num_servers.value.ui64;
1944 	ILB_RULE_REFRELE(rule);
1945 	return (0);
1946 }
1947 
1948 int
1949 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1950     ilb_server_info_t *servers, uint32_t *num_servers)
1951 {
1952 	ilb_rule_t *rule;
1953 	ilb_server_t *server;
1954 	size_t cnt;
1955 	int err;
1956 
1957 	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1958 		return (err);
1959 	for (server = rule->ir_servers, cnt = *num_servers;
1960 	    server != NULL && cnt > 0;
1961 	    server = server->iser_next, cnt--, servers++) {
1962 		(void) memcpy(servers->name, server->iser_name,
1963 		    ILB_SERVER_NAMESZ);
1964 		servers->addr = server->iser_addr_v6;
1965 		servers->min_port = htons(server->iser_min_port);
1966 		servers->max_port = htons(server->iser_max_port);
1967 		servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1968 		servers->err = 0;
1969 	}
1970 	ILB_RULE_REFRELE(rule);
1971 	*num_servers -= cnt;
1972 
1973 	return (0);
1974 }
1975 
1976 void
1977 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1978     char *buf)
1979 {
1980 	ilb_rule_t *tmp_rule;
1981 	int cnt;
1982 
1983 	if (*num_names == 0)
1984 		return;
1985 
1986 	mutex_enter(&ilbs->ilbs_g_lock);
1987 	for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1988 	    tmp_rule = tmp_rule->ir_next) {
1989 		if (tmp_rule->ir_zoneid != zoneid)
1990 			continue;
1991 
1992 		(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1993 		buf += ILB_RULE_NAMESZ;
1994 		if (++cnt == *num_names)
1995 			break;
1996 	}
1997 	mutex_exit(&ilbs->ilbs_g_lock);
1998 	*num_names = cnt;
1999 }
2000 
2001 int
2002 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2003 {
2004 	ilb_rule_t *rule;
2005 	int err;
2006 
2007 	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2008 		return (err);
2009 	}
2010 
2011 	/*
2012 	 * Except the enabled flags, none of the following will change
2013 	 * in the life time of a rule.  So we don't hold the mutex when
2014 	 * reading them.  The worst is to report a wrong enabled flags.
2015 	 */
2016 	cmd->ip_ver = rule->ir_ipver;
2017 	cmd->proto = rule->ir_proto;
2018 	cmd->min_port = htons(rule->ir_min_port);
2019 	cmd->max_port = htons(rule->ir_max_port);
2020 
2021 	cmd->vip = rule->ir_target_v6;
2022 	cmd->algo = rule->ir_alg_type;
2023 	cmd->topo = rule->ir_topo;
2024 
2025 	cmd->nat_src_start = rule->ir_nat_src_start;
2026 	cmd->nat_src_end = rule->ir_nat_src_end;
2027 
2028 	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2029 	cmd->nat_expiry = rule->ir_nat_expiry;
2030 	cmd->sticky_expiry = rule->ir_sticky_expiry;
2031 
2032 	cmd->flags = 0;
2033 	if (rule->ir_flags & ILB_RULE_ENABLED)
2034 		cmd->flags |= ILB_RULE_ENABLED;
2035 	if (rule->ir_flags & ILB_RULE_STICKY) {
2036 		cmd->flags |= ILB_RULE_STICKY;
2037 		cmd->sticky_mask = rule->ir_sticky_mask;
2038 	}
2039 
2040 	ILB_RULE_REFRELE(rule);
2041 	return (0);
2042 }
2043 
2044 static void *
2045 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2046 {
2047 	ilb_stack_t *ilbs;
2048 	char tq_name[TASKQ_NAMELEN];
2049 
2050 	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2051 	ilbs->ilbs_netstack = ns;
2052 
2053 	ilbs->ilbs_rule_head = NULL;
2054 	ilbs->ilbs_g_hash = NULL;
2055 	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2056 
2057 	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
2058 	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
2059 		kmem_free(ilbs, sizeof (ilb_stack_t));
2060 		return (NULL);
2061 	}
2062 
2063 	/*
2064 	 * ilbs_conn/sticky_hash related info is initialized in
2065 	 * ilb_conn/sticky_hash_init().
2066 	 */
2067 	ilbs->ilbs_conn_taskq = NULL;
2068 	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2069 	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2070 	ilbs->ilbs_c2s_conn_hash = NULL;
2071 	ilbs->ilbs_s2c_conn_hash = NULL;
2072 	ilbs->ilbs_conn_timer_list = NULL;
2073 
2074 	ilbs->ilbs_sticky_hash = NULL;
2075 	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2076 	ilbs->ilbs_sticky_timer_list = NULL;
2077 	ilbs->ilbs_sticky_taskq = NULL;
2078 
2079 	/* The allocation is done later when there is a rule using NAT mode. */
2080 	ilbs->ilbs_nat_src = NULL;
2081 	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2082 	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2083 	ilbs->ilbs_nat_src_tid = 0;
2084 
2085 	/* For listing the conn hash table */
2086 	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2087 	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2088 	ilbs->ilbs_conn_list_busy = B_FALSE;
2089 	ilbs->ilbs_conn_list_cur = 0;
2090 	ilbs->ilbs_conn_list_connp = NULL;
2091 
2092 	/* For listing the sticky hash table */
2093 	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2094 	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2095 	ilbs->ilbs_sticky_list_busy = B_FALSE;
2096 	ilbs->ilbs_sticky_list_cur = 0;
2097 	ilbs->ilbs_sticky_list_curp = NULL;
2098 
2099 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p", ns);
2100 	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2101 	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2102 
2103 	return (ilbs);
2104 }
2105 
2106 /* ARGSUSED */
2107 static void
2108 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2109 {
2110 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2111 	ilb_rule_t *tmp_rule;
2112 
2113 	ilb_sticky_hash_fini(ilbs);
2114 	ilb_conn_hash_fini(ilbs);
2115 	mutex_enter(&ilbs->ilbs_g_lock);
2116 	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2117 		ilb_rule_hash_del(tmp_rule);
2118 		ilb_rule_g_del(ilbs, tmp_rule);
2119 		mutex_exit(&ilbs->ilbs_g_lock);
2120 		ilb_rule_del_common(ilbs, tmp_rule);
2121 		mutex_enter(&ilbs->ilbs_g_lock);
2122 	}
2123 	mutex_exit(&ilbs->ilbs_g_lock);
2124 	if (ilbs->ilbs_nat_src != NULL)
2125 		ilb_nat_src_fini(ilbs);
2126 }
2127 
2128 static void
2129 ilb_stack_fini(netstackid_t stackid, void * arg)
2130 {
2131 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2132 
2133 	ilb_rule_hash_fini(ilbs);
2134 	taskq_destroy(ilbs->ilbs_rule_taskq);
2135 	ilb_kstat_g_fini(stackid, ilbs);
2136 	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2137 	kmem_free(ilbs, sizeof (ilb_stack_t));
2138 }
2139 
2140 void
2141 ilb_ddi_g_init(void)
2142 {
2143 	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2144 	    ilb_stack_fini);
2145 }
2146 
2147 void
2148 ilb_ddi_g_destroy(void)
2149 {
2150 	netstack_unregister(NS_ILB);
2151 	ilb_conn_cache_fini();
2152 	ilb_sticky_cache_fini();
2153 }
2154