xref: /titanic_50/usr/src/uts/common/inet/ilb/ilb_conn.c (revision 799823bbed51a695d01e13511bbb1369980bb714)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2014 Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/time.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/sdt.h>
35 #include <sys/atomic.h>
36 #include <netinet/in.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <inet/tcp.h>
40 #include <inet/udp_impl.h>
41 #include <inet/ilb.h>
42 
43 #include "ilb_stack.h"
44 #include "ilb_impl.h"
45 #include "ilb_conn.h"
46 #include "ilb_nat.h"
47 
/*
 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection.  Each
 * ilb_timer_t describes one gc timer together with the slice of the hash
 * table that timer is responsible for.
 *
 * start: starting index into the hash table to do gc (inclusive)
 * end: ending index into the hash table to do gc (exclusive)
 * ilbs: pointer to the ilb_stack_t of the IP stack
 * tid_lock: mutex to protect the timer id.
 * tid: timer id of the timer; 0 tells the timer handler not to restart
 */
typedef struct ilb_timer_s {
	uint32_t	start;
	uint32_t	end;
	ilb_stack_t	*ilbs;
	kmutex_t	tid_lock;
	timeout_id_t	tid;
} ilb_timer_t;
64 
/*
 * Hash macro for finding the index to the conn hash table.  saddr and
 * daddr are byte pointers to 4 bytes of address each; the bytes are XORed
 * pairwise, weighted, and combined with the two ports.  The result is
 * reduced with a mask, so hash_size has to be a power of 2.
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	((((saddr)[3] ^ (daddr)[3]) * 50653 +			\
	((saddr)[2] ^ (daddr)[2]) * 1369 +			\
	((saddr)[1] ^ (daddr)[1]) * 37 +			\
	((saddr)[0] ^ (daddr)[0]) +				\
	(sport) * 37 + (dport)) & ((hash_size) - 1))
72 
/* Kmem cache from which the conn hash entries (ilb_conn_t) are allocated */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc timer is responsible for about 1/60 of the conn hash table.
 */
static int ilb_conn_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;
84 
/*
 * Hash macro mixing 4 address bytes (saddr is a byte pointer) with an
 * integer rule value, shifted one byte at a time, into a table index.
 * Going by its name it indexes the sticky hash table.  The result is
 * masked, so hash_size has to be a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	((((saddr)[3] ^ ((rule) >> 24)) * 29791 +		\
	((saddr)[2] ^ ((rule) >> 16)) * 961 +			\
	((saddr)[1] ^ ((rule) >> 8)) * 31 +			\
	((saddr)[0] ^ (rule))) & ((hash_size) - 1))
90 
/* Kmem cache for the sticky hash entry (by analogy with ilb_conn_cache) */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc timer is responsible for about 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
101 
/*
 * Drop one reference on the sticky entry s and stamp its access time with
 * the current lbolt.  Both updates are done while holding the sticky_lock
 * of the hash bucket the entry points to.  The argument is expanded several
 * times, so it must not have side effects; it is fully parenthesized so
 * that any pointer expression (not only a plain identifier) can be passed.
 */
#define	ILB_STICKY_REFRELE(s)			\
{						\
	mutex_enter(&(s)->hash->sticky_lock);	\
	(s)->refcnt--;				\
	(s)->atime = ddi_get_lbolt64();		\
	mutex_exit(&(s)->hash->sticky_lock);	\
}
109 
110 
111 static void
112 ilb_conn_cache_init(void)
113 {
114 	ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
115 	    sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
116 	    ilb_kmem_flags);
117 }
118 
119 void
120 ilb_conn_cache_fini(void)
121 {
122 	if (ilb_conn_cache != NULL) {
123 		kmem_cache_destroy(ilb_conn_cache);
124 		ilb_conn_cache = NULL;
125 	}
126 }
127 
128 static void
129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
130 {
131 	ilb_conn_hash_t *hash;
132 	ilb_conn_t **next, **prev;
133 	ilb_conn_t **next_prev, **prev_next;
134 
135 	if (c2s) {
136 		hash = connp->conn_c2s_hash;
137 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
138 		next = &connp->conn_c2s_next;
139 		prev = &connp->conn_c2s_prev;
140 		if (*next != NULL)
141 			next_prev = &(*next)->conn_c2s_prev;
142 		if (*prev != NULL)
143 			prev_next = &(*prev)->conn_c2s_next;
144 	} else {
145 		hash = connp->conn_s2c_hash;
146 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
147 		next = &connp->conn_s2c_next;
148 		prev = &connp->conn_s2c_prev;
149 		if (*next != NULL)
150 			next_prev = &(*next)->conn_s2c_prev;
151 		if (*prev != NULL)
152 			prev_next = &(*prev)->conn_s2c_next;
153 	}
154 
155 	if (hash->ilb_connp == connp) {
156 		hash->ilb_connp = *next;
157 		if (*next != NULL)
158 			*next_prev = NULL;
159 	} else {
160 		if (*prev != NULL)
161 			*prev_next = *next;
162 		if (*next != NULL)
163 			*next_prev = *prev;
164 	}
165 	ASSERT(hash->ilb_conn_cnt > 0);
166 	hash->ilb_conn_cnt--;
167 
168 	*next = NULL;
169 	*prev = NULL;
170 }
171 
172 static void
173 ilb_conn_remove(ilb_conn_t *connp)
174 {
175 	ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
176 	ilb_conn_remove_common(connp, B_TRUE);
177 	ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
178 	ilb_conn_remove_common(connp, B_FALSE);
179 
180 	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
181 		in_port_t port;
182 
183 		port = ntohs(connp->conn_rule_cache.info.nat_sport);
184 		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
185 		    (void *)(uintptr_t)port, 1);
186 	}
187 
188 	if (connp->conn_sticky != NULL)
189 		ILB_STICKY_REFRELE(connp->conn_sticky);
190 	ILB_SERVER_REFRELE(connp->conn_server);
191 	kmem_cache_free(ilb_conn_cache, connp);
192 }
193 
194 /*
195  * Routine to do periodic garbage collection of conn hash entries.  When
196  * a conn hash timer fires, it dispatches a taskq to call this function
197  * to do the gc.  Note that each taskq is responisble for a portion of
198  * the table.  The portion is stored in timer->start, timer->end.
199  */
200 static void
201 ilb_conn_cleanup(void *arg)
202 {
203 	ilb_timer_t *timer = (ilb_timer_t *)arg;
204 	uint32_t i;
205 	ilb_stack_t *ilbs;
206 	ilb_conn_hash_t *c2s_hash, *s2c_hash;
207 	ilb_conn_t *connp, *nxt_connp;
208 	int64_t now;
209 	int64_t expiry;
210 	boolean_t die_now;
211 
212 	ilbs = timer->ilbs;
213 	c2s_hash = ilbs->ilbs_c2s_conn_hash;
214 	ASSERT(c2s_hash != NULL);
215 
216 	now = ddi_get_lbolt64();
217 	for (i = timer->start; i < timer->end; i++) {
218 		mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
219 		if ((connp = c2s_hash[i].ilb_connp) == NULL) {
220 			ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
221 			mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
222 			continue;
223 		}
224 		do {
225 			ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
226 			ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
227 			nxt_connp = connp->conn_c2s_next;
228 			expiry = now - SEC_TO_TICK(connp->conn_expiry);
229 			if (connp->conn_server->iser_die_time != 0 &&
230 			    connp->conn_server->iser_die_time < now)
231 				die_now = B_TRUE;
232 			else
233 				die_now = B_FALSE;
234 			s2c_hash = connp->conn_s2c_hash;
235 			mutex_enter(&s2c_hash->ilb_conn_hash_lock);
236 
237 			if (connp->conn_gc || die_now ||
238 			    (connp->conn_c2s_atime < expiry &&
239 			    connp->conn_s2c_atime < expiry)) {
240 				/* Need to update the nat list cur_connp */
241 				if (connp == ilbs->ilbs_conn_list_connp) {
242 					ilbs->ilbs_conn_list_connp =
243 					    connp->conn_c2s_next;
244 				}
245 				ilb_conn_remove(connp);
246 				goto nxt_connp;
247 			}
248 
249 			if (connp->conn_l4 != IPPROTO_TCP)
250 				goto nxt_connp;
251 
252 			/* Update and check TCP related conn info */
253 			if (connp->conn_c2s_tcp_fin_sent &&
254 			    SEQ_GT(connp->conn_s2c_tcp_ack,
255 			    connp->conn_c2s_tcp_fss)) {
256 				connp->conn_c2s_tcp_fin_acked = B_TRUE;
257 			}
258 			if (connp->conn_s2c_tcp_fin_sent &&
259 			    SEQ_GT(connp->conn_c2s_tcp_ack,
260 			    connp->conn_s2c_tcp_fss)) {
261 				connp->conn_s2c_tcp_fin_acked = B_TRUE;
262 			}
263 			if (connp->conn_c2s_tcp_fin_acked &&
264 			    connp->conn_s2c_tcp_fin_acked) {
265 				ilb_conn_remove(connp);
266 			}
267 nxt_connp:
268 			mutex_exit(&s2c_hash->ilb_conn_hash_lock);
269 			connp = nxt_connp;
270 		} while (connp != NULL);
271 		mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
272 	}
273 }
274 
275 /* Conn hash timer routine.  It dispatches a taskq and restart the timer */
276 static void
277 ilb_conn_timer(void *arg)
278 {
279 	ilb_timer_t *timer = (ilb_timer_t *)arg;
280 
281 	(void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
282 	    arg, TQ_SLEEP);
283 	mutex_enter(&timer->tid_lock);
284 	if (timer->tid == 0) {
285 		mutex_exit(&timer->tid_lock);
286 	} else {
287 		timer->tid = timeout(ilb_conn_timer, arg,
288 		    SEC_TO_TICK(ilb_conn_cache_timeout));
289 		mutex_exit(&timer->tid_lock);
290 	}
291 }
292 
293 void
294 ilb_conn_hash_init(ilb_stack_t *ilbs)
295 {
296 	extern pri_t minclsyspri;
297 	int i, part;
298 	ilb_timer_t *tm;
299 	char tq_name[TASKQ_NAMELEN];
300 
301 	/*
302 	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
303 	 * the next power of 2.
304 	 */
305 	if (!ISP2(ilbs->ilbs_conn_hash_size)) {
306 		for (i = 0; i < 31; i++) {
307 			if (ilbs->ilbs_conn_hash_size < (1 << i))
308 				break;
309 		}
310 		ilbs->ilbs_conn_hash_size = 1 << i;
311 	}
312 
313 	/*
314 	 * Can sleep since this should be called when a rule is being added,
315 	 * hence we are not in interrupt context.
316 	 */
317 	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
318 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
319 	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
320 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
321 
322 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
323 		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
324 		    NULL, MUTEX_DEFAULT, NULL);
325 	}
326 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
327 		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
328 		    NULL, MUTEX_DEFAULT, NULL);
329 	}
330 
331 	if (ilb_conn_cache == NULL)
332 		ilb_conn_cache_init();
333 
334 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
335 	    (void *)ilbs->ilbs_netstack);
336 	ASSERT(ilbs->ilbs_conn_taskq == NULL);
337 	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
338 	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
339 	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
340 
341 	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
342 	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
343 	    ilb_conn_timer_size, KM_SLEEP);
344 
345 	/*
346 	 * The hash table is divided in equal partition for those timers
347 	 * to do garbage collection.
348 	 */
349 	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
350 	for (i = 0; i < ilb_conn_timer_size; i++) {
351 		tm = ilbs->ilbs_conn_timer_list + i;
352 		tm->start = i * part;
353 		tm->end = i * part + part;
354 		if (tm->end > ilbs->ilbs_conn_hash_size)
355 			tm->end = ilbs->ilbs_conn_hash_size;
356 		tm->ilbs = ilbs;
357 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
358 		/* Spread out the starting execution time of all the timers. */
359 		tm->tid = timeout(ilb_conn_timer, tm,
360 		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
361 	}
362 }
363 
364 void
365 ilb_conn_hash_fini(ilb_stack_t *ilbs)
366 {
367 	uint32_t i;
368 	ilb_conn_t *connp;
369 	ilb_conn_hash_t *hash;
370 
371 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
372 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
373 		return;
374 	}
375 
376 	/* Stop all the timers first. */
377 	for (i = 0; i < ilb_conn_timer_size; i++) {
378 		timeout_id_t tid;
379 
380 		/* Setting tid to 0 tells the timer handler not to restart. */
381 		mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
382 		tid = ilbs->ilbs_conn_timer_list[i].tid;
383 		ilbs->ilbs_conn_timer_list[i].tid = 0;
384 		mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
385 		(void) untimeout(tid);
386 	}
387 	kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
388 	    ilb_conn_timer_size);
389 	taskq_destroy(ilbs->ilbs_conn_taskq);
390 	ilbs->ilbs_conn_taskq = NULL;
391 
392 	/* Then remove all the conns. */
393 	hash = ilbs->ilbs_s2c_conn_hash;
394 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
395 		while ((connp = hash[i].ilb_connp) != NULL) {
396 			hash[i].ilb_connp = connp->conn_s2c_next;
397 			ILB_SERVER_REFRELE(connp->conn_server);
398 			if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
399 				ilb_nat_src_entry_t *ent;
400 				in_port_t port;
401 
402 				/*
403 				 * src_ent will be freed in ilb_nat_src_fini().
404 				 */
405 				port = ntohs(
406 				    connp->conn_rule_cache.info.nat_sport);
407 				ent = connp->conn_rule_cache.info.src_ent;
408 				vmem_free(ent->nse_port_arena,
409 				    (void *)(uintptr_t)port, 1);
410 			}
411 			kmem_cache_free(ilb_conn_cache, connp);
412 		}
413 	}
414 	kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
415 	    ilbs->ilbs_conn_hash_size);
416 	kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
417 	    ilbs->ilbs_conn_hash_size);
418 }
419 
420 /*
421  * Internet checksum adjustment calculation routines.  We pre-calculate
422  * checksum adjustment so that we don't need to compute the checksum on
423  * the whole packet when we change address/port in the packet.
424  */
425 
/*
 * Half NAT, IPv4: compute the checksum adjustment for replacing one
 * address/port pair.  oaddr and naddr each point at the two 16-bit words
 * of an IPv4 address.  The old words and old port are summed and folded to
 * 16 bits; *adj_sum receives the one's complement of that folded sum plus
 * the (unfolded) sum of the new words and new port.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum, new_sum;

	old_sum = (uint32_t)oaddr[0] + oaddr[1] + old_port;
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	new_sum = (uint32_t)naddr[0] + naddr[1] + new_port;
	*adj_sum = (uint16_t)~old_sum + new_sum;
}
437 
/*
 * Half NAT, IPv6: same adjustment as the IPv4 variant, but oaddr and
 * naddr each point at the eight 16-bit words of an IPv6 address.
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum = old_port;
	uint32_t new_sum = new_port;
	int w;

	for (w = 0; w < 8; w++) {
		old_sum += oaddr[w];
		new_sum += naddr[w];
	}
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
453 
/*
 * Full NAT, IPv4: compute the checksum adjustment for replacing both
 * address/port pairs at once.  Each address pointer refers to the two
 * 16-bit words of an IPv4 address.  The old words and ports are summed and
 * folded to 16 bits; *adj_sum receives the one's complement of that folded
 * sum plus the (unfolded) sum of the new words and ports.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum, new_sum;

	old_sum = (uint32_t)oaddr1[0] + oaddr1[1] + old_port1;
	old_sum += (uint32_t)oaddr2[0] + oaddr2[1] + old_port2;
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	new_sum = (uint32_t)naddr1[0] + naddr1[1] + new_port1;
	new_sum += (uint32_t)naddr2[0] + naddr2[1] + new_port2;
	*adj_sum = (uint16_t)~old_sum + new_sum;
}
468 
/*
 * Full NAT, IPv6: same adjustment as the IPv4 variant, but each address
 * pointer refers to the eight 16-bit words of an IPv6 address.
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum = (uint32_t)old_port1 + old_port2;
	uint32_t new_sum = (uint32_t)new_port1 + new_port2;
	int w;

	for (w = 0; w < 8; w++) {
		old_sum += (uint32_t)oaddr1[w] + oaddr2[w];
		new_sum += (uint32_t)naddr1[w] + naddr2[w];
	}
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
491 
492 /*
493  * Add a conn hash entry to the tables.  Note that a conn hash entry
494  * (ilb_conn_t) contains info on both directions.  And there are two hash
495  * tables, one for client to server and the other for server to client.
496  * So the same entry is added to both tables and can be ccessed by two
497  * thread simultaneously.  But each thread will only access data on one
498  * direction, so there is no conflict.
499  */
500 int
501 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
502     in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
503     ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
504 {
505 	ilb_conn_t *connp;
506 	ilb_conn_hash_t *hash;
507 	int i;
508 
509 	connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
510 	if (connp == NULL) {
511 		if (s != NULL) {
512 			if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
513 				ilb_nat_src_entry_t **entry;
514 
515 				entry = s->server->iser_nat_src->src_list;
516 				vmem_free(entry[s->nat_src_idx]->nse_port_arena,
517 				    (void *)(uintptr_t)ntohs(info->nat_sport),
518 				    1);
519 			}
520 			ILB_STICKY_REFRELE(s);
521 		}
522 		return (ENOMEM);
523 	}
524 
525 	connp->conn_l4 = rule->ir_proto;
526 
527 	connp->conn_server = server;
528 	ILB_SERVER_REFHOLD(server);
529 	connp->conn_sticky = s;
530 
531 	connp->conn_rule_cache.topo = rule->ir_topo;
532 	connp->conn_rule_cache.info = *info;
533 
534 	connp->conn_gc = B_FALSE;
535 
536 	connp->conn_expiry = rule->ir_nat_expiry;
537 	connp->conn_cr_time = ddi_get_lbolt64();
538 
539 	/* Client to server info. */
540 	connp->conn_c2s_saddr = *src;
541 	connp->conn_c2s_sport = sport;
542 	connp->conn_c2s_daddr = *dst;
543 	connp->conn_c2s_dport = dport;
544 
545 	connp->conn_c2s_atime = ddi_get_lbolt64();
546 	/* The packet ths triggers this creation should be counted */
547 	connp->conn_c2s_pkt_cnt = 1;
548 	connp->conn_c2s_tcp_fin_sent = B_FALSE;
549 	connp->conn_c2s_tcp_fin_acked = B_FALSE;
550 
551 	/* Server to client info, before NAT */
552 	switch (rule->ir_topo) {
553 	case ILB_TOPO_IMPL_HALF_NAT:
554 		connp->conn_s2c_saddr = info->nat_dst;
555 		connp->conn_s2c_sport = info->nat_dport;
556 		connp->conn_s2c_daddr = *src;
557 		connp->conn_s2c_dport = sport;
558 
559 		/* Pre-calculate checksum changes for both directions */
560 		if (rule->ir_ipver == IPPROTO_IP) {
561 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
562 			    (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
563 			    &connp->conn_c2s_ip_sum);
564 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
565 			    (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
566 			    info->nat_dport, &connp->conn_c2s_tp_sum);
567 			*ip_sum = connp->conn_c2s_ip_sum;
568 			*tp_sum = connp->conn_c2s_tp_sum;
569 
570 			hnat_cksum_v4(
571 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
572 			    (uint16_t *)&dst->s6_addr32[3], 0, 0,
573 			    &connp->conn_s2c_ip_sum);
574 			hnat_cksum_v4(
575 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
576 			    (uint16_t *)&dst->s6_addr32[3],
577 			    info->nat_dport, dport,
578 			    &connp->conn_s2c_tp_sum);
579 		} else {
580 			connp->conn_c2s_ip_sum = 0;
581 			hnat_cksum_v6((uint16_t *)dst,
582 			    (uint16_t *)&info->nat_dst, dport,
583 			    info->nat_dport, &connp->conn_c2s_tp_sum);
584 			*ip_sum = 0;
585 			*tp_sum = connp->conn_c2s_tp_sum;
586 
587 			connp->conn_s2c_ip_sum = 0;
588 			hnat_cksum_v6((uint16_t *)&info->nat_dst,
589 			    (uint16_t *)dst, info->nat_dport, dport,
590 			    &connp->conn_s2c_tp_sum);
591 		}
592 		break;
593 	case ILB_TOPO_IMPL_NAT:
594 		connp->conn_s2c_saddr = info->nat_dst;
595 		connp->conn_s2c_sport = info->nat_dport;
596 		connp->conn_s2c_daddr = info->nat_src;
597 		connp->conn_s2c_dport = info->nat_sport;
598 
599 		if (rule->ir_ipver == IPPROTO_IP) {
600 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
601 			    (uint16_t *)&dst->s6_addr32[3],
602 			    (uint16_t *)&info->nat_src.s6_addr32[3],
603 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
604 			    0, 0, 0, 0, &connp->conn_c2s_ip_sum);
605 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
606 			    (uint16_t *)&dst->s6_addr32[3],
607 			    (uint16_t *)&info->nat_src.s6_addr32[3],
608 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
609 			    sport, dport, info->nat_sport,
610 			    info->nat_dport, &connp->conn_c2s_tp_sum);
611 			*ip_sum = connp->conn_c2s_ip_sum;
612 			*tp_sum = connp->conn_c2s_tp_sum;
613 
614 			fnat_cksum_v4(
615 			    (uint16_t *)&info->nat_src.s6_addr32[3],
616 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
617 			    (uint16_t *)&src->s6_addr32[3],
618 			    (uint16_t *)&dst->s6_addr32[3],
619 			    0, 0, 0, 0, &connp->conn_s2c_ip_sum);
620 			fnat_cksum_v4(
621 			    (uint16_t *)&info->nat_src.s6_addr32[3],
622 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
623 			    (uint16_t *)&src->s6_addr32[3],
624 			    (uint16_t *)&dst->s6_addr32[3],
625 			    info->nat_sport, info->nat_dport,
626 			    sport, dport, &connp->conn_s2c_tp_sum);
627 		} else {
628 			fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
629 			    (uint16_t *)&info->nat_src,
630 			    (uint16_t *)&info->nat_dst,
631 			    sport, dport, info->nat_sport,
632 			    info->nat_dport, &connp->conn_c2s_tp_sum);
633 			connp->conn_c2s_ip_sum = 0;
634 			*ip_sum = 0;
635 			*tp_sum = connp->conn_c2s_tp_sum;
636 
637 			fnat_cksum_v6((uint16_t *)&info->nat_src,
638 			    (uint16_t *)&info->nat_dst, (uint16_t *)src,
639 			    (uint16_t *)dst, info->nat_sport,
640 			    info->nat_dport, sport, dport,
641 			    &connp->conn_s2c_tp_sum);
642 			connp->conn_s2c_ip_sum = 0;
643 		}
644 		break;
645 	}
646 
647 	connp->conn_s2c_atime = ddi_get_lbolt64();
648 	connp->conn_s2c_pkt_cnt = 1;
649 	connp->conn_s2c_tcp_fin_sent = B_FALSE;
650 	connp->conn_s2c_tcp_fin_acked = B_FALSE;
651 
652 	/* Add it to the s2c hash table. */
653 	hash = ilbs->ilbs_s2c_conn_hash;
654 	i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
655 	    ntohs(connp->conn_s2c_sport),
656 	    (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
657 	    ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
658 	connp->conn_s2c_hash = &hash[i];
659 	DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
660 
661 	mutex_enter(&hash[i].ilb_conn_hash_lock);
662 	hash[i].ilb_conn_cnt++;
663 	connp->conn_s2c_next = hash[i].ilb_connp;
664 	if (hash[i].ilb_connp != NULL)
665 		hash[i].ilb_connp->conn_s2c_prev = connp;
666 	connp->conn_s2c_prev = NULL;
667 	hash[i].ilb_connp = connp;
668 	mutex_exit(&hash[i].ilb_conn_hash_lock);
669 
670 	/* Add it to the c2s hash table. */
671 	hash = ilbs->ilbs_c2s_conn_hash;
672 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
673 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
674 	    ilbs->ilbs_conn_hash_size);
675 	connp->conn_c2s_hash = &hash[i];
676 	DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
677 
678 	mutex_enter(&hash[i].ilb_conn_hash_lock);
679 	hash[i].ilb_conn_cnt++;
680 	connp->conn_c2s_next = hash[i].ilb_connp;
681 	if (hash[i].ilb_connp != NULL)
682 		hash[i].ilb_connp->conn_c2s_prev = connp;
683 	connp->conn_c2s_prev = NULL;
684 	hash[i].ilb_connp = connp;
685 	mutex_exit(&hash[i].ilb_conn_hash_lock);
686 
687 	return (0);
688 }
689 
690 /*
691  * If a connection is using TCP, we keep track of simple TCP state transition
692  * so that we know when to clean up an entry.
693  */
694 static boolean_t
695 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
696     boolean_t c2s)
697 {
698 	uint32_t ack, seq;
699 	int32_t seg_len;
700 
701 	if (tcpha->tha_flags & TH_RST)
702 		return (B_FALSE);
703 
704 	seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
705 	    TCP_HDR_LENGTH((tcph_t *)tcpha);
706 
707 	if (tcpha->tha_flags & TH_ACK)
708 		ack = ntohl(tcpha->tha_ack);
709 	seq = ntohl(tcpha->tha_seq);
710 	if (c2s) {
711 		ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
712 		if (tcpha->tha_flags & TH_FIN) {
713 			connp->conn_c2s_tcp_fss = seq + seg_len;
714 			connp->conn_c2s_tcp_fin_sent = B_TRUE;
715 		}
716 		connp->conn_c2s_tcp_ack = ack;
717 
718 		/* Port reuse by the client, restart the conn. */
719 		if (connp->conn_c2s_tcp_fin_sent &&
720 		    SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
721 			connp->conn_c2s_tcp_fin_sent = B_FALSE;
722 			connp->conn_c2s_tcp_fin_acked = B_FALSE;
723 		}
724 	} else {
725 		ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
726 		if (tcpha->tha_flags & TH_FIN) {
727 			connp->conn_s2c_tcp_fss = seq + seg_len;
728 			connp->conn_s2c_tcp_fin_sent = B_TRUE;
729 		}
730 		connp->conn_s2c_tcp_ack = ack;
731 
732 		/* Port reuse by the client, restart the conn. */
733 		if (connp->conn_s2c_tcp_fin_sent &&
734 		    SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
735 			connp->conn_s2c_tcp_fin_sent = B_FALSE;
736 			connp->conn_s2c_tcp_fin_acked = B_FALSE;
737 		}
738 	}
739 
740 	return (B_TRUE);
741 }
742 
743 /*
744  * Helper routint to find conn hash entry given some packet information and
745  * the traffic direction (c2s, client to server?)
746  */
747 static boolean_t
748 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
749     in_port_t sport, in6_addr_t *dst, in_port_t dport,
750     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
751     int32_t pkt_len, boolean_t c2s)
752 {
753 	ilb_conn_hash_t *hash;
754 	uint_t i;
755 	ilb_conn_t *connp;
756 	boolean_t tcp_alive;
757 	boolean_t ret = B_FALSE;
758 
759 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
760 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
761 	    ilbs->ilbs_conn_hash_size);
762 	if (c2s) {
763 		hash = ilbs->ilbs_c2s_conn_hash;
764 		mutex_enter(&hash[i].ilb_conn_hash_lock);
765 		for (connp = hash[i].ilb_connp; connp != NULL;
766 		    connp = connp->conn_c2s_next) {
767 			if (connp->conn_l4 == l4 &&
768 			    connp->conn_c2s_dport == dport &&
769 			    connp->conn_c2s_sport == sport &&
770 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
771 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
772 				connp->conn_c2s_atime = ddi_get_lbolt64();
773 				connp->conn_c2s_pkt_cnt++;
774 				*rule_cache = connp->conn_rule_cache;
775 				*ip_sum = connp->conn_c2s_ip_sum;
776 				*tp_sum = connp->conn_c2s_tp_sum;
777 				ret = B_TRUE;
778 				break;
779 			}
780 		}
781 	} else {
782 		hash = ilbs->ilbs_s2c_conn_hash;
783 		mutex_enter(&hash[i].ilb_conn_hash_lock);
784 		for (connp = hash[i].ilb_connp; connp != NULL;
785 		    connp = connp->conn_s2c_next) {
786 			if (connp->conn_l4 == l4 &&
787 			    connp->conn_s2c_dport == dport &&
788 			    connp->conn_s2c_sport == sport &&
789 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
790 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
791 				connp->conn_s2c_atime = ddi_get_lbolt64();
792 				connp->conn_s2c_pkt_cnt++;
793 				*rule_cache = connp->conn_rule_cache;
794 				*ip_sum = connp->conn_s2c_ip_sum;
795 				*tp_sum = connp->conn_s2c_tp_sum;
796 				ret = B_TRUE;
797 				break;
798 			}
799 		}
800 	}
801 	if (ret) {
802 		ILB_S_KSTAT(connp->conn_server, pkt_processed);
803 		ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
804 		    pkt_len);
805 
806 		switch (l4) {
807 		case (IPPROTO_TCP):
808 			tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
809 			    c2s);
810 			if (!tcp_alive) {
811 				connp->conn_gc = B_TRUE;
812 			}
813 			break;
814 		default:
815 			break;
816 		}
817 	}
818 	mutex_exit(&hash[i].ilb_conn_hash_lock);
819 
820 	return (ret);
821 }
822 
823 /*
824  * To check if a give packet matches an existing conn hash entry.  If it
825  * does, return the information about this entry so that the caller can
826  * do the proper NAT.
827  */
828 boolean_t
829 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
830     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
831     uint32_t pkt_len, in6_addr_t *lb_dst)
832 {
833 	ilb_rule_info_t rule_cache;
834 	uint32_t adj_ip_sum, adj_tp_sum;
835 	boolean_t ret;
836 
837 	/* Check the incoming hash table. */
838 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
839 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
840 		switch (rule_cache.topo) {
841 		case ILB_TOPO_IMPL_NAT:
842 			*lb_dst = rule_cache.info.nat_dst;
843 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
844 			    adj_ip_sum, adj_tp_sum, B_TRUE);
845 			ret = B_TRUE;
846 			break;
847 		case ILB_TOPO_IMPL_HALF_NAT:
848 			*lb_dst = rule_cache.info.nat_dst;
849 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
850 			    adj_ip_sum, adj_tp_sum, B_TRUE);
851 			ret = B_TRUE;
852 			break;
853 		default:
854 			ret = B_FALSE;
855 			break;
856 		}
857 		return (ret);
858 	}
859 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
860 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
861 		switch (rule_cache.topo) {
862 		case ILB_TOPO_IMPL_NAT:
863 			*lb_dst = rule_cache.info.src;
864 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
865 			    adj_ip_sum, adj_tp_sum, B_FALSE);
866 			ret = B_TRUE;
867 			break;
868 		case ILB_TOPO_IMPL_HALF_NAT:
869 			*lb_dst = *dst;
870 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
871 			    adj_ip_sum, adj_tp_sum, B_FALSE);
872 			ret = B_TRUE;
873 			break;
874 		default:
875 			ret = B_FALSE;
876 			break;
877 		}
878 		return (ret);
879 	}
880 
881 	return (B_FALSE);
882 }
883 
884 /*
885  * To check if an ICMP packet belongs to a connection in one of the conn
886  * hash entries.
887  */
888 boolean_t
889 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
890     void *icmph, in6_addr_t *lb_dst)
891 {
892 	ilb_conn_hash_t *hash;
893 	ipha_t *in_iph4;
894 	ip6_t *in_iph6;
895 	icmph_t *icmph4;
896 	icmp6_t *icmph6;
897 	in6_addr_t *in_src_p, *in_dst_p;
898 	in_port_t *sport, *dport;
899 	int l4;
900 	uint_t i;
901 	ilb_conn_t *connp;
902 	ilb_rule_info_t rule_cache;
903 	uint32_t adj_ip_sum;
904 	boolean_t full_nat;
905 
906 	if (l3 == IPPROTO_IP) {
907 		in6_addr_t in_src, in_dst;
908 
909 		icmph4 = (icmph_t *)icmph;
910 		in_iph4 = (ipha_t *)&icmph4[1];
911 
912 		if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
913 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
914 			return (B_FALSE);
915 		}
916 
917 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
918 		in_src_p = &in_src;
919 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
920 		in_dst_p = &in_dst;
921 
922 		l4 = in_iph4->ipha_protocol;
923 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
924 			return (B_FALSE);
925 
926 		sport = (in_port_t *)((char *)in_iph4 +
927 		    IPH_HDR_LENGTH(in_iph4));
928 		dport = sport + 1;
929 
930 		DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
931 		    in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
932 		    ntohs(*sport), uint16_t, ntohs(*dport));
933 	} else {
934 		ASSERT(l3 == IPPROTO_IPV6);
935 
936 		icmph6 = (icmp6_t *)icmph;
937 		in_iph6 = (ip6_t *)&icmph6[1];
938 		in_src_p = &in_iph6->ip6_src;
939 		in_dst_p = &in_iph6->ip6_dst;
940 
941 		if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
942 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
943 			return (B_FALSE);
944 		}
945 
946 		l4 = in_iph6->ip6_nxt;
947 		/* We don't go deep inside an IPv6 packet yet. */
948 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
949 			return (B_FALSE);
950 
951 		sport = (in_port_t *)&in_iph6[1];
952 		dport = sport + 1;
953 
954 		DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
955 		    &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
956 		    uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
957 	}
958 
959 	i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
960 	    (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
961 	    ilbs->ilbs_conn_hash_size);
962 	hash = ilbs->ilbs_c2s_conn_hash;
963 
964 	mutex_enter(&hash[i].ilb_conn_hash_lock);
965 	for (connp = hash[i].ilb_connp; connp != NULL;
966 	    connp = connp->conn_c2s_next) {
967 		if (connp->conn_l4 == l4 &&
968 		    connp->conn_c2s_dport == *sport &&
969 		    connp->conn_c2s_sport == *dport &&
970 		    IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
971 		    IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
972 			connp->conn_c2s_atime = ddi_get_lbolt64();
973 			connp->conn_c2s_pkt_cnt++;
974 			rule_cache = connp->conn_rule_cache;
975 			adj_ip_sum = connp->conn_c2s_ip_sum;
976 			break;
977 		}
978 	}
979 	mutex_exit(&hash[i].ilb_conn_hash_lock);
980 
981 	if (connp == NULL) {
982 		DTRACE_PROBE(ilb__chk__icmp__conn__failed);
983 		return (B_FALSE);
984 	}
985 
986 	switch (rule_cache.topo) {
987 	case ILB_TOPO_IMPL_NAT:
988 		full_nat = B_TRUE;
989 		break;
990 	case ILB_TOPO_IMPL_HALF_NAT:
991 		full_nat = B_FALSE;
992 		break;
993 	default:
994 		return (B_FALSE);
995 	}
996 
997 	*lb_dst = rule_cache.info.nat_dst;
998 	if (l3 == IPPROTO_IP) {
999 		ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
1000 		    &rule_cache.info, adj_ip_sum, full_nat);
1001 	} else {
1002 		ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1003 		    &rule_cache.info, full_nat);
1004 	}
1005 	return (B_TRUE);
1006 }
1007 
/*
 * This routine sends up the conn hash table to user land.  Note that the
 * request is an ioctl, hence we cannot really differentiate requests
 * from different clients.  There is no context shared between different
 * ioctls.  Here we make the assumption that the user land ilbd will
 * only allow one client to show the conn hash table at any time.
 * Otherwise, the results will be "very" inconsistent.
 *
 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates that the client wants
 * to read from the beginning of the table.  After a certain number of
 * entries are reported, the kernel remembers the position where it
 * stopped.  When the next ioctl comes in with the ILB_LIST_CONT flag,
 * it will return entries starting from where it left off.  When
 * the end of the table is reached, a flag (ILB_LIST_END) is set to tell
 * the client that there are no more entries.
 *
 * It is assumed that the caller has checked the size of nat so that it
 * can hold num entries.
 */
1027 /* ARGSUSED */
1028 int
1029 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1030     uint32_t *num, uint32_t *flags)
1031 {
1032 	ilb_conn_hash_t *hash;
1033 	ilb_conn_t *cur_connp;
1034 	uint32_t i, j;
1035 	int ret = 0;
1036 
1037 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1038 	while (ilbs->ilbs_conn_list_busy) {
1039 		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1040 		    &ilbs->ilbs_conn_list_lock) == 0) {
1041 			mutex_exit(&ilbs->ilbs_conn_list_lock);
1042 			return (EINTR);
1043 		}
1044 	}
1045 	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1046 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1047 		mutex_exit(&ilbs->ilbs_conn_list_lock);
1048 		*num = 0;
1049 		*flags |= ILB_LIST_END;
1050 		return (0);
1051 	}
1052 	ilbs->ilbs_conn_list_busy = B_TRUE;
1053 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1054 
1055 	if (*flags & ILB_LIST_BEGIN) {
1056 		i = 0;
1057 		mutex_enter(&hash[0].ilb_conn_hash_lock);
1058 		cur_connp = hash[0].ilb_connp;
1059 	} else if (*flags & ILB_LIST_CONT) {
1060 		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1061 			*num = 0;
1062 			*flags |= ILB_LIST_END;
1063 			goto done;
1064 		}
1065 		i = ilbs->ilbs_conn_list_cur;
1066 		mutex_enter(&hash[i].ilb_conn_hash_lock);
1067 		cur_connp = ilbs->ilbs_conn_list_connp;
1068 	} else {
1069 		ret = EINVAL;
1070 		goto done;
1071 	}
1072 
1073 	j = 0;
1074 	while (j < *num) {
1075 		if (cur_connp == NULL) {
1076 			mutex_exit(&hash[i].ilb_conn_hash_lock);
1077 			if (++i == ilbs->ilbs_conn_hash_size) {
1078 				*flags |= ILB_LIST_END;
1079 				break;
1080 			}
1081 			mutex_enter(&hash[i].ilb_conn_hash_lock);
1082 			cur_connp = hash[i].ilb_connp;
1083 			continue;
1084 		}
1085 		nat[j].proto = cur_connp->conn_l4;
1086 
1087 		nat[j].in_global = cur_connp->conn_c2s_daddr;
1088 		nat[j].in_global_port = cur_connp->conn_c2s_dport;
1089 		nat[j].out_global = cur_connp->conn_c2s_saddr;
1090 		nat[j].out_global_port = cur_connp->conn_c2s_sport;
1091 
1092 		nat[j].in_local = cur_connp->conn_s2c_saddr;
1093 		nat[j].in_local_port = cur_connp->conn_s2c_sport;
1094 		nat[j].out_local = cur_connp->conn_s2c_daddr;
1095 		nat[j].out_local_port = cur_connp->conn_s2c_dport;
1096 
1097 		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1098 		nat[j].last_access_time =
1099 		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1100 
1101 		/*
1102 		 * The conn_s2c_pkt_cnt may not be accurate since we are not
1103 		 * holding the s2c hash lock.
1104 		 */
1105 		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1106 		    cur_connp->conn_s2c_pkt_cnt;
1107 		j++;
1108 
1109 		cur_connp = cur_connp->conn_c2s_next;
1110 	}
1111 	ilbs->ilbs_conn_list_connp = cur_connp;
1112 	if (j == *num)
1113 		mutex_exit(&hash[i].ilb_conn_hash_lock);
1114 
1115 	ilbs->ilbs_conn_list_cur = i;
1116 
1117 	*num = j;
1118 done:
1119 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1120 	ilbs->ilbs_conn_list_busy = B_FALSE;
1121 	cv_signal(&ilbs->ilbs_conn_list_cv);
1122 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1123 
1124 	return (ret);
1125 }
1126 
1127 
1128 /*
1129  * Stickiness (persistence) handling routines.
1130  */
1131 
1132 
1133 static void
1134 ilb_sticky_cache_init(void)
1135 {
1136 	ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1137 	    sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1138 	    ilb_kmem_flags);
1139 }
1140 
1141 void
1142 ilb_sticky_cache_fini(void)
1143 {
1144 	if (ilb_sticky_cache != NULL) {
1145 		kmem_cache_destroy(ilb_sticky_cache);
1146 		ilb_sticky_cache = NULL;
1147 	}
1148 }
1149 
1150 void
1151 ilb_sticky_refrele(ilb_sticky_t *s)
1152 {
1153 	ILB_STICKY_REFRELE(s);
1154 }
1155 
1156 static ilb_sticky_t *
1157 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1158 {
1159 	ilb_sticky_t *s;
1160 
1161 	ASSERT(mutex_owned(&hash->sticky_lock));
1162 
1163 	for (s = list_head(&hash->sticky_head); s != NULL;
1164 	    s = list_next(&hash->sticky_head, s)) {
1165 		if (s->rule_instance == rule->ir_ks_instance) {
1166 			if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1167 				return (s);
1168 		}
1169 	}
1170 	return (NULL);
1171 }
1172 
1173 static ilb_sticky_t *
1174 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1175     in6_addr_t *src)
1176 {
1177 	ilb_sticky_t *s;
1178 
1179 	ASSERT(mutex_owned(&hash->sticky_lock));
1180 
1181 	if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1182 		return (NULL);
1183 
1184 	/*
1185 	 * The rule instance is for handling the scenario when the same
1186 	 * client talks to different rules at the same time.  Stickiness
1187 	 * is per rule so we can use the rule instance to differentiate
1188 	 * the client's request.
1189 	 */
1190 	s->rule_instance = rule->ir_ks_instance;
1191 	/*
1192 	 * Copy the rule name for listing all sticky cache entry.  ir_name
1193 	 * is guaranteed to be NULL terminated.
1194 	 */
1195 	(void) strcpy(s->rule_name, rule->ir_name);
1196 	s->server = server;
1197 
1198 	/*
1199 	 * Grab a ref cnt on the server so that it won't go away while
1200 	 * it is still in the sticky table.
1201 	 */
1202 	ILB_SERVER_REFHOLD(server);
1203 	s->src = *src;
1204 	s->expiry = rule->ir_sticky_expiry;
1205 	s->refcnt = 1;
1206 	s->hash = hash;
1207 
1208 	/*
1209 	 * There is no need to set atime here since the refcnt is not
1210 	 * zero.  A sticky entry is removed only when the refcnt is
1211 	 * zero.  But just set it here for debugging purpose.  The
1212 	 * atime is set when a refrele is done on a sticky entry.
1213 	 */
1214 	s->atime = ddi_get_lbolt64();
1215 
1216 	list_insert_head(&hash->sticky_head, s);
1217 	hash->sticky_cnt++;
1218 	return (s);
1219 }
1220 
1221 /*
1222  * This routine checks if there is an existing sticky entry which matches
1223  * a given packet.  If there is one, return it.  If there is not, create
1224  * a sticky entry using the packet's info.
1225  */
1226 ilb_server_t *
1227 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1228     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1229 {
1230 	int i;
1231 	ilb_sticky_hash_t *hash;
1232 	ilb_sticky_t *s;
1233 
1234 	ASSERT(server != NULL);
1235 
1236 	*res = NULL;
1237 
1238 	i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1239 	    (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1240 	hash = &ilbs->ilbs_sticky_hash[i];
1241 
1242 	/* First check if there is already an entry. */
1243 	mutex_enter(&hash->sticky_lock);
1244 	s = ilb_sticky_lookup(hash, rule, src);
1245 
1246 	/* No sticky entry, add one. */
1247 	if (s == NULL) {
1248 add_new_entry:
1249 		s = ilb_sticky_add(hash, rule, server, src);
1250 		if (s == NULL) {
1251 			mutex_exit(&hash->sticky_lock);
1252 			return (NULL);
1253 		}
1254 		/*
1255 		 * Find a source for this server.  All subseqent requests from
1256 		 * the same client matching this sticky entry will use this
1257 		 * source address in doing NAT.  The current algorithm is
1258 		 * simple, rotate the source address.  Note that the
1259 		 * source address array does not change after it's created, so
1260 		 * it is OK to just increment the cur index.
1261 		 */
1262 		if (server->iser_nat_src != NULL) {
1263 			/* It is a hint, does not need to be atomic. */
1264 			*src_ent_idx = (server->iser_nat_src->cur++ %
1265 			    server->iser_nat_src->num_src);
1266 			s->nat_src_idx = *src_ent_idx;
1267 		}
1268 		mutex_exit(&hash->sticky_lock);
1269 		*res = s;
1270 		return (server);
1271 	}
1272 
1273 	/*
1274 	 * We don't hold any lock accessing iser_enabled.  Refer to the
1275 	 * comment in ilb_server_add() about iser_lock.
1276 	 */
1277 	if (!s->server->iser_enabled) {
1278 		/*
1279 		 * s->server == server can only happen if there is a race in
1280 		 * toggling the iser_enabled flag (we don't hold a lock doing
1281 		 * that) so that the load balance algorithm still returns a
1282 		 * disabled server.  In this case, just drop the packet...
1283 		 */
1284 		if (s->server == server) {
1285 			mutex_exit(&hash->sticky_lock);
1286 			return (NULL);
1287 		}
1288 
1289 		/*
1290 		 * The old server is disabled and there is a new server, use
1291 		 * the new one to create a sticky entry.  Since we will
1292 		 * add the entry at the beginning, subsequent lookup will
1293 		 * find this new entry instead of the old one.
1294 		 */
1295 		goto add_new_entry;
1296 	}
1297 
1298 	s->refcnt++;
1299 	*res = s;
1300 	mutex_exit(&hash->sticky_lock);
1301 	if (server->iser_nat_src != NULL)
1302 		*src_ent_idx = s->nat_src_idx;
1303 	return (s->server);
1304 }
1305 
1306 static void
1307 ilb_sticky_cleanup(void *arg)
1308 {
1309 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1310 	uint32_t i;
1311 	ilb_stack_t *ilbs;
1312 	ilb_sticky_hash_t *hash;
1313 	ilb_sticky_t *s, *nxt_s;
1314 	int64_t now, expiry;
1315 
1316 	ilbs = timer->ilbs;
1317 	hash = ilbs->ilbs_sticky_hash;
1318 	ASSERT(hash != NULL);
1319 
1320 	now = ddi_get_lbolt64();
1321 	for (i = timer->start; i < timer->end; i++) {
1322 		mutex_enter(&hash[i].sticky_lock);
1323 		for (s = list_head(&hash[i].sticky_head); s != NULL;
1324 		    s = nxt_s) {
1325 			nxt_s = list_next(&hash[i].sticky_head, s);
1326 			if (s->refcnt != 0)
1327 				continue;
1328 			expiry = now - SEC_TO_TICK(s->expiry);
1329 			if (s->atime < expiry) {
1330 				ILB_SERVER_REFRELE(s->server);
1331 				list_remove(&hash[i].sticky_head, s);
1332 				kmem_cache_free(ilb_sticky_cache, s);
1333 				hash[i].sticky_cnt--;
1334 			}
1335 		}
1336 		mutex_exit(&hash[i].sticky_lock);
1337 	}
1338 }
1339 
1340 static void
1341 ilb_sticky_timer(void *arg)
1342 {
1343 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1344 
1345 	(void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1346 	    ilb_sticky_cleanup, arg, TQ_SLEEP);
1347 	mutex_enter(&timer->tid_lock);
1348 	if (timer->tid == 0) {
1349 		mutex_exit(&timer->tid_lock);
1350 	} else {
1351 		timer->tid = timeout(ilb_sticky_timer, arg,
1352 		    SEC_TO_TICK(ilb_sticky_timeout));
1353 		mutex_exit(&timer->tid_lock);
1354 	}
1355 }
1356 
1357 void
1358 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1359 {
1360 	extern pri_t minclsyspri;
1361 	int i, part;
1362 	char tq_name[TASKQ_NAMELEN];
1363 	ilb_timer_t *tm;
1364 
1365 	if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1366 		for (i = 0; i < 31; i++) {
1367 			if (ilbs->ilbs_sticky_hash_size < (1 << i))
1368 				break;
1369 		}
1370 		ilbs->ilbs_sticky_hash_size = 1 << i;
1371 	}
1372 
1373 	ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1374 	    ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1375 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1376 		mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1377 		    MUTEX_DEFAULT, NULL);
1378 		list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1379 		    sizeof (ilb_sticky_t),
1380 		    offsetof(ilb_sticky_t, list));
1381 	}
1382 
1383 	if (ilb_sticky_cache == NULL)
1384 		ilb_sticky_cache_init();
1385 
1386 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1387 	    (void *)ilbs->ilbs_netstack);
1388 	ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1389 	ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1390 	    ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1391 	    ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1392 
1393 	ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1394 	ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1395 	    ilb_sticky_timer_size, KM_SLEEP);
1396 	part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1397 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1398 		tm = ilbs->ilbs_sticky_timer_list + i;
1399 		tm->start = i * part;
1400 		tm->end = i * part + part;
1401 		if (tm->end > ilbs->ilbs_sticky_hash_size)
1402 			tm->end = ilbs->ilbs_sticky_hash_size;
1403 		tm->ilbs = ilbs;
1404 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1405 		/* Spread out the starting execution time of all the timers. */
1406 		tm->tid = timeout(ilb_sticky_timer, tm,
1407 		    SEC_TO_TICK(ilb_sticky_timeout + i));
1408 	}
1409 }
1410 
1411 void
1412 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1413 {
1414 	int i;
1415 	ilb_sticky_t *s;
1416 
1417 	if (ilbs->ilbs_sticky_hash == NULL)
1418 		return;
1419 
1420 	/* Stop all the timers first. */
1421 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1422 		timeout_id_t tid;
1423 
1424 		/* Setting tid to 0 tells the timer handler not to restart. */
1425 		mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1426 		tid = ilbs->ilbs_sticky_timer_list[i].tid;
1427 		ilbs->ilbs_sticky_timer_list[i].tid = 0;
1428 		mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1429 		(void) untimeout(tid);
1430 	}
1431 	kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1432 	    ilb_sticky_timer_size);
1433 	taskq_destroy(ilbs->ilbs_sticky_taskq);
1434 	ilbs->ilbs_sticky_taskq = NULL;
1435 
1436 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1437 		while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1438 		    != NULL) {
1439 			list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1440 			ILB_SERVER_REFRELE(s->server);
1441 			kmem_free(s, sizeof (ilb_sticky_t));
1442 		}
1443 	}
1444 	kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1445 	    sizeof (ilb_sticky_hash_t));
1446 }
1447 
1448 /*
1449  * This routine sends up the sticky hash table to user land.  Refer to
1450  * the comments before ilb_list_nat().  Both routines assume similar
1451  * conditions.
1452  *
1453  * It is assumed that the caller has checked the size of st so that it
1454  * can hold num entries.
1455  */
1456 /* ARGSUSED */
1457 int
1458 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1459     uint32_t *num, uint32_t *flags)
1460 {
1461 	ilb_sticky_hash_t *hash;
1462 	ilb_sticky_t *curp;
1463 	uint32_t i, j;
1464 	int ret = 0;
1465 
1466 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1467 	while (ilbs->ilbs_sticky_list_busy) {
1468 		if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1469 		    &ilbs->ilbs_sticky_list_lock) == 0) {
1470 			mutex_exit(&ilbs->ilbs_sticky_list_lock);
1471 			return (EINTR);
1472 		}
1473 	}
1474 	if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1475 		mutex_exit(&ilbs->ilbs_sticky_list_lock);
1476 		*num = 0;
1477 		*flags |= ILB_LIST_END;
1478 		return (0);
1479 	}
1480 	ilbs->ilbs_sticky_list_busy = B_TRUE;
1481 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1482 
1483 	if (*flags & ILB_LIST_BEGIN) {
1484 		i = 0;
1485 		mutex_enter(&hash[0].sticky_lock);
1486 		curp = list_head(&hash[0].sticky_head);
1487 	} else if (*flags & ILB_LIST_CONT) {
1488 		if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1489 			*num = 0;
1490 			*flags |= ILB_LIST_END;
1491 			goto done;
1492 		}
1493 		i = ilbs->ilbs_sticky_list_cur;
1494 		mutex_enter(&hash[i].sticky_lock);
1495 		curp = ilbs->ilbs_sticky_list_curp;
1496 	} else {
1497 		ret = EINVAL;
1498 		goto done;
1499 	}
1500 
1501 	j = 0;
1502 	while (j < *num) {
1503 		if (curp == NULL) {
1504 			mutex_exit(&hash[i].sticky_lock);
1505 			if (++i == ilbs->ilbs_sticky_hash_size) {
1506 				*flags |= ILB_LIST_END;
1507 				break;
1508 			}
1509 			mutex_enter(&hash[i].sticky_lock);
1510 			curp = list_head(&hash[i].sticky_head);
1511 			continue;
1512 		}
1513 		(void) strcpy(st[j].rule_name, curp->rule_name);
1514 		st[j].req_addr = curp->src;
1515 		st[j].srv_addr = curp->server->iser_addr_v6;
1516 		st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1517 		j++;
1518 		curp = list_next(&hash[i].sticky_head, curp);
1519 	}
1520 	ilbs->ilbs_sticky_list_curp = curp;
1521 	if (j == *num)
1522 		mutex_exit(&hash[i].sticky_lock);
1523 
1524 	ilbs->ilbs_sticky_list_cur = i;
1525 
1526 	*num = j;
1527 done:
1528 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1529 	ilbs->ilbs_sticky_list_busy = B_FALSE;
1530 	cv_signal(&ilbs->ilbs_sticky_list_cv);
1531 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1532 
1533 	return (ret);
1534 }
1535