xref: /titanic_52/usr/src/uts/common/inet/ilb/ilb_conn.c (revision 69112edd987c28fa551d4f8d9362a84a45365f17)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/conf.h>
29 #include <sys/time.h>
30 #include <sys/taskq.h>
31 #include <sys/cmn_err.h>
32 #include <sys/sdt.h>
33 #include <sys/atomic.h>
34 #include <netinet/in.h>
35 #include <inet/ip.h>
36 #include <inet/ip6.h>
37 #include <inet/tcp.h>
38 #include <inet/udp_impl.h>
39 #include <inet/ilb.h>
40 
41 #include "ilb_stack.h"
42 #include "ilb_impl.h"
43 #include "ilb_conn.h"
44 #include "ilb_nat.h"
45 
46 /*
47  * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
48  *
49  * start: starting index into the hash table to do gc
50  * end: ending index into the hash table to do gc
51  * ilbs: pointer to the ilb_stack_t of the IP stack
52  * tid_lock: mutex to protect the timer id.
53  * tid: timer id of the timer
54  */
55 typedef struct ilb_timer_s {
56 	uint32_t	start;
57 	uint32_t	end;
58 	ilb_stack_t	*ilbs;
59 	kmutex_t	tid_lock;
60 	timeout_id_t	tid;
61 } ilb_timer_t;
62 
/*
 * Hash macro for finding the index to the conn hash table.  saddr and
 * daddr are each read as four consecutive elements, which are mixed with
 * the two ports.  The result is masked with (hash_size - 1), so hash_size
 * is expected to be a power of 2.
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	((((saddr)[3] ^ (daddr)[3]) * 50653 +			\
	((saddr)[2] ^ (daddr)[2]) * 1369 +			\
	((saddr)[1] ^ (daddr)[1]) * 37 +			\
	((saddr)[0] ^ (daddr)[0]) +				\
	(sport) * 37 + (dport)) & ((hash_size) - 1))
70 
/* Kmem cache from which the conn hash entries are allocated */
static struct kmem_cache *ilb_conn_cache;

/*
 * Number of timers running to do conn cache garbage collection.  With 60
 * timers, each gc run is responsible for 1/60 of the conn hash table.
 */
static int ilb_conn_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;
82 
/*
 * Hash macro for finding the index to the sticky hash table.  saddr is
 * read as four consecutive elements, each mixed with a shifted copy of
 * rule.  The result is masked with (hash_size - 1), so hash_size is
 * expected to be a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	((((saddr)[3] ^ ((rule) >> 24)) * 29791 +		\
	((saddr)[2] ^ ((rule) >> 16)) * 961 +			\
	((saddr)[1] ^ ((rule) >> 8)) * 31 +			\
	((saddr)[0] ^ (rule))) & ((hash_size) - 1))
88 
/* Kmem cache for the sticky hash entries */
static struct kmem_cache *ilb_sticky_cache;

/*
 * Number of timers running to do sticky cache garbage collection.  With 60
 * timers, each gc run is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
99 
/*
 * Drop one reference on sticky entry s and stamp its access time with the
 * current lbolt, both while holding the sticky_lock of the entry's hash
 * bucket.
 *
 * The argument is parenthesized at every use so that any pointer
 * expression can be passed, and the body is wrapped in do { } while (0)
 * so that the macro behaves as a single statement (e.g. in if/else).
 */
#define	ILB_STICKY_REFRELE(s)				\
do {							\
	mutex_enter(&(s)->hash->sticky_lock);		\
	(s)->refcnt--;					\
	(s)->atime = ddi_get_lbolt64();			\
	mutex_exit(&(s)->hash->sticky_lock);		\
} while (0)
107 
108 
109 static void
110 ilb_conn_cache_init(void)
111 {
112 	ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
113 	    sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
114 	    ilb_kmem_flags);
115 }
116 
117 void
118 ilb_conn_cache_fini(void)
119 {
120 	if (ilb_conn_cache != NULL) {
121 		kmem_cache_destroy(ilb_conn_cache);
122 		ilb_conn_cache = NULL;
123 	}
124 }
125 
126 static void
127 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
128 {
129 	ilb_conn_hash_t *hash;
130 	ilb_conn_t **next, **prev;
131 	ilb_conn_t **next_prev, **prev_next;
132 
133 	if (c2s) {
134 		hash = connp->conn_c2s_hash;
135 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
136 		next = &connp->conn_c2s_next;
137 		prev = &connp->conn_c2s_prev;
138 		if (*next != NULL)
139 			next_prev = &(*next)->conn_c2s_prev;
140 		if (*prev != NULL)
141 			prev_next = &(*prev)->conn_c2s_next;
142 	} else {
143 		hash = connp->conn_s2c_hash;
144 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
145 		next = &connp->conn_s2c_next;
146 		prev = &connp->conn_s2c_prev;
147 		if (*next != NULL)
148 			next_prev = &(*next)->conn_s2c_prev;
149 		if (*prev != NULL)
150 			prev_next = &(*prev)->conn_s2c_next;
151 	}
152 
153 	if (hash->ilb_connp == connp) {
154 		hash->ilb_connp = *next;
155 		if (*next != NULL)
156 			*next_prev = NULL;
157 	} else {
158 		if (*prev != NULL)
159 			*prev_next = *next;
160 		if (*next != NULL)
161 			*next_prev = *prev;
162 	}
163 	ASSERT(hash->ilb_conn_cnt > 0);
164 	hash->ilb_conn_cnt--;
165 
166 	*next = NULL;
167 	*prev = NULL;
168 }
169 
170 static void
171 ilb_conn_remove(ilb_conn_t *connp)
172 {
173 	ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
174 	ilb_conn_remove_common(connp, B_TRUE);
175 	ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
176 	ilb_conn_remove_common(connp, B_FALSE);
177 
178 	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
179 		in_port_t port;
180 
181 		port = ntohs(connp->conn_rule_cache.info.nat_sport);
182 		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
183 		    (void *)(uintptr_t)port, 1);
184 	}
185 
186 	if (connp->conn_sticky != NULL)
187 		ILB_STICKY_REFRELE(connp->conn_sticky);
188 	ILB_SERVER_REFRELE(connp->conn_server);
189 	kmem_cache_free(ilb_conn_cache, connp);
190 }
191 
192 /*
193  * Routine to do periodic garbage collection of conn hash entries.  When
194  * a conn hash timer fires, it dispatches a taskq to call this function
195  * to do the gc.  Note that each taskq is responisble for a portion of
196  * the table.  The portion is stored in timer->start, timer->end.
197  */
198 static void
199 ilb_conn_cleanup(void *arg)
200 {
201 	ilb_timer_t *timer = (ilb_timer_t *)arg;
202 	uint32_t i;
203 	ilb_stack_t *ilbs;
204 	ilb_conn_hash_t *c2s_hash, *s2c_hash;
205 	ilb_conn_t *connp, *nxt_connp;
206 	int64_t now;
207 	int64_t expiry;
208 	boolean_t die_now;
209 
210 	ilbs = timer->ilbs;
211 	c2s_hash = ilbs->ilbs_c2s_conn_hash;
212 	ASSERT(c2s_hash != NULL);
213 
214 	now = ddi_get_lbolt64();
215 	for (i = timer->start; i < timer->end; i++) {
216 		mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
217 		if ((connp = c2s_hash[i].ilb_connp) == NULL) {
218 			ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
219 			mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
220 			continue;
221 		}
222 		do {
223 			ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
224 			ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
225 			nxt_connp = connp->conn_c2s_next;
226 			expiry = now - SEC_TO_TICK(connp->conn_expiry);
227 			if (connp->conn_server->iser_die_time != 0 &&
228 			    connp->conn_server->iser_die_time < now)
229 				die_now = B_TRUE;
230 			else
231 				die_now = B_FALSE;
232 			s2c_hash = connp->conn_s2c_hash;
233 			mutex_enter(&s2c_hash->ilb_conn_hash_lock);
234 
235 			if (connp->conn_gc || die_now ||
236 			    (connp->conn_c2s_atime < expiry &&
237 			    connp->conn_s2c_atime < expiry)) {
238 				/* Need to update the nat list cur_connp */
239 				if (connp == ilbs->ilbs_conn_list_connp) {
240 					ilbs->ilbs_conn_list_connp =
241 					    connp->conn_c2s_next;
242 				}
243 				ilb_conn_remove(connp);
244 				goto nxt_connp;
245 			}
246 
247 			if (connp->conn_l4 != IPPROTO_TCP)
248 				goto nxt_connp;
249 
250 			/* Update and check TCP related conn info */
251 			if (connp->conn_c2s_tcp_fin_sent &&
252 			    SEQ_GT(connp->conn_s2c_tcp_ack,
253 			    connp->conn_c2s_tcp_fss)) {
254 				connp->conn_c2s_tcp_fin_acked = B_TRUE;
255 			}
256 			if (connp->conn_s2c_tcp_fin_sent &&
257 			    SEQ_GT(connp->conn_c2s_tcp_ack,
258 			    connp->conn_s2c_tcp_fss)) {
259 				connp->conn_s2c_tcp_fin_acked = B_TRUE;
260 			}
261 			if (connp->conn_c2s_tcp_fin_acked &&
262 			    connp->conn_s2c_tcp_fin_acked) {
263 				ilb_conn_remove(connp);
264 			}
265 nxt_connp:
266 			mutex_exit(&s2c_hash->ilb_conn_hash_lock);
267 			connp = nxt_connp;
268 		} while (connp != NULL);
269 		mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
270 	}
271 }
272 
273 /* Conn hash timer routine.  It dispatches a taskq and restart the timer */
274 static void
275 ilb_conn_timer(void *arg)
276 {
277 	ilb_timer_t *timer = (ilb_timer_t *)arg;
278 
279 	(void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
280 	    arg, TQ_SLEEP);
281 	mutex_enter(&timer->tid_lock);
282 	if (timer->tid == 0) {
283 		mutex_exit(&timer->tid_lock);
284 	} else {
285 		timer->tid = timeout(ilb_conn_timer, arg,
286 		    SEC_TO_TICK(ilb_conn_cache_timeout));
287 		mutex_exit(&timer->tid_lock);
288 	}
289 }
290 
291 void
292 ilb_conn_hash_init(ilb_stack_t *ilbs)
293 {
294 	extern pri_t minclsyspri;
295 	int i, part;
296 	ilb_timer_t *tm;
297 	char tq_name[TASKQ_NAMELEN];
298 
299 	/*
300 	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
301 	 * the next power of 2.
302 	 */
303 	if (ilbs->ilbs_conn_hash_size & (ilbs->ilbs_conn_hash_size - 1)) {
304 		for (i = 0; i < 31; i++) {
305 			if (ilbs->ilbs_conn_hash_size < (1 << i))
306 				break;
307 		}
308 		ilbs->ilbs_conn_hash_size = 1 << i;
309 	}
310 
311 	/*
312 	 * Can sleep since this should be called when a rule is being added,
313 	 * hence we are not in interrupt context.
314 	 */
315 	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
316 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
317 	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
318 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
319 
320 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
321 		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
322 		    NULL, MUTEX_DEFAULT, NULL);
323 	}
324 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
325 		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
326 		    NULL, MUTEX_DEFAULT, NULL);
327 	}
328 
329 	if (ilb_conn_cache == NULL)
330 		ilb_conn_cache_init();
331 
332 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
333 	    (void *)ilbs->ilbs_netstack);
334 	ASSERT(ilbs->ilbs_conn_taskq == NULL);
335 	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
336 	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
337 	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
338 
339 	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
340 	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
341 	    ilb_conn_timer_size, KM_SLEEP);
342 
343 	/*
344 	 * The hash table is divided in equal partition for those timers
345 	 * to do garbage collection.
346 	 */
347 	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
348 	for (i = 0; i < ilb_conn_timer_size; i++) {
349 		tm = ilbs->ilbs_conn_timer_list + i;
350 		tm->start = i * part;
351 		tm->end = i * part + part;
352 		if (tm->end > ilbs->ilbs_conn_hash_size)
353 			tm->end = ilbs->ilbs_conn_hash_size;
354 		tm->ilbs = ilbs;
355 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
356 		/* Spread out the starting execution time of all the timers. */
357 		tm->tid = timeout(ilb_conn_timer, tm,
358 		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
359 	}
360 }
361 
362 void
363 ilb_conn_hash_fini(ilb_stack_t *ilbs)
364 {
365 	uint32_t i;
366 	ilb_conn_t *connp;
367 
368 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
369 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
370 		return;
371 	}
372 
373 	/* Stop all the timers first. */
374 	for (i = 0; i < ilb_conn_timer_size; i++) {
375 		timeout_id_t tid;
376 
377 		/* Setting tid to 0 tells the timer handler not to restart. */
378 		mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
379 		tid = ilbs->ilbs_conn_timer_list[i].tid;
380 		ilbs->ilbs_conn_timer_list[i].tid = 0;
381 		mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
382 		(void) untimeout(tid);
383 	}
384 	kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
385 	    ilb_conn_timer_size);
386 	taskq_destroy(ilbs->ilbs_conn_taskq);
387 	ilbs->ilbs_conn_taskq = NULL;
388 
389 	/* Then remove all the conns. */
390 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
391 		while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
392 			ilbs->ilbs_s2c_conn_hash->ilb_connp =
393 			    connp->conn_s2c_next;
394 			ILB_SERVER_REFRELE(connp->conn_server);
395 			if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
396 				ilb_nat_src_entry_t *ent;
397 				in_port_t port;
398 
399 				/*
400 				 * src_ent will be freed in ilb_nat_src_fini().
401 				 */
402 				port = ntohs(
403 				    connp->conn_rule_cache.info.nat_sport);
404 				ent = connp->conn_rule_cache.info.src_ent;
405 				vmem_free(ent->nse_port_arena,
406 				    (void *)(uintptr_t)port, 1);
407 			}
408 			kmem_cache_free(ilb_conn_cache, connp);
409 		}
410 	}
411 	kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
412 	    ilbs->ilbs_conn_hash_size);
413 	kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
414 	    ilbs->ilbs_conn_hash_size);
415 }
416 
417 /*
418  * Internet checksum adjustment calculation routines.  We pre-calculate
419  * checksum adjustment so that we don't need to compute the checksum on
420  * the whole packet when we change address/port in the packet.
421  */
422 
/*
 * Half NAT checksum adjustment for IPv4.  oaddr and naddr each point to
 * the two 16-bit words of an IPv4 address.  The value stored in *adj_sum
 * is the 16-bit one's complement of the folded sum of the old address and
 * old port, plus the (unfolded) sum of the new address and new port.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum = oaddr[0] + oaddr[1] + old_port;
	uint32_t new_sum = naddr[0] + naddr[1] + new_port;

	/* Fold the carries back in until the old sum fits in 16 bits. */
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
434 
/*
 * Half NAT checksum adjustment for IPv6.  oaddr and naddr each point to
 * the eight 16-bit words of an IPv6 address.  The value stored in *adj_sum
 * is the 16-bit one's complement of the folded sum of the old address and
 * old port, plus the (unfolded) sum of the new address and new port.
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum = old_port;
	uint32_t new_sum = new_port;
	int w;

	for (w = 0; w < 8; w++) {
		old_sum += oaddr[w];
		new_sum += naddr[w];
	}

	/* Fold the carries back in until the old sum fits in 16 bits. */
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
450 
/*
 * Full NAT checksum adjustment for IPv4, where two addresses and two ports
 * change at once.  Each address argument points to the two 16-bit words of
 * an IPv4 address.  The value stored in *adj_sum is the 16-bit one's
 * complement of the folded sum of both old address/port pairs, plus the
 * (unfolded) sum of both new address/port pairs.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum, new_sum;

	old_sum = oaddr1[0] + oaddr1[1] + old_port1;
	old_sum += oaddr2[0] + oaddr2[1] + old_port2;

	new_sum = naddr1[0] + naddr1[1] + new_port1;
	new_sum += naddr2[0] + naddr2[1] + new_port2;

	/* Fold the carries back in until the old sum fits in 16 bits. */
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
465 
/*
 * Full NAT checksum adjustment for IPv6, where two addresses and two ports
 * change at once.  Each address argument points to the eight 16-bit words
 * of an IPv6 address.  The value stored in *adj_sum is the 16-bit one's
 * complement of the folded sum of both old address/port pairs, plus the
 * (unfolded) sum of both new address/port pairs.
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum = old_port1;
	uint32_t new_sum = new_port1;
	int w;

	old_sum += old_port2;
	new_sum += new_port2;
	for (w = 0; w < 8; w++) {
		old_sum += oaddr1[w];
		old_sum += oaddr2[w];
		new_sum += naddr1[w];
		new_sum += naddr2[w];
	}

	/* Fold the carries back in until the old sum fits in 16 bits. */
	while (old_sum > 0xffff)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
488 
489 /*
490  * Add a conn hash entry to the tables.  Note that a conn hash entry
491  * (ilb_conn_t) contains info on both directions.  And there are two hash
492  * tables, one for client to server and the other for server to client.
493  * So the same entry is added to both tables and can be ccessed by two
494  * thread simultaneously.  But each thread will only access data on one
495  * direction, so there is no conflict.
496  */
497 int
498 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
499     in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
500     ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
501 {
502 	ilb_conn_t *connp;
503 	ilb_conn_hash_t *hash;
504 	int i;
505 
506 	connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
507 	if (connp == NULL) {
508 		if (s != NULL) {
509 			if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
510 				ilb_nat_src_entry_t **entry;
511 
512 				entry = s->server->iser_nat_src->src_list;
513 				vmem_free(entry[s->nat_src_idx]->nse_port_arena,
514 				    (void *)(uintptr_t)ntohs(info->nat_sport),
515 				    1);
516 			}
517 			ILB_STICKY_REFRELE(s);
518 		}
519 		return (ENOMEM);
520 	}
521 
522 	connp->conn_l4 = rule->ir_proto;
523 
524 	connp->conn_server = server;
525 	ILB_SERVER_REFHOLD(server);
526 	connp->conn_sticky = s;
527 
528 	connp->conn_rule_cache.topo = rule->ir_topo;
529 	connp->conn_rule_cache.info = *info;
530 
531 	connp->conn_gc = B_FALSE;
532 
533 	connp->conn_expiry = rule->ir_nat_expiry;
534 	connp->conn_cr_time = ddi_get_lbolt64();
535 
536 	/* Client to server info. */
537 	connp->conn_c2s_saddr = *src;
538 	connp->conn_c2s_sport = sport;
539 	connp->conn_c2s_daddr = *dst;
540 	connp->conn_c2s_dport = dport;
541 
542 	connp->conn_c2s_atime = ddi_get_lbolt64();
543 	/* The packet ths triggers this creation should be counted */
544 	connp->conn_c2s_pkt_cnt = 1;
545 	connp->conn_c2s_tcp_fin_sent = B_FALSE;
546 	connp->conn_c2s_tcp_fin_acked = B_FALSE;
547 
548 	/* Server to client info, before NAT */
549 	switch (rule->ir_topo) {
550 	case ILB_TOPO_IMPL_HALF_NAT:
551 		connp->conn_s2c_saddr = info->nat_dst;
552 		connp->conn_s2c_sport = info->nat_dport;
553 		connp->conn_s2c_daddr = *src;
554 		connp->conn_s2c_dport = sport;
555 
556 		/* Pre-calculate checksum changes for both directions */
557 		if (rule->ir_ipver == IPPROTO_IP) {
558 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
559 			    (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
560 			    &connp->conn_c2s_ip_sum);
561 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
562 			    (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
563 			    info->nat_dport, &connp->conn_c2s_tp_sum);
564 			*ip_sum = connp->conn_c2s_ip_sum;
565 			*tp_sum = connp->conn_c2s_tp_sum;
566 
567 			hnat_cksum_v4(
568 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
569 			    (uint16_t *)&dst->s6_addr32[3], 0, 0,
570 			    &connp->conn_s2c_ip_sum);
571 			hnat_cksum_v4(
572 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
573 			    (uint16_t *)&dst->s6_addr32[3],
574 			    info->nat_dport, dport,
575 			    &connp->conn_s2c_tp_sum);
576 		} else {
577 			connp->conn_c2s_ip_sum = 0;
578 			hnat_cksum_v6((uint16_t *)dst,
579 			    (uint16_t *)&info->nat_dst, dport,
580 			    info->nat_dport, &connp->conn_c2s_tp_sum);
581 			*ip_sum = 0;
582 			*tp_sum = connp->conn_c2s_tp_sum;
583 
584 			connp->conn_s2c_ip_sum = 0;
585 			hnat_cksum_v6((uint16_t *)&info->nat_dst,
586 			    (uint16_t *)dst, info->nat_dport, dport,
587 			    &connp->conn_s2c_tp_sum);
588 		}
589 		break;
590 	case ILB_TOPO_IMPL_NAT:
591 		connp->conn_s2c_saddr = info->nat_dst;
592 		connp->conn_s2c_sport = info->nat_dport;
593 		connp->conn_s2c_daddr = info->nat_src;
594 		connp->conn_s2c_dport = info->nat_sport;
595 
596 		if (rule->ir_ipver == IPPROTO_IP) {
597 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
598 			    (uint16_t *)&dst->s6_addr32[3],
599 			    (uint16_t *)&info->nat_src.s6_addr32[3],
600 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
601 			    0, 0, 0, 0, &connp->conn_c2s_ip_sum);
602 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
603 			    (uint16_t *)&dst->s6_addr32[3],
604 			    (uint16_t *)&info->nat_src.s6_addr32[3],
605 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
606 			    sport, dport, info->nat_sport,
607 			    info->nat_dport, &connp->conn_c2s_tp_sum);
608 			*ip_sum = connp->conn_c2s_ip_sum;
609 			*tp_sum = connp->conn_c2s_tp_sum;
610 
611 			fnat_cksum_v4(
612 			    (uint16_t *)&info->nat_src.s6_addr32[3],
613 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
614 			    (uint16_t *)&src->s6_addr32[3],
615 			    (uint16_t *)&dst->s6_addr32[3],
616 			    0, 0, 0, 0, &connp->conn_s2c_ip_sum);
617 			fnat_cksum_v4(
618 			    (uint16_t *)&info->nat_src.s6_addr32[3],
619 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
620 			    (uint16_t *)&src->s6_addr32[3],
621 			    (uint16_t *)&dst->s6_addr32[3],
622 			    info->nat_sport, info->nat_dport,
623 			    sport, dport, &connp->conn_s2c_tp_sum);
624 		} else {
625 			fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
626 			    (uint16_t *)&info->nat_src,
627 			    (uint16_t *)&info->nat_dst,
628 			    sport, dport, info->nat_sport,
629 			    info->nat_dport, &connp->conn_c2s_tp_sum);
630 			connp->conn_c2s_ip_sum = 0;
631 			*ip_sum = 0;
632 			*tp_sum = connp->conn_c2s_tp_sum;
633 
634 			fnat_cksum_v6((uint16_t *)&info->nat_src,
635 			    (uint16_t *)&info->nat_dst, (uint16_t *)src,
636 			    (uint16_t *)dst, info->nat_sport,
637 			    info->nat_dport, sport, dport,
638 			    &connp->conn_s2c_tp_sum);
639 			connp->conn_s2c_ip_sum = 0;
640 		}
641 		break;
642 	}
643 
644 	connp->conn_s2c_atime = ddi_get_lbolt64();
645 	connp->conn_s2c_pkt_cnt = 1;
646 	connp->conn_s2c_tcp_fin_sent = B_FALSE;
647 	connp->conn_s2c_tcp_fin_acked = B_FALSE;
648 
649 	/* Add it to the s2c hash table. */
650 	hash = ilbs->ilbs_s2c_conn_hash;
651 	i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
652 	    ntohs(connp->conn_s2c_sport),
653 	    (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
654 	    ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
655 	connp->conn_s2c_hash = &hash[i];
656 	DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
657 
658 	mutex_enter(&hash[i].ilb_conn_hash_lock);
659 	hash[i].ilb_conn_cnt++;
660 	connp->conn_s2c_next = hash[i].ilb_connp;
661 	if (hash[i].ilb_connp != NULL)
662 		hash[i].ilb_connp->conn_s2c_prev = connp;
663 	connp->conn_s2c_prev = NULL;
664 	hash[i].ilb_connp = connp;
665 	mutex_exit(&hash[i].ilb_conn_hash_lock);
666 
667 	/* Add it to the c2s hash table. */
668 	hash = ilbs->ilbs_c2s_conn_hash;
669 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
670 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
671 	    ilbs->ilbs_conn_hash_size);
672 	connp->conn_c2s_hash = &hash[i];
673 	DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
674 
675 	mutex_enter(&hash[i].ilb_conn_hash_lock);
676 	hash[i].ilb_conn_cnt++;
677 	connp->conn_c2s_next = hash[i].ilb_connp;
678 	if (hash[i].ilb_connp != NULL)
679 		hash[i].ilb_connp->conn_c2s_prev = connp;
680 	connp->conn_c2s_prev = NULL;
681 	hash[i].ilb_connp = connp;
682 	mutex_exit(&hash[i].ilb_conn_hash_lock);
683 
684 	return (0);
685 }
686 
687 /*
688  * If a connection is using TCP, we keep track of simple TCP state transition
689  * so that we know when to clean up an entry.
690  */
691 static boolean_t
692 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
693     boolean_t c2s)
694 {
695 	uint32_t ack, seq;
696 	int32_t seg_len;
697 
698 	if (tcpha->tha_flags & TH_RST)
699 		return (B_FALSE);
700 
701 	seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
702 	    TCP_HDR_LENGTH((tcph_t *)tcpha);
703 
704 	if (tcpha->tha_flags & TH_ACK)
705 		ack = ntohl(tcpha->tha_ack);
706 	seq = ntohl(tcpha->tha_seq);
707 	if (c2s) {
708 		ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
709 		if (tcpha->tha_flags & TH_FIN) {
710 			connp->conn_c2s_tcp_fss = seq + seg_len;
711 			connp->conn_c2s_tcp_fin_sent = B_TRUE;
712 		}
713 		connp->conn_c2s_tcp_ack = ack;
714 
715 		/* Port reuse by the client, restart the conn. */
716 		if (connp->conn_c2s_tcp_fin_sent &&
717 		    SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
718 			connp->conn_c2s_tcp_fin_sent = B_FALSE;
719 			connp->conn_c2s_tcp_fin_acked = B_FALSE;
720 		}
721 	} else {
722 		ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
723 		if (tcpha->tha_flags & TH_FIN) {
724 			connp->conn_s2c_tcp_fss = seq + seg_len;
725 			connp->conn_s2c_tcp_fin_sent = B_TRUE;
726 		}
727 		connp->conn_s2c_tcp_ack = ack;
728 
729 		/* Port reuse by the client, restart the conn. */
730 		if (connp->conn_s2c_tcp_fin_sent &&
731 		    SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
732 			connp->conn_s2c_tcp_fin_sent = B_FALSE;
733 			connp->conn_s2c_tcp_fin_acked = B_FALSE;
734 		}
735 	}
736 
737 	return (B_TRUE);
738 }
739 
740 /*
741  * Helper routint to find conn hash entry given some packet information and
742  * the traffic direction (c2s, client to server?)
743  */
744 static boolean_t
745 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
746     in_port_t sport, in6_addr_t *dst, in_port_t dport,
747     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
748     int32_t pkt_len, boolean_t c2s)
749 {
750 	ilb_conn_hash_t *hash;
751 	uint_t i;
752 	ilb_conn_t *connp;
753 	boolean_t tcp_alive;
754 	boolean_t ret = B_FALSE;
755 
756 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
757 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
758 	    ilbs->ilbs_conn_hash_size);
759 	if (c2s) {
760 		hash = ilbs->ilbs_c2s_conn_hash;
761 		mutex_enter(&hash[i].ilb_conn_hash_lock);
762 		for (connp = hash[i].ilb_connp; connp != NULL;
763 		    connp = connp->conn_c2s_next) {
764 			if (connp->conn_l4 == l4 &&
765 			    connp->conn_c2s_dport == dport &&
766 			    connp->conn_c2s_sport == sport &&
767 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
768 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
769 				connp->conn_c2s_atime = ddi_get_lbolt64();
770 				connp->conn_c2s_pkt_cnt++;
771 				*rule_cache = connp->conn_rule_cache;
772 				*ip_sum = connp->conn_c2s_ip_sum;
773 				*tp_sum = connp->conn_c2s_tp_sum;
774 				ret = B_TRUE;
775 				break;
776 			}
777 		}
778 	} else {
779 		hash = ilbs->ilbs_s2c_conn_hash;
780 		mutex_enter(&hash[i].ilb_conn_hash_lock);
781 		for (connp = hash[i].ilb_connp; connp != NULL;
782 		    connp = connp->conn_s2c_next) {
783 			if (connp->conn_l4 == l4 &&
784 			    connp->conn_s2c_dport == dport &&
785 			    connp->conn_s2c_sport == sport &&
786 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
787 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
788 				connp->conn_s2c_atime = ddi_get_lbolt64();
789 				connp->conn_s2c_pkt_cnt++;
790 				*rule_cache = connp->conn_rule_cache;
791 				*ip_sum = connp->conn_s2c_ip_sum;
792 				*tp_sum = connp->conn_s2c_tp_sum;
793 				ret = B_TRUE;
794 				break;
795 			}
796 		}
797 	}
798 	if (ret) {
799 		ILB_S_KSTAT(connp->conn_server, pkt_processed);
800 		ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
801 		    pkt_len);
802 
803 		switch (l4) {
804 		case (IPPROTO_TCP):
805 			tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
806 			    c2s);
807 			if (!tcp_alive) {
808 				connp->conn_gc = B_TRUE;
809 			}
810 			break;
811 		default:
812 			break;
813 		}
814 	}
815 	mutex_exit(&hash[i].ilb_conn_hash_lock);
816 
817 	return (ret);
818 }
819 
820 /*
821  * To check if a give packet matches an existing conn hash entry.  If it
822  * does, return the information about this entry so that the caller can
823  * do the proper NAT.
824  */
825 boolean_t
826 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
827     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
828     uint32_t pkt_len, in6_addr_t *lb_dst)
829 {
830 	ilb_rule_info_t rule_cache;
831 	uint32_t adj_ip_sum, adj_tp_sum;
832 	boolean_t ret;
833 
834 	/* Check the incoming hash table. */
835 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
836 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
837 		switch (rule_cache.topo) {
838 		case ILB_TOPO_IMPL_NAT:
839 			*lb_dst = rule_cache.info.nat_dst;
840 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
841 			    adj_ip_sum, adj_tp_sum, B_TRUE);
842 			ret = B_TRUE;
843 			break;
844 		case ILB_TOPO_IMPL_HALF_NAT:
845 			*lb_dst = rule_cache.info.nat_dst;
846 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
847 			    adj_ip_sum, adj_tp_sum, B_TRUE);
848 			ret = B_TRUE;
849 			break;
850 		default:
851 			ret = B_FALSE;
852 			break;
853 		}
854 		return (ret);
855 	}
856 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
857 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
858 		switch (rule_cache.topo) {
859 		case ILB_TOPO_IMPL_NAT:
860 			*lb_dst = rule_cache.info.src;
861 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
862 			    adj_ip_sum, adj_tp_sum, B_FALSE);
863 			ret = B_TRUE;
864 			break;
865 		case ILB_TOPO_IMPL_HALF_NAT:
866 			*lb_dst = *dst;
867 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
868 			    adj_ip_sum, adj_tp_sum, B_FALSE);
869 			ret = B_TRUE;
870 			break;
871 		default:
872 			ret = B_FALSE;
873 			break;
874 		}
875 		return (ret);
876 	}
877 
878 	return (B_FALSE);
879 }
880 
881 /*
882  * To check if an ICMP packet belongs to a connection in one of the conn
883  * hash entries.
884  */
885 boolean_t
886 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
887     void *icmph, in6_addr_t *lb_dst)
888 {
889 	ilb_conn_hash_t *hash;
890 	ipha_t *in_iph4;
891 	ip6_t *in_iph6;
892 	icmph_t *icmph4;
893 	icmp6_t *icmph6;
894 	in6_addr_t *in_src_p, *in_dst_p;
895 	in_port_t *sport, *dport;
896 	int l4;
897 	uint_t i;
898 	ilb_conn_t *connp;
899 	ilb_rule_info_t rule_cache;
900 	uint32_t adj_ip_sum;
901 	boolean_t full_nat;
902 
903 	if (l3 == IPPROTO_IP) {
904 		in6_addr_t in_src, in_dst;
905 
906 		icmph4 = (icmph_t *)icmph;
907 		in_iph4 = (ipha_t *)&icmph4[1];
908 
909 		if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
910 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
911 			return (B_FALSE);
912 		}
913 
914 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
915 		in_src_p = &in_src;
916 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
917 		in_dst_p = &in_dst;
918 
919 		l4 = in_iph4->ipha_protocol;
920 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
921 			return (B_FALSE);
922 
923 		sport = (in_port_t *)((char *)in_iph4 +
924 		    IPH_HDR_LENGTH(in_iph4));
925 		dport = sport + 1;
926 
927 		DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
928 		    in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
929 		    ntohs(*sport), uint16_t, ntohs(*dport));
930 	} else {
931 		ASSERT(l3 == IPPROTO_IPV6);
932 
933 		icmph6 = (icmp6_t *)icmph;
934 		in_iph6 = (ip6_t *)&icmph6[1];
935 		in_src_p = &in_iph6->ip6_src;
936 		in_dst_p = &in_iph6->ip6_dst;
937 
938 		if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
939 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
940 			return (B_FALSE);
941 		}
942 
943 		l4 = in_iph6->ip6_nxt;
944 		/* We don't go deep inside an IPv6 packet yet. */
945 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
946 			return (B_FALSE);
947 
948 		sport = (in_port_t *)&in_iph6[1];
949 		dport = sport + 1;
950 
951 		DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
952 		    &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
953 		    uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
954 	}
955 
956 	i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
957 	    (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
958 	    ilbs->ilbs_conn_hash_size);
959 	hash = ilbs->ilbs_c2s_conn_hash;
960 
961 	mutex_enter(&hash[i].ilb_conn_hash_lock);
962 	for (connp = hash[i].ilb_connp; connp != NULL;
963 	    connp = connp->conn_c2s_next) {
964 		if (connp->conn_l4 == l4 &&
965 		    connp->conn_c2s_dport == *sport &&
966 		    connp->conn_c2s_sport == *dport &&
967 		    IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
968 		    IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
969 			connp->conn_c2s_atime = ddi_get_lbolt64();
970 			connp->conn_c2s_pkt_cnt++;
971 			rule_cache = connp->conn_rule_cache;
972 			adj_ip_sum = connp->conn_c2s_ip_sum;
973 			break;
974 		}
975 	}
976 	mutex_exit(&hash[i].ilb_conn_hash_lock);
977 
978 	if (connp == NULL) {
979 		DTRACE_PROBE(ilb__chk__icmp__conn__failed);
980 		return (B_FALSE);
981 	}
982 
983 	switch (rule_cache.topo) {
984 	case ILB_TOPO_IMPL_NAT:
985 		full_nat = B_TRUE;
986 		break;
987 	case ILB_TOPO_IMPL_HALF_NAT:
988 		full_nat = B_FALSE;
989 		break;
990 	default:
991 		return (B_FALSE);
992 	}
993 
994 	*lb_dst = rule_cache.info.nat_dst;
995 	if (l3 == IPPROTO_IP) {
996 		ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
997 		    &rule_cache.info, adj_ip_sum, full_nat);
998 	} else {
999 		ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1000 		    &rule_cache.info, full_nat);
1001 	}
1002 	return (B_TRUE);
1003 }
1004 
1005 /*
1006  * This routine sends up the conn hash table to user land.  Note that the
1007  * request is an ioctl, hence we cannot really differentiate requests
1008  * from different clients.  There is no context shared between different
1009  * ioctls.  Here we make the assumption that the user land ilbd will
1010  * only allow one client to show the conn hash table at any time.
1011  * Otherwise, the results will be "very" inconsistent.
1012  *
1013  * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1014  * to read from the beginning of the able.  After a certain entries
1015  * are reported, the kernel remembers the position of the last returned
1016  * entry.  When the next ioctl comes in with the ILB_LIST_BEGIN flag,
1017  * it will return entries starting from where it was left off.  When
1018  * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1019  * the client that there is no more entry.
1020  *
1021  * It is assumed that the caller has checked the size of nat so that it
1022  * can hold num entries.
1023  */
1024 /* ARGSUSED */
1025 int
1026 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1027     uint32_t *num, uint32_t *flags)
1028 {
1029 	ilb_conn_hash_t *hash;
1030 	ilb_conn_t *cur_connp;
1031 	uint32_t i, j;
1032 	int ret = 0;
1033 
1034 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1035 	while (ilbs->ilbs_conn_list_busy) {
1036 		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1037 		    &ilbs->ilbs_conn_list_lock) == 0) {
1038 			mutex_exit(&ilbs->ilbs_conn_list_lock);
1039 			return (EINTR);
1040 		}
1041 	}
1042 	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1043 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1044 		mutex_exit(&ilbs->ilbs_conn_list_lock);
1045 		*num = 0;
1046 		*flags |= ILB_LIST_END;
1047 		return (0);
1048 	}
1049 	ilbs->ilbs_conn_list_busy = B_TRUE;
1050 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1051 
1052 	if (*flags & ILB_LIST_BEGIN) {
1053 		i = 0;
1054 		mutex_enter(&hash[0].ilb_conn_hash_lock);
1055 		cur_connp = hash[0].ilb_connp;
1056 	} else if (*flags & ILB_LIST_CONT) {
1057 		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1058 			*num = 0;
1059 			*flags |= ILB_LIST_END;
1060 			goto done;
1061 		}
1062 		i = ilbs->ilbs_conn_list_cur;
1063 		mutex_enter(&hash[i].ilb_conn_hash_lock);
1064 		cur_connp = ilbs->ilbs_conn_list_connp;
1065 	} else {
1066 		ret = EINVAL;
1067 		goto done;
1068 	}
1069 
1070 	j = 0;
1071 	while (j < *num) {
1072 		if (cur_connp == NULL) {
1073 			mutex_exit(&hash[i].ilb_conn_hash_lock);
1074 			if (++i == ilbs->ilbs_conn_hash_size) {
1075 				*flags |= ILB_LIST_END;
1076 				break;
1077 			}
1078 			mutex_enter(&hash[i].ilb_conn_hash_lock);
1079 			cur_connp = hash[i].ilb_connp;
1080 			continue;
1081 		}
1082 		nat[j].proto = cur_connp->conn_l4;
1083 
1084 		nat[j].in_global = cur_connp->conn_c2s_daddr;
1085 		nat[j].in_global_port = cur_connp->conn_c2s_dport;
1086 		nat[j].out_global = cur_connp->conn_c2s_saddr;
1087 		nat[j].out_global_port = cur_connp->conn_c2s_sport;
1088 
1089 		nat[j].in_local = cur_connp->conn_s2c_saddr;
1090 		nat[j].in_local_port = cur_connp->conn_s2c_sport;
1091 		nat[j].out_local = cur_connp->conn_s2c_daddr;
1092 		nat[j].out_local_port = cur_connp->conn_s2c_dport;
1093 
1094 		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1095 		nat[j].last_access_time =
1096 		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1097 
1098 		/*
1099 		 * The conn_s2c_pkt_cnt may not be accurate since we are not
1100 		 * holding the s2c hash lock.
1101 		 */
1102 		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1103 		    cur_connp->conn_s2c_pkt_cnt;
1104 		j++;
1105 
1106 		cur_connp = cur_connp->conn_c2s_next;
1107 	}
1108 	ilbs->ilbs_conn_list_connp = cur_connp;
1109 	if (j == *num)
1110 		mutex_exit(&hash[i].ilb_conn_hash_lock);
1111 
1112 	ilbs->ilbs_conn_list_cur = i;
1113 
1114 	*num = j;
1115 done:
1116 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1117 	ilbs->ilbs_conn_list_busy = B_FALSE;
1118 	cv_signal(&ilbs->ilbs_conn_list_cv);
1119 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1120 
1121 	return (ret);
1122 }
1123 
1124 
1125 /*
1126  * Stickiness (persistence) handling routines.
1127  */
1128 
1129 
1130 static void
1131 ilb_sticky_cache_init(void)
1132 {
1133 	ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1134 	    sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1135 	    ilb_kmem_flags);
1136 }
1137 
1138 void
1139 ilb_sticky_cache_fini(void)
1140 {
1141 	if (ilb_sticky_cache != NULL) {
1142 		kmem_cache_destroy(ilb_sticky_cache);
1143 		ilb_sticky_cache = NULL;
1144 	}
1145 }
1146 
1147 void
1148 ilb_sticky_refrele(ilb_sticky_t *s)
1149 {
1150 	ILB_STICKY_REFRELE(s);
1151 }
1152 
1153 static ilb_sticky_t *
1154 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1155 {
1156 	ilb_sticky_t *s;
1157 
1158 	ASSERT(mutex_owned(&hash->sticky_lock));
1159 
1160 	for (s = list_head(&hash->sticky_head); s != NULL;
1161 	    s = list_next(&hash->sticky_head, s)) {
1162 		if (s->rule_instance == rule->ir_ks_instance) {
1163 			if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1164 				return (s);
1165 		}
1166 	}
1167 	return (NULL);
1168 }
1169 
1170 static ilb_sticky_t *
1171 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1172     in6_addr_t *src)
1173 {
1174 	ilb_sticky_t *s;
1175 
1176 	ASSERT(mutex_owned(&hash->sticky_lock));
1177 
1178 	if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1179 		return (NULL);
1180 
1181 	/*
1182 	 * The rule instance is for handling the scenario when the same
1183 	 * client talks to different rules at the same time.  Stickiness
1184 	 * is per rule so we can use the rule instance to differentiate
1185 	 * the client's request.
1186 	 */
1187 	s->rule_instance = rule->ir_ks_instance;
1188 	/*
1189 	 * Copy the rule name for listing all sticky cache entry.  ir_name
1190 	 * is guaranteed to be NULL terminated.
1191 	 */
1192 	(void) strcpy(s->rule_name, rule->ir_name);
1193 	s->server = server;
1194 
1195 	/*
1196 	 * Grab a ref cnt on the server so that it won't go away while
1197 	 * it is still in the sticky table.
1198 	 */
1199 	ILB_SERVER_REFHOLD(server);
1200 	s->src = *src;
1201 	s->expiry = rule->ir_sticky_expiry;
1202 	s->refcnt = 1;
1203 	s->hash = hash;
1204 
1205 	/*
1206 	 * There is no need to set atime here since the refcnt is not
1207 	 * zero.  A sticky entry is removed only when the refcnt is
1208 	 * zero.  But just set it here for debugging purpose.  The
1209 	 * atime is set when a refrele is done on a sticky entry.
1210 	 */
1211 	s->atime = ddi_get_lbolt64();
1212 
1213 	list_insert_head(&hash->sticky_head, s);
1214 	hash->sticky_cnt++;
1215 	return (s);
1216 }
1217 
1218 /*
1219  * This routine checks if there is an existing sticky entry which matches
1220  * a given packet.  If there is one, return it.  If there is not, create
1221  * a sticky entry using the packet's info.
1222  */
1223 ilb_server_t *
1224 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1225     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1226 {
1227 	int i;
1228 	ilb_sticky_hash_t *hash;
1229 	ilb_sticky_t *s;
1230 
1231 	ASSERT(server != NULL);
1232 
1233 	*res = NULL;
1234 
1235 	i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1236 	    (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1237 	hash = &ilbs->ilbs_sticky_hash[i];
1238 
1239 	/* First check if there is already an entry. */
1240 	mutex_enter(&hash->sticky_lock);
1241 	s = ilb_sticky_lookup(hash, rule, src);
1242 
1243 	/* No sticky entry, add one. */
1244 	if (s == NULL) {
1245 add_new_entry:
1246 		s = ilb_sticky_add(hash, rule, server, src);
1247 		if (s == NULL) {
1248 			mutex_exit(&hash->sticky_lock);
1249 			return (NULL);
1250 		}
1251 		/*
1252 		 * Find a source for this server.  All subseqent requests from
1253 		 * the same client matching this sticky entry will use this
1254 		 * source address in doing NAT.  The current algorithm is
1255 		 * simple, rotate the source address.  Note that the
1256 		 * source address array does not change after it's created, so
1257 		 * it is OK to just increment the cur index.
1258 		 */
1259 		if (server->iser_nat_src != NULL) {
1260 			/* It is a hint, does not need to be atomic. */
1261 			*src_ent_idx = (server->iser_nat_src->cur++ %
1262 			    server->iser_nat_src->num_src);
1263 			s->nat_src_idx = *src_ent_idx;
1264 		}
1265 		mutex_exit(&hash->sticky_lock);
1266 		*res = s;
1267 		return (server);
1268 	}
1269 
1270 	/*
1271 	 * We don't hold any lock accessing iser_enabled.  Refer to the
1272 	 * comment in ilb_server_add() about iser_lock.
1273 	 */
1274 	if (!s->server->iser_enabled) {
1275 		/*
1276 		 * s->server == server can only happen if there is a race in
1277 		 * toggling the iser_enabled flag (we don't hold a lock doing
1278 		 * that) so that the load balance algorithm still returns a
1279 		 * disabled server.  In this case, just drop the packet...
1280 		 */
1281 		if (s->server == server) {
1282 			mutex_exit(&hash->sticky_lock);
1283 			return (NULL);
1284 		}
1285 
1286 		/*
1287 		 * The old server is disabled and there is a new server, use
1288 		 * the new one to create a sticky entry.  Since we will
1289 		 * add the entry at the beginning, subsequent lookup will
1290 		 * find this new entry instead of the old one.
1291 		 */
1292 		goto add_new_entry;
1293 	}
1294 
1295 	s->refcnt++;
1296 	*res = s;
1297 	mutex_exit(&hash->sticky_lock);
1298 	if (server->iser_nat_src != NULL)
1299 		*src_ent_idx = s->nat_src_idx;
1300 	return (s->server);
1301 }
1302 
1303 static void
1304 ilb_sticky_cleanup(void *arg)
1305 {
1306 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1307 	uint32_t i;
1308 	ilb_stack_t *ilbs;
1309 	ilb_sticky_hash_t *hash;
1310 	ilb_sticky_t *s, *nxt_s;
1311 	int64_t now, expiry;
1312 
1313 	ilbs = timer->ilbs;
1314 	hash = ilbs->ilbs_sticky_hash;
1315 	ASSERT(hash != NULL);
1316 
1317 	now = ddi_get_lbolt64();
1318 	for (i = timer->start; i < timer->end; i++) {
1319 		mutex_enter(&hash[i].sticky_lock);
1320 		for (s = list_head(&hash[i].sticky_head); s != NULL;
1321 		    s = nxt_s) {
1322 			nxt_s = list_next(&hash[i].sticky_head, s);
1323 			if (s->refcnt != 0)
1324 				continue;
1325 			expiry = now - SEC_TO_TICK(s->expiry);
1326 			if (s->atime < expiry) {
1327 				ILB_SERVER_REFRELE(s->server);
1328 				list_remove(&hash[i].sticky_head, s);
1329 				kmem_cache_free(ilb_sticky_cache, s);
1330 				hash[i].sticky_cnt--;
1331 			}
1332 		}
1333 		mutex_exit(&hash[i].sticky_lock);
1334 	}
1335 }
1336 
1337 static void
1338 ilb_sticky_timer(void *arg)
1339 {
1340 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1341 
1342 	(void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1343 	    ilb_sticky_cleanup, arg, TQ_SLEEP);
1344 	mutex_enter(&timer->tid_lock);
1345 	if (timer->tid == 0) {
1346 		mutex_exit(&timer->tid_lock);
1347 	} else {
1348 		timer->tid = timeout(ilb_sticky_timer, arg,
1349 		    SEC_TO_TICK(ilb_sticky_timeout));
1350 		mutex_exit(&timer->tid_lock);
1351 	}
1352 }
1353 
1354 void
1355 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1356 {
1357 	extern pri_t minclsyspri;
1358 	int i, part;
1359 	char tq_name[TASKQ_NAMELEN];
1360 	ilb_timer_t *tm;
1361 
1362 	if (ilbs->ilbs_sticky_hash_size & (ilbs->ilbs_sticky_hash_size - 1)) {
1363 		for (i = 0; i < 31; i++) {
1364 			if (ilbs->ilbs_sticky_hash_size < (1 << i))
1365 				break;
1366 		}
1367 		ilbs->ilbs_sticky_hash_size = 1 << i;
1368 	}
1369 
1370 	ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1371 	    ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1372 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1373 		mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1374 		    MUTEX_DEFAULT, NULL);
1375 		list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1376 		    sizeof (ilb_sticky_t),
1377 		    offsetof(ilb_sticky_t, list));
1378 	}
1379 
1380 	if (ilb_sticky_cache == NULL)
1381 		ilb_sticky_cache_init();
1382 
1383 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1384 	    (void *)ilbs->ilbs_netstack);
1385 	ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1386 	ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1387 	    ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1388 	    ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1389 
1390 	ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1391 	ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1392 	    ilb_sticky_timer_size, KM_SLEEP);
1393 	part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1394 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1395 		tm = ilbs->ilbs_sticky_timer_list + i;
1396 		tm->start = i * part;
1397 		tm->end = i * part + part;
1398 		if (tm->end > ilbs->ilbs_sticky_hash_size)
1399 			tm->end = ilbs->ilbs_sticky_hash_size;
1400 		tm->ilbs = ilbs;
1401 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1402 		/* Spread out the starting execution time of all the timers. */
1403 		tm->tid = timeout(ilb_sticky_timer, tm,
1404 		    SEC_TO_TICK(ilb_sticky_timeout + i));
1405 	}
1406 }
1407 
1408 void
1409 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1410 {
1411 	int i;
1412 	ilb_sticky_t *s;
1413 
1414 	if (ilbs->ilbs_sticky_hash == NULL)
1415 		return;
1416 
1417 	/* Stop all the timers first. */
1418 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1419 		timeout_id_t tid;
1420 
1421 		/* Setting tid to 0 tells the timer handler not to restart. */
1422 		mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1423 		tid = ilbs->ilbs_sticky_timer_list[i].tid;
1424 		ilbs->ilbs_sticky_timer_list[i].tid = 0;
1425 		mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1426 		(void) untimeout(tid);
1427 	}
1428 	kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1429 	    ilb_sticky_timer_size);
1430 	taskq_destroy(ilbs->ilbs_sticky_taskq);
1431 	ilbs->ilbs_sticky_taskq = NULL;
1432 
1433 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1434 		while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1435 		    != NULL) {
1436 			list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1437 			ILB_SERVER_REFRELE(s->server);
1438 			kmem_free(s, sizeof (ilb_sticky_t));
1439 		}
1440 	}
1441 	kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1442 	    sizeof (ilb_sticky_hash_t));
1443 }
1444 
1445 /*
1446  * This routine sends up the sticky hash table to user land.  Refer to
1447  * the comments before ilb_list_nat().  Both routines assume similar
1448  * conditions.
1449  *
1450  * It is assumed that the caller has checked the size of st so that it
1451  * can hold num entries.
1452  */
1453 /* ARGSUSED */
1454 int
1455 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1456     uint32_t *num, uint32_t *flags)
1457 {
1458 	ilb_sticky_hash_t *hash;
1459 	ilb_sticky_t *curp;
1460 	uint32_t i, j;
1461 	int ret = 0;
1462 
1463 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1464 	while (ilbs->ilbs_sticky_list_busy) {
1465 		if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1466 		    &ilbs->ilbs_sticky_list_lock) == 0) {
1467 			mutex_exit(&ilbs->ilbs_sticky_list_lock);
1468 			return (EINTR);
1469 		}
1470 	}
1471 	if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1472 		mutex_exit(&ilbs->ilbs_sticky_list_lock);
1473 		*num = 0;
1474 		*flags |= ILB_LIST_END;
1475 		return (0);
1476 	}
1477 	ilbs->ilbs_sticky_list_busy = B_TRUE;
1478 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1479 
1480 	if (*flags & ILB_LIST_BEGIN) {
1481 		i = 0;
1482 		mutex_enter(&hash[0].sticky_lock);
1483 		curp = list_head(&hash[0].sticky_head);
1484 	} else if (*flags & ILB_LIST_CONT) {
1485 		if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1486 			*num = 0;
1487 			*flags |= ILB_LIST_END;
1488 			goto done;
1489 		}
1490 		i = ilbs->ilbs_sticky_list_cur;
1491 		mutex_enter(&hash[i].sticky_lock);
1492 		curp = ilbs->ilbs_sticky_list_curp;
1493 	} else {
1494 		ret = EINVAL;
1495 		goto done;
1496 	}
1497 
1498 	j = 0;
1499 	while (j < *num) {
1500 		if (curp == NULL) {
1501 			mutex_exit(&hash[i].sticky_lock);
1502 			if (++i == ilbs->ilbs_sticky_hash_size) {
1503 				*flags |= ILB_LIST_END;
1504 				break;
1505 			}
1506 			mutex_enter(&hash[i].sticky_lock);
1507 			curp = list_head(&hash[i].sticky_head);
1508 			continue;
1509 		}
1510 		(void) strcpy(st[j].rule_name, curp->rule_name);
1511 		st[j].req_addr = curp->src;
1512 		st[j].srv_addr = curp->server->iser_addr_v6;
1513 		st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1514 		j++;
1515 		curp = list_next(&hash[i].sticky_head, curp);
1516 	}
1517 	ilbs->ilbs_sticky_list_curp = curp;
1518 	if (j == *num)
1519 		mutex_exit(&hash[i].sticky_lock);
1520 
1521 	ilbs->ilbs_sticky_list_cur = i;
1522 
1523 	*num = j;
1524 done:
1525 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1526 	ilbs->ilbs_sticky_list_busy = B_FALSE;
1527 	cv_signal(&ilbs->ilbs_sticky_list_cv);
1528 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1529 
1530 	return (ret);
1531 }
1532