xref: /titanic_51/usr/src/uts/common/inet/ilb/ilb_conn.c (revision c029eafbb040b81649027c5ae5a38e92d214461b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/sysmacros.h>
28 #include <sys/types.h>
29 #include <sys/conf.h>
30 #include <sys/time.h>
31 #include <sys/taskq.h>
32 #include <sys/cmn_err.h>
33 #include <sys/sdt.h>
34 #include <sys/atomic.h>
35 #include <netinet/in.h>
36 #include <inet/ip.h>
37 #include <inet/ip6.h>
38 #include <inet/tcp.h>
39 #include <inet/udp_impl.h>
40 #include <inet/ilb.h>
41 
42 #include "ilb_stack.h"
43 #include "ilb_impl.h"
44 #include "ilb_conn.h"
45 #include "ilb_nat.h"
46 
/*
 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection.
 *
 * One of these exists per gc timer.  It records which slice of the hash
 * table the timer is responsible for ([start, end), as used by the gc
 * loop) and carries the timeout id needed to cancel or re-arm the timer.
 */
typedef struct ilb_timer_s {
	uint32_t	start;		/* first hash index this timer gc's */
	uint32_t	end;		/* one past the last index to gc */
	ilb_stack_t	*ilbs;		/* ilb_stack_t of the IP stack */
	kmutex_t	tid_lock;	/* protects tid */
	timeout_id_t	tid;		/* timer id; 0 means "do not restart" */
} ilb_timer_t;
63 
/*
 * Hash macro for finding the index to the conn hash table.
 *
 * saddr and daddr are indexed as 4-element arrays (the callers in this
 * file pass uint8_t pointers to the last 32 bits of each address).  The
 * per-byte XORs of the two addresses are mixed with powers of 37, the
 * ports are added in, and the result is masked with (hash_size - 1), so
 * hash_size must be a power of 2.
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	((((saddr)[3] ^ (daddr)[3]) * 50653 +			\
	((saddr)[2] ^ (daddr)[2]) * 1369 +			\
	((saddr)[1] ^ (daddr)[1]) * 37 +			\
	((saddr)[0] ^ (daddr)[0]) +				\
	(sport) * 37 + (dport)) &				\
	((hash_size) - 1))
71 
/*
 * Kmem cache for the conn hash entry (ilb_conn_t).  Created by
 * ilb_conn_cache_init() and destroyed by ilb_conn_cache_fini().
 */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the conn hash table.  This value
 * also sizes the per-stack timer array and the gc taskq.
 */
static int ilb_conn_timer_size = 60;

/*
 * Each of the above gc timers wakes up every 15s to do the gc.  The value
 * is in seconds; it is converted with SEC_TO_TICK() when a timer is armed.
 */
static int ilb_conn_cache_timeout = 15;
83 
/*
 * Hash macro for finding the index to the sticky hash table.
 *
 * saddr is indexed as a 4-element array and each element is XORed with
 * a progressively less shifted copy of rule; the terms are mixed with
 * powers of 31 and masked with (hash_size - 1), so hash_size must be a
 * power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	((((saddr)[3] ^ ((rule) >> 24)) * 29791 +		\
	((saddr)[2] ^ ((rule) >> 16)) * 961 +			\
	((saddr)[1] ^ ((rule) >> 8)) * 31 +			\
	((saddr)[0] ^ (rule))) & ((hash_size) - 1))
89 
/*
 * Kmem cache pointer for sticky entries.
 * NOTE(review): its creation and use are outside the part of the file
 * visible here -- presumably it backs ilb_sticky_t allocations; confirm.
 */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
100 
/*
 * Drop one reference on sticky entry s and stamp its access time with
 * the current lbolt, both under the entry's hash bucket lock.
 *
 * Every use of the argument is parenthesized so that an arbitrary
 * pointer expression can be passed.  Note that s is evaluated more than
 * once, so it must not have side effects.
 */
#define	ILB_STICKY_REFRELE(s)			\
{						\
	mutex_enter(&(s)->hash->sticky_lock);	\
	(s)->refcnt--;				\
	(s)->atime = ddi_get_lbolt64();		\
	mutex_exit(&(s)->hash->sticky_lock);	\
}
108 
109 
110 static void
111 ilb_conn_cache_init(void)
112 {
113 	ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
114 	    sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
115 	    ilb_kmem_flags);
116 }
117 
118 void
119 ilb_conn_cache_fini(void)
120 {
121 	if (ilb_conn_cache != NULL) {
122 		kmem_cache_destroy(ilb_conn_cache);
123 		ilb_conn_cache = NULL;
124 	}
125 }
126 
127 static void
128 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
129 {
130 	ilb_conn_hash_t *hash;
131 	ilb_conn_t **next, **prev;
132 	ilb_conn_t **next_prev, **prev_next;
133 
134 	if (c2s) {
135 		hash = connp->conn_c2s_hash;
136 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
137 		next = &connp->conn_c2s_next;
138 		prev = &connp->conn_c2s_prev;
139 		if (*next != NULL)
140 			next_prev = &(*next)->conn_c2s_prev;
141 		if (*prev != NULL)
142 			prev_next = &(*prev)->conn_c2s_next;
143 	} else {
144 		hash = connp->conn_s2c_hash;
145 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
146 		next = &connp->conn_s2c_next;
147 		prev = &connp->conn_s2c_prev;
148 		if (*next != NULL)
149 			next_prev = &(*next)->conn_s2c_prev;
150 		if (*prev != NULL)
151 			prev_next = &(*prev)->conn_s2c_next;
152 	}
153 
154 	if (hash->ilb_connp == connp) {
155 		hash->ilb_connp = *next;
156 		if (*next != NULL)
157 			*next_prev = NULL;
158 	} else {
159 		if (*prev != NULL)
160 			*prev_next = *next;
161 		if (*next != NULL)
162 			*next_prev = *prev;
163 	}
164 	ASSERT(hash->ilb_conn_cnt > 0);
165 	hash->ilb_conn_cnt--;
166 
167 	*next = NULL;
168 	*prev = NULL;
169 }
170 
171 static void
172 ilb_conn_remove(ilb_conn_t *connp)
173 {
174 	ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
175 	ilb_conn_remove_common(connp, B_TRUE);
176 	ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
177 	ilb_conn_remove_common(connp, B_FALSE);
178 
179 	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
180 		in_port_t port;
181 
182 		port = ntohs(connp->conn_rule_cache.info.nat_sport);
183 		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
184 		    (void *)(uintptr_t)port, 1);
185 	}
186 
187 	if (connp->conn_sticky != NULL)
188 		ILB_STICKY_REFRELE(connp->conn_sticky);
189 	ILB_SERVER_REFRELE(connp->conn_server);
190 	kmem_cache_free(ilb_conn_cache, connp);
191 }
192 
193 /*
194  * Routine to do periodic garbage collection of conn hash entries.  When
195  * a conn hash timer fires, it dispatches a taskq to call this function
196  * to do the gc.  Note that each taskq is responisble for a portion of
197  * the table.  The portion is stored in timer->start, timer->end.
198  */
199 static void
200 ilb_conn_cleanup(void *arg)
201 {
202 	ilb_timer_t *timer = (ilb_timer_t *)arg;
203 	uint32_t i;
204 	ilb_stack_t *ilbs;
205 	ilb_conn_hash_t *c2s_hash, *s2c_hash;
206 	ilb_conn_t *connp, *nxt_connp;
207 	int64_t now;
208 	int64_t expiry;
209 	boolean_t die_now;
210 
211 	ilbs = timer->ilbs;
212 	c2s_hash = ilbs->ilbs_c2s_conn_hash;
213 	ASSERT(c2s_hash != NULL);
214 
215 	now = ddi_get_lbolt64();
216 	for (i = timer->start; i < timer->end; i++) {
217 		mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
218 		if ((connp = c2s_hash[i].ilb_connp) == NULL) {
219 			ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
220 			mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
221 			continue;
222 		}
223 		do {
224 			ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
225 			ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
226 			nxt_connp = connp->conn_c2s_next;
227 			expiry = now - SEC_TO_TICK(connp->conn_expiry);
228 			if (connp->conn_server->iser_die_time != 0 &&
229 			    connp->conn_server->iser_die_time < now)
230 				die_now = B_TRUE;
231 			else
232 				die_now = B_FALSE;
233 			s2c_hash = connp->conn_s2c_hash;
234 			mutex_enter(&s2c_hash->ilb_conn_hash_lock);
235 
236 			if (connp->conn_gc || die_now ||
237 			    (connp->conn_c2s_atime < expiry &&
238 			    connp->conn_s2c_atime < expiry)) {
239 				/* Need to update the nat list cur_connp */
240 				if (connp == ilbs->ilbs_conn_list_connp) {
241 					ilbs->ilbs_conn_list_connp =
242 					    connp->conn_c2s_next;
243 				}
244 				ilb_conn_remove(connp);
245 				goto nxt_connp;
246 			}
247 
248 			if (connp->conn_l4 != IPPROTO_TCP)
249 				goto nxt_connp;
250 
251 			/* Update and check TCP related conn info */
252 			if (connp->conn_c2s_tcp_fin_sent &&
253 			    SEQ_GT(connp->conn_s2c_tcp_ack,
254 			    connp->conn_c2s_tcp_fss)) {
255 				connp->conn_c2s_tcp_fin_acked = B_TRUE;
256 			}
257 			if (connp->conn_s2c_tcp_fin_sent &&
258 			    SEQ_GT(connp->conn_c2s_tcp_ack,
259 			    connp->conn_s2c_tcp_fss)) {
260 				connp->conn_s2c_tcp_fin_acked = B_TRUE;
261 			}
262 			if (connp->conn_c2s_tcp_fin_acked &&
263 			    connp->conn_s2c_tcp_fin_acked) {
264 				ilb_conn_remove(connp);
265 			}
266 nxt_connp:
267 			mutex_exit(&s2c_hash->ilb_conn_hash_lock);
268 			connp = nxt_connp;
269 		} while (connp != NULL);
270 		mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
271 	}
272 }
273 
274 /* Conn hash timer routine.  It dispatches a taskq and restart the timer */
275 static void
276 ilb_conn_timer(void *arg)
277 {
278 	ilb_timer_t *timer = (ilb_timer_t *)arg;
279 
280 	(void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
281 	    arg, TQ_SLEEP);
282 	mutex_enter(&timer->tid_lock);
283 	if (timer->tid == 0) {
284 		mutex_exit(&timer->tid_lock);
285 	} else {
286 		timer->tid = timeout(ilb_conn_timer, arg,
287 		    SEC_TO_TICK(ilb_conn_cache_timeout));
288 		mutex_exit(&timer->tid_lock);
289 	}
290 }
291 
292 void
293 ilb_conn_hash_init(ilb_stack_t *ilbs)
294 {
295 	extern pri_t minclsyspri;
296 	int i, part;
297 	ilb_timer_t *tm;
298 	char tq_name[TASKQ_NAMELEN];
299 
300 	/*
301 	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
302 	 * the next power of 2.
303 	 */
304 	if (!ISP2(ilbs->ilbs_conn_hash_size)) {
305 		for (i = 0; i < 31; i++) {
306 			if (ilbs->ilbs_conn_hash_size < (1 << i))
307 				break;
308 		}
309 		ilbs->ilbs_conn_hash_size = 1 << i;
310 	}
311 
312 	/*
313 	 * Can sleep since this should be called when a rule is being added,
314 	 * hence we are not in interrupt context.
315 	 */
316 	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
317 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
318 	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
319 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
320 
321 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
322 		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
323 		    NULL, MUTEX_DEFAULT, NULL);
324 	}
325 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
326 		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
327 		    NULL, MUTEX_DEFAULT, NULL);
328 	}
329 
330 	if (ilb_conn_cache == NULL)
331 		ilb_conn_cache_init();
332 
333 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
334 	    (void *)ilbs->ilbs_netstack);
335 	ASSERT(ilbs->ilbs_conn_taskq == NULL);
336 	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
337 	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
338 	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
339 
340 	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
341 	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
342 	    ilb_conn_timer_size, KM_SLEEP);
343 
344 	/*
345 	 * The hash table is divided in equal partition for those timers
346 	 * to do garbage collection.
347 	 */
348 	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
349 	for (i = 0; i < ilb_conn_timer_size; i++) {
350 		tm = ilbs->ilbs_conn_timer_list + i;
351 		tm->start = i * part;
352 		tm->end = i * part + part;
353 		if (tm->end > ilbs->ilbs_conn_hash_size)
354 			tm->end = ilbs->ilbs_conn_hash_size;
355 		tm->ilbs = ilbs;
356 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
357 		/* Spread out the starting execution time of all the timers. */
358 		tm->tid = timeout(ilb_conn_timer, tm,
359 		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
360 	}
361 }
362 
363 void
364 ilb_conn_hash_fini(ilb_stack_t *ilbs)
365 {
366 	uint32_t i;
367 	ilb_conn_t *connp;
368 
369 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
370 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
371 		return;
372 	}
373 
374 	/* Stop all the timers first. */
375 	for (i = 0; i < ilb_conn_timer_size; i++) {
376 		timeout_id_t tid;
377 
378 		/* Setting tid to 0 tells the timer handler not to restart. */
379 		mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
380 		tid = ilbs->ilbs_conn_timer_list[i].tid;
381 		ilbs->ilbs_conn_timer_list[i].tid = 0;
382 		mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
383 		(void) untimeout(tid);
384 	}
385 	kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
386 	    ilb_conn_timer_size);
387 	taskq_destroy(ilbs->ilbs_conn_taskq);
388 	ilbs->ilbs_conn_taskq = NULL;
389 
390 	/* Then remove all the conns. */
391 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
392 		while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
393 			ilbs->ilbs_s2c_conn_hash->ilb_connp =
394 			    connp->conn_s2c_next;
395 			ILB_SERVER_REFRELE(connp->conn_server);
396 			if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
397 				ilb_nat_src_entry_t *ent;
398 				in_port_t port;
399 
400 				/*
401 				 * src_ent will be freed in ilb_nat_src_fini().
402 				 */
403 				port = ntohs(
404 				    connp->conn_rule_cache.info.nat_sport);
405 				ent = connp->conn_rule_cache.info.src_ent;
406 				vmem_free(ent->nse_port_arena,
407 				    (void *)(uintptr_t)port, 1);
408 			}
409 			kmem_cache_free(ilb_conn_cache, connp);
410 		}
411 	}
412 	kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
413 	    ilbs->ilbs_conn_hash_size);
414 	kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
415 	    ilbs->ilbs_conn_hash_size);
416 }
417 
418 /*
419  * Internet checksum adjustment calculation routines.  We pre-calculate
420  * checksum adjustment so that we don't need to compute the checksum on
421  * the whole packet when we change address/port in the packet.
422  */
423 
/*
 * Half-NAT, IPv4: compute the checksum adjustment for replacing one
 * address/port pair with another.  oaddr and naddr each point at the two
 * 16-bit words of an IPv4 address.  The old words and port are summed and
 * folded to 16 bits; *adj_sum receives the one's complement of that fold
 * plus the (unfolded) sum of the new words and port.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum = oaddr[0] + oaddr[1] + old_port;
	uint32_t new_sum = naddr[0] + naddr[1] + new_port;

	/* Fold the carries back in until the sum fits in 16 bits. */
	while ((old_sum >> 16) != 0)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
435 
/*
 * Half-NAT, IPv6: same computation as the IPv4 variant, but oaddr and
 * naddr each point at the eight 16-bit words of an IPv6 address.
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t old_sum = old_port;
	uint32_t new_sum = new_port;
	int w;

	for (w = 0; w < 8; w++) {
		old_sum += oaddr[w];
		new_sum += naddr[w];
	}

	/* Fold the carries back in until the sum fits in 16 bits. */
	while ((old_sum >> 16) != 0)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
451 
/*
 * Full-NAT, IPv4: compute the checksum adjustment for replacing two
 * address/port pairs at once.  Each address argument points at the two
 * 16-bit words of an IPv4 address.  The old words and ports are summed
 * and folded to 16 bits; *adj_sum receives the one's complement of that
 * fold plus the (unfolded) sum of the new words and ports.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum, new_sum;

	old_sum = oaddr1[0] + oaddr1[1] + old_port1;
	old_sum += oaddr2[0] + oaddr2[1] + old_port2;
	new_sum = naddr1[0] + naddr1[1] + new_port1;
	new_sum += naddr2[0] + naddr2[1] + new_port2;

	/* Fold the carries back in until the sum fits in 16 bits. */
	while ((old_sum >> 16) != 0)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
466 
/*
 * Full-NAT, IPv6: same computation as the IPv4 variant, but every
 * address argument points at the eight 16-bit words of an IPv6 address.
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t old_sum = 0;
	uint32_t new_sum = 0;
	int w;

	for (w = 0; w < 8; w++) {
		old_sum += oaddr1[w];
		old_sum += oaddr2[w];
		new_sum += naddr1[w];
		new_sum += naddr2[w];
	}
	old_sum += old_port1;
	old_sum += old_port2;
	new_sum += new_port1;
	new_sum += new_port2;

	/* Fold the carries back in until the sum fits in 16 bits. */
	while ((old_sum >> 16) != 0)
		old_sum = (old_sum >> 16) + (old_sum & 0xffff);

	*adj_sum = (uint16_t)~old_sum + new_sum;
}
489 
/*
 * Add a conn hash entry to the tables.  Note that a conn hash entry
 * (ilb_conn_t) contains info on both directions.  And there are two hash
 * tables, one for client to server and the other for server to client.
 * So the same entry is added to both tables and can be accessed by two
 * threads simultaneously.  But each thread will only access data on one
 * direction, so there is no conflict.
 *
 * Arguments:
 *   ilbs		ILB stack whose hash tables receive the entry
 *   rule		rule the connection matched; supplies protocol,
 *			topology, IP version and expiry
 *   server		back-end server chosen; a reference is taken on it
 *   src, sport		client address and port (port in network order)
 *   dst, dport		original destination address and port
 *   info		NAT info; copied into the entry
 *   ip_sum, tp_sum	out: client-to-server IP and transport checksum
 *			adjustments (only written for the half-NAT and
 *			full-NAT topologies)
 *   s			sticky entry associated with the conn, or NULL
 *
 * Returns 0 on success.  Returns ENOMEM if no entry can be allocated; in
 * that case, if s is not NULL, the reference on s is dropped and, for
 * full NAT, the NAT source port is returned to its arena.
 */
int
ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
    in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
{
	ilb_conn_t *connp;
	ilb_conn_hash_t *hash;
	int i;

	/* KM_NOSLEEP: the allocation may fail instead of blocking. */
	connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
	if (connp == NULL) {
		if (s != NULL) {
			if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
				ilb_nat_src_entry_t **entry;

				entry = s->server->iser_nat_src->src_list;
				vmem_free(entry[s->nat_src_idx]->nse_port_arena,
				    (void *)(uintptr_t)ntohs(info->nat_sport),
				    1);
			}
			ILB_STICKY_REFRELE(s);
		}
		return (ENOMEM);
	}

	connp->conn_l4 = rule->ir_proto;

	connp->conn_server = server;
	ILB_SERVER_REFHOLD(server);
	connp->conn_sticky = s;

	connp->conn_rule_cache.topo = rule->ir_topo;
	connp->conn_rule_cache.info = *info;

	connp->conn_gc = B_FALSE;

	connp->conn_expiry = rule->ir_nat_expiry;
	connp->conn_cr_time = ddi_get_lbolt64();

	/* Client to server info. */
	connp->conn_c2s_saddr = *src;
	connp->conn_c2s_sport = sport;
	connp->conn_c2s_daddr = *dst;
	connp->conn_c2s_dport = dport;

	connp->conn_c2s_atime = ddi_get_lbolt64();
	/* The packet that triggers this creation should be counted */
	connp->conn_c2s_pkt_cnt = 1;
	connp->conn_c2s_tcp_fin_sent = B_FALSE;
	connp->conn_c2s_tcp_fin_acked = B_FALSE;

	/* Server to client info, before NAT */
	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_HALF_NAT:
		/* Half NAT: only the destination is rewritten. */
		connp->conn_s2c_saddr = info->nat_dst;
		connp->conn_s2c_sport = info->nat_dport;
		connp->conn_s2c_daddr = *src;
		connp->conn_s2c_dport = sport;

		/* Pre-calculate checksum changes for both directions */
		if (rule->ir_ipver == IPPROTO_IP) {
			/* IP header sum covers addresses only: ports are 0. */
			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
			    &connp->conn_c2s_ip_sum);
			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = connp->conn_c2s_ip_sum;
			*tp_sum = connp->conn_c2s_tp_sum;

			/* Reverse direction: old and new are swapped. */
			hnat_cksum_v4(
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3], 0, 0,
			    &connp->conn_s2c_ip_sum);
			hnat_cksum_v4(
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    info->nat_dport, dport,
			    &connp->conn_s2c_tp_sum);
		} else {
			/* No IP header adjustment is kept for IPv6. */
			connp->conn_c2s_ip_sum = 0;
			hnat_cksum_v6((uint16_t *)dst,
			    (uint16_t *)&info->nat_dst, dport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = 0;
			*tp_sum = connp->conn_c2s_tp_sum;

			connp->conn_s2c_ip_sum = 0;
			hnat_cksum_v6((uint16_t *)&info->nat_dst,
			    (uint16_t *)dst, info->nat_dport, dport,
			    &connp->conn_s2c_tp_sum);
		}
		break;
	case ILB_TOPO_IMPL_NAT:
		/* Full NAT: both source and destination are rewritten. */
		connp->conn_s2c_saddr = info->nat_dst;
		connp->conn_s2c_sport = info->nat_dport;
		connp->conn_s2c_daddr = info->nat_src;
		connp->conn_s2c_dport = info->nat_sport;

		if (rule->ir_ipver == IPPROTO_IP) {
			/* IP header sum covers addresses only: ports are 0. */
			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    0, 0, 0, 0, &connp->conn_c2s_ip_sum);
			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    sport, dport, info->nat_sport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = connp->conn_c2s_ip_sum;
			*tp_sum = connp->conn_c2s_tp_sum;

			/* Reverse direction: old and new are swapped. */
			fnat_cksum_v4(
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    0, 0, 0, 0, &connp->conn_s2c_ip_sum);
			fnat_cksum_v4(
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    info->nat_sport, info->nat_dport,
			    sport, dport, &connp->conn_s2c_tp_sum);
		} else {
			fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
			    (uint16_t *)&info->nat_src,
			    (uint16_t *)&info->nat_dst,
			    sport, dport, info->nat_sport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			connp->conn_c2s_ip_sum = 0;
			*ip_sum = 0;
			*tp_sum = connp->conn_c2s_tp_sum;

			fnat_cksum_v6((uint16_t *)&info->nat_src,
			    (uint16_t *)&info->nat_dst, (uint16_t *)src,
			    (uint16_t *)dst, info->nat_sport,
			    info->nat_dport, sport, dport,
			    &connp->conn_s2c_tp_sum);
			connp->conn_s2c_ip_sum = 0;
		}
		break;
	}

	connp->conn_s2c_atime = ddi_get_lbolt64();
	connp->conn_s2c_pkt_cnt = 1;
	connp->conn_s2c_tcp_fin_sent = B_FALSE;
	connp->conn_s2c_tcp_fin_acked = B_FALSE;

	/* Add it to the s2c hash table. */
	hash = ilbs->ilbs_s2c_conn_hash;
	/* Only the last 32 bits of each address feed the hash. */
	i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
	    ntohs(connp->conn_s2c_sport),
	    (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
	    ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
	connp->conn_s2c_hash = &hash[i];
	DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);

	/* Insert at the head of the bucket's doubly linked list. */
	mutex_enter(&hash[i].ilb_conn_hash_lock);
	hash[i].ilb_conn_cnt++;
	connp->conn_s2c_next = hash[i].ilb_connp;
	if (hash[i].ilb_connp != NULL)
		hash[i].ilb_connp->conn_s2c_prev = connp;
	connp->conn_s2c_prev = NULL;
	hash[i].ilb_connp = connp;
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	/* Add it to the c2s hash table. */
	hash = ilbs->ilbs_c2s_conn_hash;
	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
	    ilbs->ilbs_conn_hash_size);
	connp->conn_c2s_hash = &hash[i];
	DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);

	/* Insert at the head of the bucket's doubly linked list. */
	mutex_enter(&hash[i].ilb_conn_hash_lock);
	hash[i].ilb_conn_cnt++;
	connp->conn_c2s_next = hash[i].ilb_connp;
	if (hash[i].ilb_connp != NULL)
		hash[i].ilb_connp->conn_c2s_prev = connp;
	connp->conn_c2s_prev = NULL;
	hash[i].ilb_connp = connp;
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	return (0);
}
687 
688 /*
689  * If a connection is using TCP, we keep track of simple TCP state transition
690  * so that we know when to clean up an entry.
691  */
692 static boolean_t
693 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
694     boolean_t c2s)
695 {
696 	uint32_t ack, seq;
697 	int32_t seg_len;
698 
699 	if (tcpha->tha_flags & TH_RST)
700 		return (B_FALSE);
701 
702 	seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
703 	    TCP_HDR_LENGTH((tcph_t *)tcpha);
704 
705 	if (tcpha->tha_flags & TH_ACK)
706 		ack = ntohl(tcpha->tha_ack);
707 	seq = ntohl(tcpha->tha_seq);
708 	if (c2s) {
709 		ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
710 		if (tcpha->tha_flags & TH_FIN) {
711 			connp->conn_c2s_tcp_fss = seq + seg_len;
712 			connp->conn_c2s_tcp_fin_sent = B_TRUE;
713 		}
714 		connp->conn_c2s_tcp_ack = ack;
715 
716 		/* Port reuse by the client, restart the conn. */
717 		if (connp->conn_c2s_tcp_fin_sent &&
718 		    SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
719 			connp->conn_c2s_tcp_fin_sent = B_FALSE;
720 			connp->conn_c2s_tcp_fin_acked = B_FALSE;
721 		}
722 	} else {
723 		ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
724 		if (tcpha->tha_flags & TH_FIN) {
725 			connp->conn_s2c_tcp_fss = seq + seg_len;
726 			connp->conn_s2c_tcp_fin_sent = B_TRUE;
727 		}
728 		connp->conn_s2c_tcp_ack = ack;
729 
730 		/* Port reuse by the client, restart the conn. */
731 		if (connp->conn_s2c_tcp_fin_sent &&
732 		    SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
733 			connp->conn_s2c_tcp_fin_sent = B_FALSE;
734 			connp->conn_s2c_tcp_fin_acked = B_FALSE;
735 		}
736 	}
737 
738 	return (B_TRUE);
739 }
740 
741 /*
742  * Helper routint to find conn hash entry given some packet information and
743  * the traffic direction (c2s, client to server?)
744  */
745 static boolean_t
746 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
747     in_port_t sport, in6_addr_t *dst, in_port_t dport,
748     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
749     int32_t pkt_len, boolean_t c2s)
750 {
751 	ilb_conn_hash_t *hash;
752 	uint_t i;
753 	ilb_conn_t *connp;
754 	boolean_t tcp_alive;
755 	boolean_t ret = B_FALSE;
756 
757 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
758 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
759 	    ilbs->ilbs_conn_hash_size);
760 	if (c2s) {
761 		hash = ilbs->ilbs_c2s_conn_hash;
762 		mutex_enter(&hash[i].ilb_conn_hash_lock);
763 		for (connp = hash[i].ilb_connp; connp != NULL;
764 		    connp = connp->conn_c2s_next) {
765 			if (connp->conn_l4 == l4 &&
766 			    connp->conn_c2s_dport == dport &&
767 			    connp->conn_c2s_sport == sport &&
768 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
769 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
770 				connp->conn_c2s_atime = ddi_get_lbolt64();
771 				connp->conn_c2s_pkt_cnt++;
772 				*rule_cache = connp->conn_rule_cache;
773 				*ip_sum = connp->conn_c2s_ip_sum;
774 				*tp_sum = connp->conn_c2s_tp_sum;
775 				ret = B_TRUE;
776 				break;
777 			}
778 		}
779 	} else {
780 		hash = ilbs->ilbs_s2c_conn_hash;
781 		mutex_enter(&hash[i].ilb_conn_hash_lock);
782 		for (connp = hash[i].ilb_connp; connp != NULL;
783 		    connp = connp->conn_s2c_next) {
784 			if (connp->conn_l4 == l4 &&
785 			    connp->conn_s2c_dport == dport &&
786 			    connp->conn_s2c_sport == sport &&
787 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
788 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
789 				connp->conn_s2c_atime = ddi_get_lbolt64();
790 				connp->conn_s2c_pkt_cnt++;
791 				*rule_cache = connp->conn_rule_cache;
792 				*ip_sum = connp->conn_s2c_ip_sum;
793 				*tp_sum = connp->conn_s2c_tp_sum;
794 				ret = B_TRUE;
795 				break;
796 			}
797 		}
798 	}
799 	if (ret) {
800 		ILB_S_KSTAT(connp->conn_server, pkt_processed);
801 		ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
802 		    pkt_len);
803 
804 		switch (l4) {
805 		case (IPPROTO_TCP):
806 			tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
807 			    c2s);
808 			if (!tcp_alive) {
809 				connp->conn_gc = B_TRUE;
810 			}
811 			break;
812 		default:
813 			break;
814 		}
815 	}
816 	mutex_exit(&hash[i].ilb_conn_hash_lock);
817 
818 	return (ret);
819 }
820 
821 /*
822  * To check if a give packet matches an existing conn hash entry.  If it
823  * does, return the information about this entry so that the caller can
824  * do the proper NAT.
825  */
826 boolean_t
827 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
828     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
829     uint32_t pkt_len, in6_addr_t *lb_dst)
830 {
831 	ilb_rule_info_t rule_cache;
832 	uint32_t adj_ip_sum, adj_tp_sum;
833 	boolean_t ret;
834 
835 	/* Check the incoming hash table. */
836 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
837 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
838 		switch (rule_cache.topo) {
839 		case ILB_TOPO_IMPL_NAT:
840 			*lb_dst = rule_cache.info.nat_dst;
841 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
842 			    adj_ip_sum, adj_tp_sum, B_TRUE);
843 			ret = B_TRUE;
844 			break;
845 		case ILB_TOPO_IMPL_HALF_NAT:
846 			*lb_dst = rule_cache.info.nat_dst;
847 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
848 			    adj_ip_sum, adj_tp_sum, B_TRUE);
849 			ret = B_TRUE;
850 			break;
851 		default:
852 			ret = B_FALSE;
853 			break;
854 		}
855 		return (ret);
856 	}
857 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
858 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
859 		switch (rule_cache.topo) {
860 		case ILB_TOPO_IMPL_NAT:
861 			*lb_dst = rule_cache.info.src;
862 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
863 			    adj_ip_sum, adj_tp_sum, B_FALSE);
864 			ret = B_TRUE;
865 			break;
866 		case ILB_TOPO_IMPL_HALF_NAT:
867 			*lb_dst = *dst;
868 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
869 			    adj_ip_sum, adj_tp_sum, B_FALSE);
870 			ret = B_TRUE;
871 			break;
872 		default:
873 			ret = B_FALSE;
874 			break;
875 		}
876 		return (ret);
877 	}
878 
879 	return (B_FALSE);
880 }
881 
/*
 * To check if an ICMP packet belongs to a connection in one of the conn
 * hash entries.
 *
 * Arguments:
 *   ilbs	ILB stack whose c2s conn table is searched
 *   mp		message block holding the packet; used for bounds checking
 *		and passed on to the ICMP NAT routines
 *   l3		IPPROTO_IP or IPPROTO_IPV6
 *   out_iph	outer IP header of the ICMP packet
 *   icmph	ICMP (or ICMPv6) header; the embedded IP header is taken
 *		to start immediately after it
 *   lb_dst	out: set to the NAT destination of the matching conn
 *
 * The embedded packet's addresses and ports are looked up in the c2s
 * table with source and destination swapped.  Only embedded TCP and UDP
 * packets are handled.
 *
 * Returns B_TRUE if a matching conn with a NAT topology was found and the
 * packet was handed to the ICMP NAT routine, B_FALSE otherwise.
 */
boolean_t
ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
    void *icmph, in6_addr_t *lb_dst)
{
	ilb_conn_hash_t *hash;
	ipha_t *in_iph4;
	ip6_t *in_iph6;
	icmph_t *icmph4;
	icmp6_t *icmph6;
	in6_addr_t *in_src_p, *in_dst_p;
	in_port_t *sport, *dport;
	int l4;
	uint_t i;
	ilb_conn_t *connp;
	ilb_rule_info_t rule_cache;
	uint32_t adj_ip_sum;
	boolean_t full_nat;

	if (l3 == IPPROTO_IP) {
		in6_addr_t in_src, in_dst;

		icmph4 = (icmph_t *)icmph;
		/* The embedded IP header follows the ICMP header. */
		in_iph4 = (ipha_t *)&icmph4[1];

		/* The embedded headers must lie within the mblk. */
		if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
			return (B_FALSE);
		}

		/* The conn table stores IPv4 addresses as v4-mapped. */
		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
		in_src_p = &in_src;
		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
		in_dst_p = &in_dst;

		l4 = in_iph4->ipha_protocol;
		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
			return (B_FALSE);

		/* The ports are the first two 16-bit fields after the IP hdr */
		sport = (in_port_t *)((char *)in_iph4 +
		    IPH_HDR_LENGTH(in_iph4));
		dport = sport + 1;

		DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
		    in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
		    ntohs(*sport), uint16_t, ntohs(*dport));
	} else {
		ASSERT(l3 == IPPROTO_IPV6);

		icmph6 = (icmp6_t *)icmph;
		/* The embedded IPv6 header follows the ICMPv6 header. */
		in_iph6 = (ip6_t *)&icmph6[1];
		in_src_p = &in_iph6->ip6_src;
		in_dst_p = &in_iph6->ip6_dst;

		/* The embedded headers must lie within the mblk. */
		if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
			return (B_FALSE);
		}

		l4 = in_iph6->ip6_nxt;
		/* We don't go deep inside an IPv6 packet yet. */
		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
			return (B_FALSE);

		sport = (in_port_t *)&in_iph6[1];
		dport = sport + 1;

		DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
		    &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
		    uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
	}

	/*
	 * Look the embedded packet up in the c2s table with source and
	 * destination swapped.
	 */
	i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
	    (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
	    ilbs->ilbs_conn_hash_size);
	hash = ilbs->ilbs_c2s_conn_hash;

	mutex_enter(&hash[i].ilb_conn_hash_lock);
	for (connp = hash[i].ilb_connp; connp != NULL;
	    connp = connp->conn_c2s_next) {
		if (connp->conn_l4 == l4 &&
		    connp->conn_c2s_dport == *sport &&
		    connp->conn_c2s_sport == *dport &&
		    IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
		    IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
			connp->conn_c2s_atime = ddi_get_lbolt64();
			connp->conn_c2s_pkt_cnt++;
			/* Copy what is needed while the lock is held. */
			rule_cache = connp->conn_rule_cache;
			adj_ip_sum = connp->conn_c2s_ip_sum;
			break;
		}
	}
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	/* connp is only tested against NULL here, never dereferenced. */
	if (connp == NULL) {
		DTRACE_PROBE(ilb__chk__icmp__conn__failed);
		return (B_FALSE);
	}

	switch (rule_cache.topo) {
	case ILB_TOPO_IMPL_NAT:
		full_nat = B_TRUE;
		break;
	case ILB_TOPO_IMPL_HALF_NAT:
		full_nat = B_FALSE;
		break;
	default:
		return (B_FALSE);
	}

	*lb_dst = rule_cache.info.nat_dst;
	if (l3 == IPPROTO_IP) {
		ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
		    &rule_cache.info, adj_ip_sum, full_nat);
	} else {
		ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
		    &rule_cache.info, full_nat);
	}
	return (B_TRUE);
}
1005 
/*
 * This routine sends up the conn hash table to user land.  Note that the
 * request is an ioctl, hence we cannot really differentiate requests
 * from different clients.  There is no context shared between different
 * ioctls.  Here we make the assumption that the user land ilbd will
 * only allow one client to show the conn hash table at any time.
 * Otherwise, the results will be "very" inconsistent.
 *
 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
 * to read from the beginning of the table.  After a certain number of
 * entries are reported, the kernel remembers the position of the last
 * returned entry.  When the next ioctl comes in with the ILB_LIST_CONT flag,
 * it will return entries starting from where it left off.  When the end
 * of the table is reached, a flag (ILB_LIST_END) is set to tell the client
 * that there are no more entries.
 *
 * It is assumed that the caller has checked the size of nat so that it
 * can hold num entries.
 */
1025 /* ARGSUSED */
1026 int
1027 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1028     uint32_t *num, uint32_t *flags)
1029 {
1030 	ilb_conn_hash_t *hash;
1031 	ilb_conn_t *cur_connp;
1032 	uint32_t i, j;
1033 	int ret = 0;
1034 
1035 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1036 	while (ilbs->ilbs_conn_list_busy) {
1037 		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1038 		    &ilbs->ilbs_conn_list_lock) == 0) {
1039 			mutex_exit(&ilbs->ilbs_conn_list_lock);
1040 			return (EINTR);
1041 		}
1042 	}
1043 	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1044 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1045 		mutex_exit(&ilbs->ilbs_conn_list_lock);
1046 		*num = 0;
1047 		*flags |= ILB_LIST_END;
1048 		return (0);
1049 	}
1050 	ilbs->ilbs_conn_list_busy = B_TRUE;
1051 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1052 
1053 	if (*flags & ILB_LIST_BEGIN) {
1054 		i = 0;
1055 		mutex_enter(&hash[0].ilb_conn_hash_lock);
1056 		cur_connp = hash[0].ilb_connp;
1057 	} else if (*flags & ILB_LIST_CONT) {
1058 		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1059 			*num = 0;
1060 			*flags |= ILB_LIST_END;
1061 			goto done;
1062 		}
1063 		i = ilbs->ilbs_conn_list_cur;
1064 		mutex_enter(&hash[i].ilb_conn_hash_lock);
1065 		cur_connp = ilbs->ilbs_conn_list_connp;
1066 	} else {
1067 		ret = EINVAL;
1068 		goto done;
1069 	}
1070 
1071 	j = 0;
1072 	while (j < *num) {
1073 		if (cur_connp == NULL) {
1074 			mutex_exit(&hash[i].ilb_conn_hash_lock);
1075 			if (++i == ilbs->ilbs_conn_hash_size) {
1076 				*flags |= ILB_LIST_END;
1077 				break;
1078 			}
1079 			mutex_enter(&hash[i].ilb_conn_hash_lock);
1080 			cur_connp = hash[i].ilb_connp;
1081 			continue;
1082 		}
1083 		nat[j].proto = cur_connp->conn_l4;
1084 
1085 		nat[j].in_global = cur_connp->conn_c2s_daddr;
1086 		nat[j].in_global_port = cur_connp->conn_c2s_dport;
1087 		nat[j].out_global = cur_connp->conn_c2s_saddr;
1088 		nat[j].out_global_port = cur_connp->conn_c2s_sport;
1089 
1090 		nat[j].in_local = cur_connp->conn_s2c_saddr;
1091 		nat[j].in_local_port = cur_connp->conn_s2c_sport;
1092 		nat[j].out_local = cur_connp->conn_s2c_daddr;
1093 		nat[j].out_local_port = cur_connp->conn_s2c_dport;
1094 
1095 		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1096 		nat[j].last_access_time =
1097 		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1098 
1099 		/*
1100 		 * The conn_s2c_pkt_cnt may not be accurate since we are not
1101 		 * holding the s2c hash lock.
1102 		 */
1103 		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1104 		    cur_connp->conn_s2c_pkt_cnt;
1105 		j++;
1106 
1107 		cur_connp = cur_connp->conn_c2s_next;
1108 	}
1109 	ilbs->ilbs_conn_list_connp = cur_connp;
1110 	if (j == *num)
1111 		mutex_exit(&hash[i].ilb_conn_hash_lock);
1112 
1113 	ilbs->ilbs_conn_list_cur = i;
1114 
1115 	*num = j;
1116 done:
1117 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1118 	ilbs->ilbs_conn_list_busy = B_FALSE;
1119 	cv_signal(&ilbs->ilbs_conn_list_cv);
1120 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1121 
1122 	return (ret);
1123 }
1124 
1125 
1126 /*
1127  * Stickiness (persistence) handling routines.
1128  */
1129 
1130 
1131 static void
1132 ilb_sticky_cache_init(void)
1133 {
1134 	ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1135 	    sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1136 	    ilb_kmem_flags);
1137 }
1138 
1139 void
1140 ilb_sticky_cache_fini(void)
1141 {
1142 	if (ilb_sticky_cache != NULL) {
1143 		kmem_cache_destroy(ilb_sticky_cache);
1144 		ilb_sticky_cache = NULL;
1145 	}
1146 }
1147 
1148 void
1149 ilb_sticky_refrele(ilb_sticky_t *s)
1150 {
1151 	ILB_STICKY_REFRELE(s);
1152 }
1153 
1154 static ilb_sticky_t *
1155 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1156 {
1157 	ilb_sticky_t *s;
1158 
1159 	ASSERT(mutex_owned(&hash->sticky_lock));
1160 
1161 	for (s = list_head(&hash->sticky_head); s != NULL;
1162 	    s = list_next(&hash->sticky_head, s)) {
1163 		if (s->rule_instance == rule->ir_ks_instance) {
1164 			if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1165 				return (s);
1166 		}
1167 	}
1168 	return (NULL);
1169 }
1170 
1171 static ilb_sticky_t *
1172 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1173     in6_addr_t *src)
1174 {
1175 	ilb_sticky_t *s;
1176 
1177 	ASSERT(mutex_owned(&hash->sticky_lock));
1178 
1179 	if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1180 		return (NULL);
1181 
1182 	/*
1183 	 * The rule instance is for handling the scenario when the same
1184 	 * client talks to different rules at the same time.  Stickiness
1185 	 * is per rule so we can use the rule instance to differentiate
1186 	 * the client's request.
1187 	 */
1188 	s->rule_instance = rule->ir_ks_instance;
1189 	/*
1190 	 * Copy the rule name for listing all sticky cache entry.  ir_name
1191 	 * is guaranteed to be NULL terminated.
1192 	 */
1193 	(void) strcpy(s->rule_name, rule->ir_name);
1194 	s->server = server;
1195 
1196 	/*
1197 	 * Grab a ref cnt on the server so that it won't go away while
1198 	 * it is still in the sticky table.
1199 	 */
1200 	ILB_SERVER_REFHOLD(server);
1201 	s->src = *src;
1202 	s->expiry = rule->ir_sticky_expiry;
1203 	s->refcnt = 1;
1204 	s->hash = hash;
1205 
1206 	/*
1207 	 * There is no need to set atime here since the refcnt is not
1208 	 * zero.  A sticky entry is removed only when the refcnt is
1209 	 * zero.  But just set it here for debugging purpose.  The
1210 	 * atime is set when a refrele is done on a sticky entry.
1211 	 */
1212 	s->atime = ddi_get_lbolt64();
1213 
1214 	list_insert_head(&hash->sticky_head, s);
1215 	hash->sticky_cnt++;
1216 	return (s);
1217 }
1218 
1219 /*
1220  * This routine checks if there is an existing sticky entry which matches
1221  * a given packet.  If there is one, return it.  If there is not, create
1222  * a sticky entry using the packet's info.
1223  */
1224 ilb_server_t *
1225 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1226     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1227 {
1228 	int i;
1229 	ilb_sticky_hash_t *hash;
1230 	ilb_sticky_t *s;
1231 
1232 	ASSERT(server != NULL);
1233 
1234 	*res = NULL;
1235 
1236 	i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1237 	    (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1238 	hash = &ilbs->ilbs_sticky_hash[i];
1239 
1240 	/* First check if there is already an entry. */
1241 	mutex_enter(&hash->sticky_lock);
1242 	s = ilb_sticky_lookup(hash, rule, src);
1243 
1244 	/* No sticky entry, add one. */
1245 	if (s == NULL) {
1246 add_new_entry:
1247 		s = ilb_sticky_add(hash, rule, server, src);
1248 		if (s == NULL) {
1249 			mutex_exit(&hash->sticky_lock);
1250 			return (NULL);
1251 		}
1252 		/*
1253 		 * Find a source for this server.  All subseqent requests from
1254 		 * the same client matching this sticky entry will use this
1255 		 * source address in doing NAT.  The current algorithm is
1256 		 * simple, rotate the source address.  Note that the
1257 		 * source address array does not change after it's created, so
1258 		 * it is OK to just increment the cur index.
1259 		 */
1260 		if (server->iser_nat_src != NULL) {
1261 			/* It is a hint, does not need to be atomic. */
1262 			*src_ent_idx = (server->iser_nat_src->cur++ %
1263 			    server->iser_nat_src->num_src);
1264 			s->nat_src_idx = *src_ent_idx;
1265 		}
1266 		mutex_exit(&hash->sticky_lock);
1267 		*res = s;
1268 		return (server);
1269 	}
1270 
1271 	/*
1272 	 * We don't hold any lock accessing iser_enabled.  Refer to the
1273 	 * comment in ilb_server_add() about iser_lock.
1274 	 */
1275 	if (!s->server->iser_enabled) {
1276 		/*
1277 		 * s->server == server can only happen if there is a race in
1278 		 * toggling the iser_enabled flag (we don't hold a lock doing
1279 		 * that) so that the load balance algorithm still returns a
1280 		 * disabled server.  In this case, just drop the packet...
1281 		 */
1282 		if (s->server == server) {
1283 			mutex_exit(&hash->sticky_lock);
1284 			return (NULL);
1285 		}
1286 
1287 		/*
1288 		 * The old server is disabled and there is a new server, use
1289 		 * the new one to create a sticky entry.  Since we will
1290 		 * add the entry at the beginning, subsequent lookup will
1291 		 * find this new entry instead of the old one.
1292 		 */
1293 		goto add_new_entry;
1294 	}
1295 
1296 	s->refcnt++;
1297 	*res = s;
1298 	mutex_exit(&hash->sticky_lock);
1299 	if (server->iser_nat_src != NULL)
1300 		*src_ent_idx = s->nat_src_idx;
1301 	return (s->server);
1302 }
1303 
1304 static void
1305 ilb_sticky_cleanup(void *arg)
1306 {
1307 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1308 	uint32_t i;
1309 	ilb_stack_t *ilbs;
1310 	ilb_sticky_hash_t *hash;
1311 	ilb_sticky_t *s, *nxt_s;
1312 	int64_t now, expiry;
1313 
1314 	ilbs = timer->ilbs;
1315 	hash = ilbs->ilbs_sticky_hash;
1316 	ASSERT(hash != NULL);
1317 
1318 	now = ddi_get_lbolt64();
1319 	for (i = timer->start; i < timer->end; i++) {
1320 		mutex_enter(&hash[i].sticky_lock);
1321 		for (s = list_head(&hash[i].sticky_head); s != NULL;
1322 		    s = nxt_s) {
1323 			nxt_s = list_next(&hash[i].sticky_head, s);
1324 			if (s->refcnt != 0)
1325 				continue;
1326 			expiry = now - SEC_TO_TICK(s->expiry);
1327 			if (s->atime < expiry) {
1328 				ILB_SERVER_REFRELE(s->server);
1329 				list_remove(&hash[i].sticky_head, s);
1330 				kmem_cache_free(ilb_sticky_cache, s);
1331 				hash[i].sticky_cnt--;
1332 			}
1333 		}
1334 		mutex_exit(&hash[i].sticky_lock);
1335 	}
1336 }
1337 
1338 static void
1339 ilb_sticky_timer(void *arg)
1340 {
1341 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1342 
1343 	(void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1344 	    ilb_sticky_cleanup, arg, TQ_SLEEP);
1345 	mutex_enter(&timer->tid_lock);
1346 	if (timer->tid == 0) {
1347 		mutex_exit(&timer->tid_lock);
1348 	} else {
1349 		timer->tid = timeout(ilb_sticky_timer, arg,
1350 		    SEC_TO_TICK(ilb_sticky_timeout));
1351 		mutex_exit(&timer->tid_lock);
1352 	}
1353 }
1354 
1355 void
1356 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1357 {
1358 	extern pri_t minclsyspri;
1359 	int i, part;
1360 	char tq_name[TASKQ_NAMELEN];
1361 	ilb_timer_t *tm;
1362 
1363 	if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1364 		for (i = 0; i < 31; i++) {
1365 			if (ilbs->ilbs_sticky_hash_size < (1 << i))
1366 				break;
1367 		}
1368 		ilbs->ilbs_sticky_hash_size = 1 << i;
1369 	}
1370 
1371 	ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1372 	    ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1373 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1374 		mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1375 		    MUTEX_DEFAULT, NULL);
1376 		list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1377 		    sizeof (ilb_sticky_t),
1378 		    offsetof(ilb_sticky_t, list));
1379 	}
1380 
1381 	if (ilb_sticky_cache == NULL)
1382 		ilb_sticky_cache_init();
1383 
1384 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1385 	    (void *)ilbs->ilbs_netstack);
1386 	ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1387 	ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1388 	    ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1389 	    ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1390 
1391 	ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1392 	ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1393 	    ilb_sticky_timer_size, KM_SLEEP);
1394 	part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1395 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1396 		tm = ilbs->ilbs_sticky_timer_list + i;
1397 		tm->start = i * part;
1398 		tm->end = i * part + part;
1399 		if (tm->end > ilbs->ilbs_sticky_hash_size)
1400 			tm->end = ilbs->ilbs_sticky_hash_size;
1401 		tm->ilbs = ilbs;
1402 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1403 		/* Spread out the starting execution time of all the timers. */
1404 		tm->tid = timeout(ilb_sticky_timer, tm,
1405 		    SEC_TO_TICK(ilb_sticky_timeout + i));
1406 	}
1407 }
1408 
1409 void
1410 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1411 {
1412 	int i;
1413 	ilb_sticky_t *s;
1414 
1415 	if (ilbs->ilbs_sticky_hash == NULL)
1416 		return;
1417 
1418 	/* Stop all the timers first. */
1419 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1420 		timeout_id_t tid;
1421 
1422 		/* Setting tid to 0 tells the timer handler not to restart. */
1423 		mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1424 		tid = ilbs->ilbs_sticky_timer_list[i].tid;
1425 		ilbs->ilbs_sticky_timer_list[i].tid = 0;
1426 		mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1427 		(void) untimeout(tid);
1428 	}
1429 	kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1430 	    ilb_sticky_timer_size);
1431 	taskq_destroy(ilbs->ilbs_sticky_taskq);
1432 	ilbs->ilbs_sticky_taskq = NULL;
1433 
1434 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1435 		while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1436 		    != NULL) {
1437 			list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1438 			ILB_SERVER_REFRELE(s->server);
1439 			kmem_free(s, sizeof (ilb_sticky_t));
1440 		}
1441 	}
1442 	kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1443 	    sizeof (ilb_sticky_hash_t));
1444 }
1445 
1446 /*
1447  * This routine sends up the sticky hash table to user land.  Refer to
1448  * the comments before ilb_list_nat().  Both routines assume similar
1449  * conditions.
1450  *
1451  * It is assumed that the caller has checked the size of st so that it
1452  * can hold num entries.
1453  */
1454 /* ARGSUSED */
1455 int
1456 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1457     uint32_t *num, uint32_t *flags)
1458 {
1459 	ilb_sticky_hash_t *hash;
1460 	ilb_sticky_t *curp;
1461 	uint32_t i, j;
1462 	int ret = 0;
1463 
1464 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1465 	while (ilbs->ilbs_sticky_list_busy) {
1466 		if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1467 		    &ilbs->ilbs_sticky_list_lock) == 0) {
1468 			mutex_exit(&ilbs->ilbs_sticky_list_lock);
1469 			return (EINTR);
1470 		}
1471 	}
1472 	if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1473 		mutex_exit(&ilbs->ilbs_sticky_list_lock);
1474 		*num = 0;
1475 		*flags |= ILB_LIST_END;
1476 		return (0);
1477 	}
1478 	ilbs->ilbs_sticky_list_busy = B_TRUE;
1479 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1480 
1481 	if (*flags & ILB_LIST_BEGIN) {
1482 		i = 0;
1483 		mutex_enter(&hash[0].sticky_lock);
1484 		curp = list_head(&hash[0].sticky_head);
1485 	} else if (*flags & ILB_LIST_CONT) {
1486 		if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1487 			*num = 0;
1488 			*flags |= ILB_LIST_END;
1489 			goto done;
1490 		}
1491 		i = ilbs->ilbs_sticky_list_cur;
1492 		mutex_enter(&hash[i].sticky_lock);
1493 		curp = ilbs->ilbs_sticky_list_curp;
1494 	} else {
1495 		ret = EINVAL;
1496 		goto done;
1497 	}
1498 
1499 	j = 0;
1500 	while (j < *num) {
1501 		if (curp == NULL) {
1502 			mutex_exit(&hash[i].sticky_lock);
1503 			if (++i == ilbs->ilbs_sticky_hash_size) {
1504 				*flags |= ILB_LIST_END;
1505 				break;
1506 			}
1507 			mutex_enter(&hash[i].sticky_lock);
1508 			curp = list_head(&hash[i].sticky_head);
1509 			continue;
1510 		}
1511 		(void) strcpy(st[j].rule_name, curp->rule_name);
1512 		st[j].req_addr = curp->src;
1513 		st[j].srv_addr = curp->server->iser_addr_v6;
1514 		st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1515 		j++;
1516 		curp = list_next(&hash[i].sticky_head, curp);
1517 	}
1518 	ilbs->ilbs_sticky_list_curp = curp;
1519 	if (j == *num)
1520 		mutex_exit(&hash[i].sticky_lock);
1521 
1522 	ilbs->ilbs_sticky_list_cur = i;
1523 
1524 	*num = j;
1525 done:
1526 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1527 	ilbs->ilbs_sticky_list_busy = B_FALSE;
1528 	cv_signal(&ilbs->ilbs_sticky_list_cv);
1529 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1530 
1531 	return (ret);
1532 }
1533