xref: /illumos-gate/usr/src/uts/common/inet/ilb/ilb_conn.c (revision f9c7ecc9013918bf789e154e2ed5f2e8fb6ce24c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2014 Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/time.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/sdt.h>
35 #include <sys/atomic.h>
36 #include <netinet/in.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <inet/tcp.h>
40 #include <inet/udp_impl.h>
41 #include <inet/ilb.h>
42 
43 #include "ilb_stack.h"
44 #include "ilb_impl.h"
45 #include "ilb_conn.h"
46 #include "ilb_nat.h"
47 
48 /*
49  * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
50  *
51  * start: starting index into the hash table to do gc
52  * end: ending index into the hash table to do gc
53  * ilbs: pointer to the ilb_stack_t of the IP stack
54  * tid_lock: mutex to protect the timer id.
55  * tid: timer id of the timer
56  */
57 typedef struct ilb_timer_s {
58 	uint32_t	start;
59 	uint32_t	end;
60 	ilb_stack_t	*ilbs;
61 	kmutex_t	tid_lock;
62 	timeout_id_t	tid;
63 } ilb_timer_t;
64 
65 /* Hash macro for finding the index to the conn hash table */
66 #define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
67 	(((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +		\
68 	(*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +		\
69 	(*((saddr) + 1) ^ *((daddr) + 1)) * 37 +		\
70 	(*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &	\
71 	((hash_size) - 1))
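/*
 * The multipliers above are powers of 37 (37^3 = 50653, 37^2 = 1369), so
 * the macro evaluates a small base-37 polynomial over the low-order 32
 * bits of the two addresses (XORed byte by byte) and then mixes in the
 * ports.  The trailing mask assumes hash_size is a power of 2, which
 * ilb_conn_hash_init() guarantees by rounding the configured size up.
 */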
72 
73 /* Kmem cache for the conn hash entry */
74 static struct kmem_cache *ilb_conn_cache = NULL;
75 
76 /*
77  * There are 60 timers running to do conn cache garbage collection.  Each
78  * gc thread is responsible for 1/60 of the conn hash table.
79  */
80 static int ilb_conn_timer_size = 60;
81 
82 /* Each of the above gc timers wakes up every 15s to do the gc. */
83 static int ilb_conn_cache_timeout = 15;
84 
85 #define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
86 	(((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +		\
87 	(*((saddr) + 2) ^ ((rule) >> 16)) * 961 +		\
88 	(*((saddr) + 1) ^ ((rule) >> 8)) * 31 +			\
89 	(*(saddr) ^ (rule))) & ((hash_size) - 1))
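/*
 * As with ILB_CONN_HASH above, the multipliers in ILB_STICKY_HASH are
 * powers of a small prime (31^3 = 29791, 31^2 = 961).  The low-order 32
 * bits of the client address are mixed with the bytes of the rule value
 * (the rule pointer, cast down by ilb_sticky_find_add()), and hash_size
 * must again be a power of 2, which ilb_sticky_hash_init() ensures.
 */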
90 
91 static struct kmem_cache *ilb_sticky_cache = NULL;
92 
93 /*
94  * There are 60 timers running to do sticky cache garbage collection.  Each
95  * gc thread is responsible for 1/60 of the sticky hash table.
96  */
97 static int ilb_sticky_timer_size = 60;
98 
99 /* Each of the above gc timers wakes up every 15s to do the gc. */
100 static int ilb_sticky_timeout = 15;
101 
102 #define	ILB_STICKY_REFRELE(s)			\
103 {						\
104 	mutex_enter(&(s)->hash->sticky_lock);	\
105 	(s)->refcnt--;				\
106 	(s)->atime = ddi_get_lbolt64();		\
107 	mutex_exit(&(s)->hash->sticky_lock);	\
108 }
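/*
 * A refrele only drops the count and stamps the last access time under
 * the bucket's sticky_lock; the entry itself is freed later by
 * ilb_sticky_cleanup() once refcnt has dropped to zero and atime is
 * older than the sticky expiry.  For example, when a conn hash entry
 * holding a sticky reference is removed, ilb_conn_remove() does:
 *
 *	if (connp->conn_sticky != NULL)
 *		ILB_STICKY_REFRELE(connp->conn_sticky);
 */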
109 
110 
111 static void
112 ilb_conn_cache_init(void)
113 {
114 	ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
115 	    sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
116 	    ilb_kmem_flags);
117 }
118 
119 void
120 ilb_conn_cache_fini(void)
121 {
122 	if (ilb_conn_cache != NULL) {
123 		kmem_cache_destroy(ilb_conn_cache);
124 		ilb_conn_cache = NULL;
125 	}
126 }
127 
128 static void
129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
130 {
131 	ilb_conn_hash_t *hash;
132 	ilb_conn_t **next, **prev;
133 	ilb_conn_t **next_prev, **prev_next;
134 
135 	next_prev = NULL;
136 	prev_next = NULL;
137 
138 	if (c2s) {
139 		hash = connp->conn_c2s_hash;
140 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
141 		next = &connp->conn_c2s_next;
142 		prev = &connp->conn_c2s_prev;
143 		if (*next != NULL)
144 			next_prev = &(*next)->conn_c2s_prev;
145 		if (*prev != NULL)
146 			prev_next = &(*prev)->conn_c2s_next;
147 	} else {
148 		hash = connp->conn_s2c_hash;
149 		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
150 		next = &connp->conn_s2c_next;
151 		prev = &connp->conn_s2c_prev;
152 		if (*next != NULL)
153 			next_prev = &(*next)->conn_s2c_prev;
154 		if (*prev != NULL)
155 			prev_next = &(*prev)->conn_s2c_next;
156 	}
157 
158 	if (hash->ilb_connp == connp) {
159 		hash->ilb_connp = *next;
160 		if (*next != NULL)
161 			*next_prev = NULL;
162 	} else {
163 		if (*prev != NULL)
164 			*prev_next = *next;
165 		if (*next != NULL)
166 			*next_prev = *prev;
167 	}
168 	ASSERT(hash->ilb_conn_cnt > 0);
169 	hash->ilb_conn_cnt--;
170 
171 	*next = NULL;
172 	*prev = NULL;
173 }
174 
175 static void
176 ilb_conn_remove(ilb_conn_t *connp)
177 {
178 	ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
179 	ilb_conn_remove_common(connp, B_TRUE);
180 	ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
181 	ilb_conn_remove_common(connp, B_FALSE);
182 
183 	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
184 		in_port_t port;
185 
186 		port = ntohs(connp->conn_rule_cache.info.nat_sport);
187 		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
188 		    (void *)(uintptr_t)port, 1);
189 	}
190 
191 	if (connp->conn_sticky != NULL)
192 		ILB_STICKY_REFRELE(connp->conn_sticky);
193 	ILB_SERVER_REFRELE(connp->conn_server);
194 	kmem_cache_free(ilb_conn_cache, connp);
195 }
196 
197 /*
198  * Routine to do periodic garbage collection of conn hash entries.  When
199  * a conn hash timer fires, it dispatches a taskq to call this function
200  * to do the gc.  Note that each taskq is responsible for a portion of
201  * the table.  The portion is stored in timer->start, timer->end.
202  */
203 static void
204 ilb_conn_cleanup(void *arg)
205 {
206 	ilb_timer_t *timer = (ilb_timer_t *)arg;
207 	uint32_t i;
208 	ilb_stack_t *ilbs;
209 	ilb_conn_hash_t *c2s_hash, *s2c_hash;
210 	ilb_conn_t *connp, *nxt_connp;
211 	int64_t now;
212 	int64_t expiry;
213 	boolean_t die_now;
214 
215 	ilbs = timer->ilbs;
216 	c2s_hash = ilbs->ilbs_c2s_conn_hash;
217 	ASSERT(c2s_hash != NULL);
218 
219 	now = ddi_get_lbolt64();
220 	for (i = timer->start; i < timer->end; i++) {
221 		mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
222 		if ((connp = c2s_hash[i].ilb_connp) == NULL) {
223 			ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
224 			mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
225 			continue;
226 		}
227 		do {
228 			ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
229 			ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
230 			nxt_connp = connp->conn_c2s_next;
231 			expiry = now - SEC_TO_TICK(connp->conn_expiry);
232 			if (connp->conn_server->iser_die_time != 0 &&
233 			    connp->conn_server->iser_die_time < now)
234 				die_now = B_TRUE;
235 			else
236 				die_now = B_FALSE;
237 			s2c_hash = connp->conn_s2c_hash;
238 			mutex_enter(&s2c_hash->ilb_conn_hash_lock);
239 
240 			if (connp->conn_gc || die_now ||
241 			    (connp->conn_c2s_atime < expiry &&
242 			    connp->conn_s2c_atime < expiry)) {
243 				/* Need to update the nat list cur_connp */
244 				if (connp == ilbs->ilbs_conn_list_connp) {
245 					ilbs->ilbs_conn_list_connp =
246 					    connp->conn_c2s_next;
247 				}
248 				ilb_conn_remove(connp);
249 				goto nxt_connp;
250 			}
251 
252 			if (connp->conn_l4 != IPPROTO_TCP)
253 				goto nxt_connp;
254 
255 			/* Update and check TCP related conn info */
256 			if (connp->conn_c2s_tcp_fin_sent &&
257 			    SEQ_GT(connp->conn_s2c_tcp_ack,
258 			    connp->conn_c2s_tcp_fss)) {
259 				connp->conn_c2s_tcp_fin_acked = B_TRUE;
260 			}
261 			if (connp->conn_s2c_tcp_fin_sent &&
262 			    SEQ_GT(connp->conn_c2s_tcp_ack,
263 			    connp->conn_s2c_tcp_fss)) {
264 				connp->conn_s2c_tcp_fin_acked = B_TRUE;
265 			}
266 			if (connp->conn_c2s_tcp_fin_acked &&
267 			    connp->conn_s2c_tcp_fin_acked) {
268 				ilb_conn_remove(connp);
269 			}
270 nxt_connp:
271 			mutex_exit(&s2c_hash->ilb_conn_hash_lock);
272 			connp = nxt_connp;
273 		} while (connp != NULL);
274 		mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
275 	}
276 }
277 
278 /* Conn hash timer routine.  It dispatches a taskq and restarts the timer */
279 static void
280 ilb_conn_timer(void *arg)
281 {
282 	ilb_timer_t *timer = (ilb_timer_t *)arg;
283 
284 	(void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
285 	    arg, TQ_SLEEP);
286 	mutex_enter(&timer->tid_lock);
287 	if (timer->tid == 0) {
288 		mutex_exit(&timer->tid_lock);
289 	} else {
290 		timer->tid = timeout(ilb_conn_timer, arg,
291 		    SEC_TO_TICK(ilb_conn_cache_timeout));
292 		mutex_exit(&timer->tid_lock);
293 	}
294 }
295 
296 void
297 ilb_conn_hash_init(ilb_stack_t *ilbs)
298 {
299 	extern pri_t minclsyspri;
300 	int i, part;
301 	ilb_timer_t *tm;
302 	char tq_name[TASKQ_NAMELEN];
303 
304 	/*
305 	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
306 	 * the next power of 2.
307 	 */
308 	if (!ISP2(ilbs->ilbs_conn_hash_size)) {
309 		for (i = 0; i < 31; i++) {
310 			if (ilbs->ilbs_conn_hash_size < (1 << i))
311 				break;
312 		}
313 		ilbs->ilbs_conn_hash_size = 1 << i;
314 	}
315 
316 	/*
317 	 * Can sleep since this should be called when a rule is being added,
318 	 * hence we are not in interrupt context.
319 	 */
320 	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
321 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
322 	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
323 	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
324 
325 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
326 		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
327 		    NULL, MUTEX_DEFAULT, NULL);
328 	}
329 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
330 		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
331 		    NULL, MUTEX_DEFAULT, NULL);
332 	}
333 
334 	if (ilb_conn_cache == NULL)
335 		ilb_conn_cache_init();
336 
337 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
338 	    (void *)ilbs->ilbs_netstack);
339 	ASSERT(ilbs->ilbs_conn_taskq == NULL);
340 	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
341 	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
342 	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
343 
344 	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
345 	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
346 	    ilb_conn_timer_size, KM_SLEEP);
347 
348 	/*
349 	 * The hash table is divided into equal partitions for those timers
350 	 * to do garbage collection.
351 	 */
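	/*
	 * For example, with the default of 60 timers and a conn hash size
	 * of 4096 buckets, part is 4096 / 60 + 1 = 69, so timer i covers
	 * buckets [69 * i, 69 * i + 69), with the final range clamped to
	 * the table size (timer 59 covers [4071, 4096)).
	 */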
352 	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
353 	for (i = 0; i < ilb_conn_timer_size; i++) {
354 		tm = ilbs->ilbs_conn_timer_list + i;
355 		tm->start = i * part;
356 		tm->end = i * part + part;
357 		if (tm->end > ilbs->ilbs_conn_hash_size)
358 			tm->end = ilbs->ilbs_conn_hash_size;
359 		tm->ilbs = ilbs;
360 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
361 		/* Spread out the starting execution time of all the timers. */
362 		tm->tid = timeout(ilb_conn_timer, tm,
363 		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
364 	}
365 }
366 
367 void
368 ilb_conn_hash_fini(ilb_stack_t *ilbs)
369 {
370 	uint32_t i;
371 	ilb_conn_t *connp;
372 	ilb_conn_hash_t *hash;
373 
374 	if (ilbs->ilbs_c2s_conn_hash == NULL) {
375 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
376 		return;
377 	}
378 
379 	/* Stop all the timers first. */
380 	for (i = 0; i < ilb_conn_timer_size; i++) {
381 		timeout_id_t tid;
382 
383 		/* Setting tid to 0 tells the timer handler not to restart. */
384 		mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
385 		tid = ilbs->ilbs_conn_timer_list[i].tid;
386 		ilbs->ilbs_conn_timer_list[i].tid = 0;
387 		mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
388 		(void) untimeout(tid);
389 	}
390 	kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
391 	    ilb_conn_timer_size);
392 	taskq_destroy(ilbs->ilbs_conn_taskq);
393 	ilbs->ilbs_conn_taskq = NULL;
394 
395 	/* Then remove all the conns. */
396 	hash = ilbs->ilbs_s2c_conn_hash;
397 	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
398 		while ((connp = hash[i].ilb_connp) != NULL) {
399 			hash[i].ilb_connp = connp->conn_s2c_next;
400 			ILB_SERVER_REFRELE(connp->conn_server);
401 			if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
402 				ilb_nat_src_entry_t *ent;
403 				in_port_t port;
404 
405 				/*
406 				 * src_ent will be freed in ilb_nat_src_fini().
407 				 */
408 				port = ntohs(
409 				    connp->conn_rule_cache.info.nat_sport);
410 				ent = connp->conn_rule_cache.info.src_ent;
411 				vmem_free(ent->nse_port_arena,
412 				    (void *)(uintptr_t)port, 1);
413 			}
414 			kmem_cache_free(ilb_conn_cache, connp);
415 		}
416 	}
417 	kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
418 	    ilbs->ilbs_conn_hash_size);
419 	kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
420 	    ilbs->ilbs_conn_hash_size);
421 }
422 
423 /*
424  * Internet checksum adjustment calculation routines.  We pre-calculate
425  * checksum adjustment so that we don't need to compute the checksum on
426  * the whole packet when we change address/port in the packet.
427  */
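/*
 * The pre-computed value follows the usual incremental-update idea
 * (cf. RFC 1141/RFC 1624): sum the 16-bit words being removed, fold the
 * carries, take the one's complement, then add the 16-bit words being
 * inserted.  A caller holding an existing one's-complement checksum
 * would fold the adjustment in roughly as follows (illustrative sketch
 * only; old_cksum/new_cksum are placeholder names, and the actual packet
 * rewriting is done by ilb_full_nat()/ilb_half_nat() using the adj_sum
 * values stored in the conn entry):
 *
 *	uint32_t sum = (uint16_t)~old_cksum + adj_sum;
 *
 *	while ((sum >> 16) != 0)
 *		sum = (sum & 0xffff) + (sum >> 16);
 *	new_cksum = (uint16_t)~sum;
 */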
428 
429 static void
430 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
431     in_port_t new_port, uint32_t *adj_sum)
432 {
433 	uint32_t sum;
434 
435 	sum = *oaddr + *(oaddr + 1) + old_port;
436 	while ((sum >> 16) != 0)
437 		sum = (sum & 0xffff) + (sum >> 16);
438 	*adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
439 }
440 
441 static void
442 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
443     in_port_t new_port, uint32_t *adj_sum)
444 {
445 	uint32_t sum = 0;
446 
447 	sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
448 	    *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
449 	    old_port;
450 	while ((sum >> 16) != 0)
451 		sum = (sum & 0xffff) + (sum >> 16);
452 	*adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
453 	    *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
454 	    *(naddr + 6) + *(naddr + 7) + new_port;
455 }
456 
457 static void
458 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
459     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
460     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
461 {
462 	uint32_t sum;
463 
464 	sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
465 	    old_port2;
466 	while ((sum >> 16) != 0)
467 		sum = (sum & 0xffff) + (sum >> 16);
468 	*adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
469 	    *naddr2 + *(naddr2 + 1) + new_port2;
470 }
471 
472 static void
473 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
474     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
475     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
476 {
477 	uint32_t sum = 0;
478 
479 	sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
480 	    *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
481 	    old_port1;
482 	sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
483 	    *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
484 	    old_port2;
485 	while ((sum >> 16) != 0)
486 		sum = (sum & 0xffff) + (sum >> 16);
487 	sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
488 	    *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
489 	    *(naddr1 + 7) + new_port1;
490 	*adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
491 	    *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
492 	    *(naddr2 + 7) + new_port2;
493 }
494 
495 /*
496  * Add a conn hash entry to the tables.  Note that a conn hash entry
497  * (ilb_conn_t) contains info on both directions.  And there are two hash
498  * tables, one for client to server and the other for server to client.
499  * So the same entry is added to both tables and can be accessed by two
500  * threads simultaneously.  But each thread will only access data on one
501  * direction, so there is no conflict.
502  */
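/*
 * Locking note: lookups in each direction take only that direction's
 * bucket lock, but removing an entry requires both bucket locks (see the
 * ASSERTs in ilb_conn_remove()).  The gc path in ilb_conn_cleanup()
 * therefore holds the c2s bucket lock and then grabs the matching s2c
 * bucket lock before it calls ilb_conn_remove().
 */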
503 int
504 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
505     in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
506     ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
507 {
508 	ilb_conn_t *connp;
509 	ilb_conn_hash_t *hash;
510 	int i;
511 
512 	connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
513 	if (connp == NULL) {
514 		if (s != NULL) {
515 			if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
516 				ilb_nat_src_entry_t **entry;
517 
518 				entry = s->server->iser_nat_src->src_list;
519 				vmem_free(entry[s->nat_src_idx]->nse_port_arena,
520 				    (void *)(uintptr_t)ntohs(info->nat_sport),
521 				    1);
522 			}
523 			ILB_STICKY_REFRELE(s);
524 		}
525 		return (ENOMEM);
526 	}
527 
528 	connp->conn_l4 = rule->ir_proto;
529 
530 	connp->conn_server = server;
531 	ILB_SERVER_REFHOLD(server);
532 	connp->conn_sticky = s;
533 
534 	connp->conn_rule_cache.topo = rule->ir_topo;
535 	connp->conn_rule_cache.info = *info;
536 
537 	connp->conn_gc = B_FALSE;
538 
539 	connp->conn_expiry = rule->ir_nat_expiry;
540 	connp->conn_cr_time = ddi_get_lbolt64();
541 
542 	/* Client to server info. */
543 	connp->conn_c2s_saddr = *src;
544 	connp->conn_c2s_sport = sport;
545 	connp->conn_c2s_daddr = *dst;
546 	connp->conn_c2s_dport = dport;
547 
548 	connp->conn_c2s_atime = ddi_get_lbolt64();
549 	/* The packet that triggers this creation should be counted */
550 	connp->conn_c2s_pkt_cnt = 1;
551 	connp->conn_c2s_tcp_fin_sent = B_FALSE;
552 	connp->conn_c2s_tcp_fin_acked = B_FALSE;
553 
554 	/* Server to client info, before NAT */
555 	switch (rule->ir_topo) {
556 	case ILB_TOPO_IMPL_HALF_NAT:
557 		connp->conn_s2c_saddr = info->nat_dst;
558 		connp->conn_s2c_sport = info->nat_dport;
559 		connp->conn_s2c_daddr = *src;
560 		connp->conn_s2c_dport = sport;
561 
562 		/* Pre-calculate checksum changes for both directions */
563 		if (rule->ir_ipver == IPPROTO_IP) {
564 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
565 			    (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
566 			    &connp->conn_c2s_ip_sum);
567 			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
568 			    (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
569 			    info->nat_dport, &connp->conn_c2s_tp_sum);
570 			*ip_sum = connp->conn_c2s_ip_sum;
571 			*tp_sum = connp->conn_c2s_tp_sum;
572 
573 			hnat_cksum_v4(
574 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
575 			    (uint16_t *)&dst->s6_addr32[3], 0, 0,
576 			    &connp->conn_s2c_ip_sum);
577 			hnat_cksum_v4(
578 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
579 			    (uint16_t *)&dst->s6_addr32[3],
580 			    info->nat_dport, dport,
581 			    &connp->conn_s2c_tp_sum);
582 		} else {
583 			connp->conn_c2s_ip_sum = 0;
584 			hnat_cksum_v6((uint16_t *)dst,
585 			    (uint16_t *)&info->nat_dst, dport,
586 			    info->nat_dport, &connp->conn_c2s_tp_sum);
587 			*ip_sum = 0;
588 			*tp_sum = connp->conn_c2s_tp_sum;
589 
590 			connp->conn_s2c_ip_sum = 0;
591 			hnat_cksum_v6((uint16_t *)&info->nat_dst,
592 			    (uint16_t *)dst, info->nat_dport, dport,
593 			    &connp->conn_s2c_tp_sum);
594 		}
595 		break;
596 	case ILB_TOPO_IMPL_NAT:
597 		connp->conn_s2c_saddr = info->nat_dst;
598 		connp->conn_s2c_sport = info->nat_dport;
599 		connp->conn_s2c_daddr = info->nat_src;
600 		connp->conn_s2c_dport = info->nat_sport;
601 
602 		if (rule->ir_ipver == IPPROTO_IP) {
603 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
604 			    (uint16_t *)&dst->s6_addr32[3],
605 			    (uint16_t *)&info->nat_src.s6_addr32[3],
606 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
607 			    0, 0, 0, 0, &connp->conn_c2s_ip_sum);
608 			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
609 			    (uint16_t *)&dst->s6_addr32[3],
610 			    (uint16_t *)&info->nat_src.s6_addr32[3],
611 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
612 			    sport, dport, info->nat_sport,
613 			    info->nat_dport, &connp->conn_c2s_tp_sum);
614 			*ip_sum = connp->conn_c2s_ip_sum;
615 			*tp_sum = connp->conn_c2s_tp_sum;
616 
617 			fnat_cksum_v4(
618 			    (uint16_t *)&info->nat_src.s6_addr32[3],
619 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
620 			    (uint16_t *)&src->s6_addr32[3],
621 			    (uint16_t *)&dst->s6_addr32[3],
622 			    0, 0, 0, 0, &connp->conn_s2c_ip_sum);
623 			fnat_cksum_v4(
624 			    (uint16_t *)&info->nat_src.s6_addr32[3],
625 			    (uint16_t *)&info->nat_dst.s6_addr32[3],
626 			    (uint16_t *)&src->s6_addr32[3],
627 			    (uint16_t *)&dst->s6_addr32[3],
628 			    info->nat_sport, info->nat_dport,
629 			    sport, dport, &connp->conn_s2c_tp_sum);
630 		} else {
631 			fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
632 			    (uint16_t *)&info->nat_src,
633 			    (uint16_t *)&info->nat_dst,
634 			    sport, dport, info->nat_sport,
635 			    info->nat_dport, &connp->conn_c2s_tp_sum);
636 			connp->conn_c2s_ip_sum = 0;
637 			*ip_sum = 0;
638 			*tp_sum = connp->conn_c2s_tp_sum;
639 
640 			fnat_cksum_v6((uint16_t *)&info->nat_src,
641 			    (uint16_t *)&info->nat_dst, (uint16_t *)src,
642 			    (uint16_t *)dst, info->nat_sport,
643 			    info->nat_dport, sport, dport,
644 			    &connp->conn_s2c_tp_sum);
645 			connp->conn_s2c_ip_sum = 0;
646 		}
647 		break;
648 	}
649 
650 	connp->conn_s2c_atime = ddi_get_lbolt64();
651 	connp->conn_s2c_pkt_cnt = 1;
652 	connp->conn_s2c_tcp_fin_sent = B_FALSE;
653 	connp->conn_s2c_tcp_fin_acked = B_FALSE;
654 
655 	/* Add it to the s2c hash table. */
656 	hash = ilbs->ilbs_s2c_conn_hash;
657 	i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
658 	    ntohs(connp->conn_s2c_sport),
659 	    (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
660 	    ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
661 	connp->conn_s2c_hash = &hash[i];
662 	DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
663 
664 	mutex_enter(&hash[i].ilb_conn_hash_lock);
665 	hash[i].ilb_conn_cnt++;
666 	connp->conn_s2c_next = hash[i].ilb_connp;
667 	if (hash[i].ilb_connp != NULL)
668 		hash[i].ilb_connp->conn_s2c_prev = connp;
669 	connp->conn_s2c_prev = NULL;
670 	hash[i].ilb_connp = connp;
671 	mutex_exit(&hash[i].ilb_conn_hash_lock);
672 
673 	/* Add it to the c2s hash table. */
674 	hash = ilbs->ilbs_c2s_conn_hash;
675 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
676 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
677 	    ilbs->ilbs_conn_hash_size);
678 	connp->conn_c2s_hash = &hash[i];
679 	DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
680 
681 	mutex_enter(&hash[i].ilb_conn_hash_lock);
682 	hash[i].ilb_conn_cnt++;
683 	connp->conn_c2s_next = hash[i].ilb_connp;
684 	if (hash[i].ilb_connp != NULL)
685 		hash[i].ilb_connp->conn_c2s_prev = connp;
686 	connp->conn_c2s_prev = NULL;
687 	hash[i].ilb_connp = connp;
688 	mutex_exit(&hash[i].ilb_conn_hash_lock);
689 
690 	return (0);
691 }
692 
693 /*
694  * If a connection is using TCP, we keep track of simple TCP state transitions
695  * so that we know when to clean up an entry.
696  */
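/*
 * In outline: each direction records where its FIN falls in sequence
 * space (conn_*_tcp_fss) when a FIN is seen, plus the latest ACK it has
 * sent.  ilb_conn_cleanup() marks a FIN as acknowledged once the
 * opposite direction's ACK passes that sequence number and removes the
 * entry when both FINs have been acknowledged.  An RST makes this
 * routine return B_FALSE, which causes the caller to set conn_gc so the
 * next gc pass removes the entry.
 */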
697 static boolean_t
698 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
699     boolean_t c2s)
700 {
701 	uint32_t ack, seq;
702 	int32_t seg_len;
703 
704 	ack = 0;
705 	if (tcpha->tha_flags & TH_RST)
706 		return (B_FALSE);
707 
708 	seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
709 	    TCP_HDR_LENGTH((tcph_t *)tcpha);
710 
711 	if (tcpha->tha_flags & TH_ACK)
712 		ack = ntohl(tcpha->tha_ack);
713 	seq = ntohl(tcpha->tha_seq);
714 	if (c2s) {
715 		ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
716 		if (tcpha->tha_flags & TH_FIN) {
717 			connp->conn_c2s_tcp_fss = seq + seg_len;
718 			connp->conn_c2s_tcp_fin_sent = B_TRUE;
719 		}
720 		connp->conn_c2s_tcp_ack = ack;
721 
722 		/* Port reuse by the client, restart the conn. */
723 		if (connp->conn_c2s_tcp_fin_sent &&
724 		    SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
725 			connp->conn_c2s_tcp_fin_sent = B_FALSE;
726 			connp->conn_c2s_tcp_fin_acked = B_FALSE;
727 		}
728 	} else {
729 		ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
730 		if (tcpha->tha_flags & TH_FIN) {
731 			connp->conn_s2c_tcp_fss = seq + seg_len;
732 			connp->conn_s2c_tcp_fin_sent = B_TRUE;
733 		}
734 		connp->conn_s2c_tcp_ack = ack;
735 
736 		/* Port reuse by the client, restart the conn. */
737 		if (connp->conn_s2c_tcp_fin_sent &&
738 		    SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
739 			connp->conn_s2c_tcp_fin_sent = B_FALSE;
740 			connp->conn_s2c_tcp_fin_acked = B_FALSE;
741 		}
742 	}
743 
744 	return (B_TRUE);
745 }
746 
747 /*
748  * Helper routine to find a conn hash entry given some packet information and
749  * the traffic direction (c2s, client to server?)
750  */
751 static boolean_t
752 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
753     in_port_t sport, in6_addr_t *dst, in_port_t dport,
754     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
755     int32_t pkt_len, boolean_t c2s)
756 {
757 	ilb_conn_hash_t *hash;
758 	uint_t i;
759 	ilb_conn_t *connp;
760 	boolean_t tcp_alive;
761 	boolean_t ret = B_FALSE;
762 
763 	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
764 	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
765 	    ilbs->ilbs_conn_hash_size);
766 	if (c2s) {
767 		hash = ilbs->ilbs_c2s_conn_hash;
768 		mutex_enter(&hash[i].ilb_conn_hash_lock);
769 		for (connp = hash[i].ilb_connp; connp != NULL;
770 		    connp = connp->conn_c2s_next) {
771 			if (connp->conn_l4 == l4 &&
772 			    connp->conn_c2s_dport == dport &&
773 			    connp->conn_c2s_sport == sport &&
774 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
775 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
776 				connp->conn_c2s_atime = ddi_get_lbolt64();
777 				connp->conn_c2s_pkt_cnt++;
778 				*rule_cache = connp->conn_rule_cache;
779 				*ip_sum = connp->conn_c2s_ip_sum;
780 				*tp_sum = connp->conn_c2s_tp_sum;
781 				ret = B_TRUE;
782 				break;
783 			}
784 		}
785 	} else {
786 		hash = ilbs->ilbs_s2c_conn_hash;
787 		mutex_enter(&hash[i].ilb_conn_hash_lock);
788 		for (connp = hash[i].ilb_connp; connp != NULL;
789 		    connp = connp->conn_s2c_next) {
790 			if (connp->conn_l4 == l4 &&
791 			    connp->conn_s2c_dport == dport &&
792 			    connp->conn_s2c_sport == sport &&
793 			    IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
794 			    IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
795 				connp->conn_s2c_atime = ddi_get_lbolt64();
796 				connp->conn_s2c_pkt_cnt++;
797 				*rule_cache = connp->conn_rule_cache;
798 				*ip_sum = connp->conn_s2c_ip_sum;
799 				*tp_sum = connp->conn_s2c_tp_sum;
800 				ret = B_TRUE;
801 				break;
802 			}
803 		}
804 	}
805 	if (ret) {
806 		ILB_S_KSTAT(connp->conn_server, pkt_processed);
807 		ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
808 		    pkt_len);
809 
810 		switch (l4) {
811 		case (IPPROTO_TCP):
812 			tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
813 			    c2s);
814 			if (!tcp_alive) {
815 				connp->conn_gc = B_TRUE;
816 			}
817 			break;
818 		default:
819 			break;
820 		}
821 	}
822 	mutex_exit(&hash[i].ilb_conn_hash_lock);
823 
824 	return (ret);
825 }
826 
827 /*
828  * To check if a given packet matches an existing conn hash entry.  If it
829  * does, return the information about this entry so that the caller can
830  * do the proper NAT.
831  */
832 boolean_t
833 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
834     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
835     uint32_t pkt_len, in6_addr_t *lb_dst)
836 {
837 	ilb_rule_info_t rule_cache;
838 	uint32_t adj_ip_sum, adj_tp_sum;
839 	boolean_t ret;
840 
841 	/* Check the incoming hash table. */
842 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
843 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
844 		switch (rule_cache.topo) {
845 		case ILB_TOPO_IMPL_NAT:
846 			*lb_dst = rule_cache.info.nat_dst;
847 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
848 			    adj_ip_sum, adj_tp_sum, B_TRUE);
849 			ret = B_TRUE;
850 			break;
851 		case ILB_TOPO_IMPL_HALF_NAT:
852 			*lb_dst = rule_cache.info.nat_dst;
853 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
854 			    adj_ip_sum, adj_tp_sum, B_TRUE);
855 			ret = B_TRUE;
856 			break;
857 		default:
858 			ret = B_FALSE;
859 			break;
860 		}
861 		return (ret);
862 	}
863 	if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
864 	    &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
865 		switch (rule_cache.topo) {
866 		case ILB_TOPO_IMPL_NAT:
867 			*lb_dst = rule_cache.info.src;
868 			ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
869 			    adj_ip_sum, adj_tp_sum, B_FALSE);
870 			ret = B_TRUE;
871 			break;
872 		case ILB_TOPO_IMPL_HALF_NAT:
873 			*lb_dst = *dst;
874 			ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
875 			    adj_ip_sum, adj_tp_sum, B_FALSE);
876 			ret = B_TRUE;
877 			break;
878 		default:
879 			ret = B_FALSE;
880 			break;
881 		}
882 		return (ret);
883 	}
884 
885 	return (B_FALSE);
886 }
887 
888 /*
889  * To check if an ICMP packet belongs to a connection in one of the conn
890  * hash entries.
891  */
892 boolean_t
893 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
894     void *icmph, in6_addr_t *lb_dst)
895 {
896 	ilb_conn_hash_t *hash;
897 	ipha_t *in_iph4;
898 	ip6_t *in_iph6;
899 	icmph_t *icmph4;
900 	icmp6_t *icmph6;
901 	in6_addr_t *in_src_p, *in_dst_p;
902 	in_port_t *sport, *dport;
903 	int l4;
904 	uint_t i;
905 	ilb_conn_t *connp;
906 	ilb_rule_info_t rule_cache;
907 	uint32_t adj_ip_sum;
908 	boolean_t full_nat;
909 
910 	in_iph4 = NULL;
911 	in_iph6 = NULL;
912 	icmph4 = NULL;
913 	icmph6 = NULL;
914 
915 	if (l3 == IPPROTO_IP) {
916 		in6_addr_t in_src, in_dst;
917 
918 		icmph4 = (icmph_t *)icmph;
919 		in_iph4 = (ipha_t *)&icmph4[1];
920 
921 		if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
922 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
923 			return (B_FALSE);
924 		}
925 
926 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
927 		in_src_p = &in_src;
928 		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
929 		in_dst_p = &in_dst;
930 
931 		l4 = in_iph4->ipha_protocol;
932 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
933 			return (B_FALSE);
934 
935 		sport = (in_port_t *)((char *)in_iph4 +
936 		    IPH_HDR_LENGTH(in_iph4));
937 		dport = sport + 1;
938 
939 		DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
940 		    in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
941 		    ntohs(*sport), uint16_t, ntohs(*dport));
942 	} else {
943 		ASSERT(l3 == IPPROTO_IPV6);
944 
945 		icmph6 = (icmp6_t *)icmph;
946 		in_iph6 = (ip6_t *)&icmph6[1];
947 		in_src_p = &in_iph6->ip6_src;
948 		in_dst_p = &in_iph6->ip6_dst;
949 
950 		if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
951 		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
952 			return (B_FALSE);
953 		}
954 
955 		l4 = in_iph6->ip6_nxt;
956 		/* We don't go deep inside an IPv6 packet yet. */
957 		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
958 			return (B_FALSE);
959 
960 		sport = (in_port_t *)&in_iph6[1];
961 		dport = sport + 1;
962 
963 		DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
964 		    &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
965 		    uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
966 	}
967 
968 	i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
969 	    (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
970 	    ilbs->ilbs_conn_hash_size);
971 	hash = ilbs->ilbs_c2s_conn_hash;
972 
973 	mutex_enter(&hash[i].ilb_conn_hash_lock);
974 	for (connp = hash[i].ilb_connp; connp != NULL;
975 	    connp = connp->conn_c2s_next) {
976 		if (connp->conn_l4 == l4 &&
977 		    connp->conn_c2s_dport == *sport &&
978 		    connp->conn_c2s_sport == *dport &&
979 		    IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
980 		    IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
981 			connp->conn_c2s_atime = ddi_get_lbolt64();
982 			connp->conn_c2s_pkt_cnt++;
983 			rule_cache = connp->conn_rule_cache;
984 			adj_ip_sum = connp->conn_c2s_ip_sum;
985 			break;
986 		}
987 	}
988 	mutex_exit(&hash[i].ilb_conn_hash_lock);
989 
990 	if (connp == NULL) {
991 		DTRACE_PROBE(ilb__chk__icmp__conn__failed);
992 		return (B_FALSE);
993 	}
994 
995 	switch (rule_cache.topo) {
996 	case ILB_TOPO_IMPL_NAT:
997 		full_nat = B_TRUE;
998 		break;
999 	case ILB_TOPO_IMPL_HALF_NAT:
1000 		full_nat = B_FALSE;
1001 		break;
1002 	default:
1003 		return (B_FALSE);
1004 	}
1005 
1006 	*lb_dst = rule_cache.info.nat_dst;
1007 	if (l3 == IPPROTO_IP) {
1008 		ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
1009 		    &rule_cache.info, adj_ip_sum, full_nat);
1010 	} else {
1011 		ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1012 		    &rule_cache.info, full_nat);
1013 	}
1014 	return (B_TRUE);
1015 }
1016 
1017 /*
1018  * This routine sends up the conn hash table to user land.  Note that the
1019  * request is an ioctl, hence we cannot really differentiate requests
1020  * from different clients.  There is no context shared between different
1021  * ioctls.  Here we make the assumption that the user land ilbd will
1022  * only allow one client to show the conn hash table at any time.
1023  * Otherwise, the results will be "very" inconsistent.
1024  *
1025  * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1026  * to read from the beginning of the able.  After a certain entries
1027  * to read from the beginning of the table.  After a certain number of
1028  * entries are reported, the kernel remembers the position of the last
1029  * returned entry.  When the next ioctl comes in with the ILB_LIST_CONT
1030  * flag, it will return entries starting from where it was left off.  When
1031  * the end of the table is reached, a flag (ILB_LIST_END) is set to tell
1032  * the client that there are no more entries.
1033  * It is assumed that the caller has checked the size of nat so that it
1034  * can hold num entries.
1035  */
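/*
 * A consumer (in practice the user land ilbd) would drive this with a
 * loop along the following lines.  This is only a sketch: list_ioctl(),
 * consume(), nat_buf and BUF_ENTRIES are hypothetical stand-ins for
 * whatever issues the ioctl and handles the returned entries, and error
 * handling (EINTR, EINVAL) is omitted.
 *
 *	flags = ILB_LIST_BEGIN;
 *	for (;;) {
 *		num = BUF_ENTRIES;
 *		(void) list_ioctl(nat_buf, &num, &flags);
 *		consume(nat_buf, num);
 *		if (flags & ILB_LIST_END)
 *			break;
 *		flags = ILB_LIST_CONT;
 *	}
 */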
1036 /* ARGSUSED */
1037 int
1038 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1039     uint32_t *num, uint32_t *flags)
1040 {
1041 	ilb_conn_hash_t *hash;
1042 	ilb_conn_t *cur_connp;
1043 	uint32_t i, j;
1044 	int ret = 0;
1045 
1046 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1047 	while (ilbs->ilbs_conn_list_busy) {
1048 		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1049 		    &ilbs->ilbs_conn_list_lock) == 0) {
1050 			mutex_exit(&ilbs->ilbs_conn_list_lock);
1051 			return (EINTR);
1052 		}
1053 	}
1054 	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1055 		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1056 		mutex_exit(&ilbs->ilbs_conn_list_lock);
1057 		*num = 0;
1058 		*flags |= ILB_LIST_END;
1059 		return (0);
1060 	}
1061 	ilbs->ilbs_conn_list_busy = B_TRUE;
1062 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1063 
1064 	if (*flags & ILB_LIST_BEGIN) {
1065 		i = 0;
1066 		mutex_enter(&hash[0].ilb_conn_hash_lock);
1067 		cur_connp = hash[0].ilb_connp;
1068 	} else if (*flags & ILB_LIST_CONT) {
1069 		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1070 			*num = 0;
1071 			*flags |= ILB_LIST_END;
1072 			goto done;
1073 		}
1074 		i = ilbs->ilbs_conn_list_cur;
1075 		mutex_enter(&hash[i].ilb_conn_hash_lock);
1076 		cur_connp = ilbs->ilbs_conn_list_connp;
1077 	} else {
1078 		ret = EINVAL;
1079 		goto done;
1080 	}
1081 
1082 	j = 0;
1083 	while (j < *num) {
1084 		if (cur_connp == NULL) {
1085 			mutex_exit(&hash[i].ilb_conn_hash_lock);
1086 			if (++i == ilbs->ilbs_conn_hash_size) {
1087 				*flags |= ILB_LIST_END;
1088 				break;
1089 			}
1090 			mutex_enter(&hash[i].ilb_conn_hash_lock);
1091 			cur_connp = hash[i].ilb_connp;
1092 			continue;
1093 		}
1094 		nat[j].proto = cur_connp->conn_l4;
1095 
1096 		nat[j].in_global = cur_connp->conn_c2s_daddr;
1097 		nat[j].in_global_port = cur_connp->conn_c2s_dport;
1098 		nat[j].out_global = cur_connp->conn_c2s_saddr;
1099 		nat[j].out_global_port = cur_connp->conn_c2s_sport;
1100 
1101 		nat[j].in_local = cur_connp->conn_s2c_saddr;
1102 		nat[j].in_local_port = cur_connp->conn_s2c_sport;
1103 		nat[j].out_local = cur_connp->conn_s2c_daddr;
1104 		nat[j].out_local_port = cur_connp->conn_s2c_dport;
1105 
1106 		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1107 		nat[j].last_access_time =
1108 		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1109 
1110 		/*
1111 		 * The conn_s2c_pkt_cnt may not be accurate since we are not
1112 		 * holding the s2c hash lock.
1113 		 */
1114 		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1115 		    cur_connp->conn_s2c_pkt_cnt;
1116 		j++;
1117 
1118 		cur_connp = cur_connp->conn_c2s_next;
1119 	}
1120 	ilbs->ilbs_conn_list_connp = cur_connp;
1121 	if (j == *num)
1122 		mutex_exit(&hash[i].ilb_conn_hash_lock);
1123 
1124 	ilbs->ilbs_conn_list_cur = i;
1125 
1126 	*num = j;
1127 done:
1128 	mutex_enter(&ilbs->ilbs_conn_list_lock);
1129 	ilbs->ilbs_conn_list_busy = B_FALSE;
1130 	cv_signal(&ilbs->ilbs_conn_list_cv);
1131 	mutex_exit(&ilbs->ilbs_conn_list_lock);
1132 
1133 	return (ret);
1134 }
1135 
1136 
1137 /*
1138  * Stickiness (persistence) handling routines.
1139  */
1140 
1141 
1142 static void
1143 ilb_sticky_cache_init(void)
1144 {
1145 	ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1146 	    sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1147 	    ilb_kmem_flags);
1148 }
1149 
1150 void
1151 ilb_sticky_cache_fini(void)
1152 {
1153 	if (ilb_sticky_cache != NULL) {
1154 		kmem_cache_destroy(ilb_sticky_cache);
1155 		ilb_sticky_cache = NULL;
1156 	}
1157 }
1158 
1159 void
1160 ilb_sticky_refrele(ilb_sticky_t *s)
1161 {
1162 	ILB_STICKY_REFRELE(s);
1163 }
1164 
1165 static ilb_sticky_t *
1166 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1167 {
1168 	ilb_sticky_t *s;
1169 
1170 	ASSERT(mutex_owned(&hash->sticky_lock));
1171 
1172 	for (s = list_head(&hash->sticky_head); s != NULL;
1173 	    s = list_next(&hash->sticky_head, s)) {
1174 		if (s->rule_instance == rule->ir_ks_instance) {
1175 			if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1176 				return (s);
1177 		}
1178 	}
1179 	return (NULL);
1180 }
1181 
1182 static ilb_sticky_t *
1183 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1184     in6_addr_t *src)
1185 {
1186 	ilb_sticky_t *s;
1187 
1188 	ASSERT(mutex_owned(&hash->sticky_lock));
1189 
1190 	if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1191 		return (NULL);
1192 
1193 	/*
1194 	 * The rule instance is for handling the scenario when the same
1195 	 * client talks to different rules at the same time.  Stickiness
1196 	 * is per rule so we can use the rule instance to differentiate
1197 	 * the client's request.
1198 	 */
1199 	s->rule_instance = rule->ir_ks_instance;
1200 	/*
1201 	 * Copy the rule name for listing all sticky cache entries.  ir_name
1202 	 * is guaranteed to be NULL terminated.
1203 	 */
1204 	(void) strcpy(s->rule_name, rule->ir_name);
1205 	s->server = server;
1206 
1207 	/*
1208 	 * Grab a ref cnt on the server so that it won't go away while
1209 	 * it is still in the sticky table.
1210 	 */
1211 	ILB_SERVER_REFHOLD(server);
1212 	s->src = *src;
1213 	s->expiry = rule->ir_sticky_expiry;
1214 	s->refcnt = 1;
1215 	s->hash = hash;
1216 
1217 	/*
1218 	 * There is no need to set atime here since the refcnt is not
1219 	 * zero.  A sticky entry is removed only when the refcnt is
1220 	 * zero.  But just set it here for debugging purposes.  The
1221 	 * atime is set when a refrele is done on a sticky entry.
1222 	 */
1223 	s->atime = ddi_get_lbolt64();
1224 
1225 	list_insert_head(&hash->sticky_head, s);
1226 	hash->sticky_cnt++;
1227 	return (s);
1228 }
1229 
1230 /*
1231  * This routine checks if there is an existing sticky entry which matches
1232  * a given packet.  If there is one, return it.  If there is not, create
1233  * a sticky entry using the packet's info.
1234  */
1235 ilb_server_t *
1236 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1237     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1238 {
1239 	int i;
1240 	ilb_sticky_hash_t *hash;
1241 	ilb_sticky_t *s;
1242 
1243 	ASSERT(server != NULL);
1244 
1245 	*res = NULL;
1246 
1247 	i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1248 	    (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1249 	hash = &ilbs->ilbs_sticky_hash[i];
1250 
1251 	/* First check if there is already an entry. */
1252 	mutex_enter(&hash->sticky_lock);
1253 	s = ilb_sticky_lookup(hash, rule, src);
1254 
1255 	/* No sticky entry, add one. */
1256 	if (s == NULL) {
1257 add_new_entry:
1258 		s = ilb_sticky_add(hash, rule, server, src);
1259 		if (s == NULL) {
1260 			mutex_exit(&hash->sticky_lock);
1261 			return (NULL);
1262 		}
1263 		/*
1264 		 * Find a source for this server.  All subsequent requests from
1265 		 * the same client matching this sticky entry will use this
1266 		 * source address in doing NAT.  The current algorithm is
1267 		 * simple, rotate the source address.  Note that the
1268 		 * source address array does not change after it's created, so
1269 		 * it is OK to just increment the cur index.
1270 		 */
1271 		if (server->iser_nat_src != NULL) {
1272 			/* It is a hint, does not need to be atomic. */
1273 			*src_ent_idx = (server->iser_nat_src->cur++ %
1274 			    server->iser_nat_src->num_src);
1275 			s->nat_src_idx = *src_ent_idx;
1276 		}
1277 		mutex_exit(&hash->sticky_lock);
1278 		*res = s;
1279 		return (server);
1280 	}
1281 
1282 	/*
1283 	 * We don't hold any lock accessing iser_enabled.  Refer to the
1284 	 * comment in ilb_server_add() about iser_lock.
1285 	 */
1286 	if (!s->server->iser_enabled) {
1287 		/*
1288 		 * s->server == server can only happen if there is a race in
1289 		 * toggling the iser_enabled flag (we don't hold a lock doing
1290 		 * that) so that the load balance algorithm still returns a
1291 		 * disabled server.  In this case, just drop the packet...
1292 		 */
1293 		if (s->server == server) {
1294 			mutex_exit(&hash->sticky_lock);
1295 			return (NULL);
1296 		}
1297 
1298 		/*
1299 		 * The old server is disabled and there is a new server, use
1300 		 * the new one to create a sticky entry.  Since we will
1301 		 * add the entry at the beginning, subsequent lookup will
1302 		 * find this new entry instead of the old one.
1303 		 */
1304 		goto add_new_entry;
1305 	}
1306 
1307 	s->refcnt++;
1308 	*res = s;
1309 	mutex_exit(&hash->sticky_lock);
1310 	if (server->iser_nat_src != NULL)
1311 		*src_ent_idx = s->nat_src_idx;
1312 	return (s->server);
1313 }
1314 
1315 static void
1316 ilb_sticky_cleanup(void *arg)
1317 {
1318 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1319 	uint32_t i;
1320 	ilb_stack_t *ilbs;
1321 	ilb_sticky_hash_t *hash;
1322 	ilb_sticky_t *s, *nxt_s;
1323 	int64_t now, expiry;
1324 
1325 	ilbs = timer->ilbs;
1326 	hash = ilbs->ilbs_sticky_hash;
1327 	ASSERT(hash != NULL);
1328 
1329 	now = ddi_get_lbolt64();
1330 	for (i = timer->start; i < timer->end; i++) {
1331 		mutex_enter(&hash[i].sticky_lock);
1332 		for (s = list_head(&hash[i].sticky_head); s != NULL;
1333 		    s = nxt_s) {
1334 			nxt_s = list_next(&hash[i].sticky_head, s);
1335 			if (s->refcnt != 0)
1336 				continue;
1337 			expiry = now - SEC_TO_TICK(s->expiry);
1338 			if (s->atime < expiry) {
1339 				ILB_SERVER_REFRELE(s->server);
1340 				list_remove(&hash[i].sticky_head, s);
1341 				kmem_cache_free(ilb_sticky_cache, s);
1342 				hash[i].sticky_cnt--;
1343 			}
1344 		}
1345 		mutex_exit(&hash[i].sticky_lock);
1346 	}
1347 }
1348 
1349 static void
1350 ilb_sticky_timer(void *arg)
1351 {
1352 	ilb_timer_t *timer = (ilb_timer_t *)arg;
1353 
1354 	(void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1355 	    ilb_sticky_cleanup, arg, TQ_SLEEP);
1356 	mutex_enter(&timer->tid_lock);
1357 	if (timer->tid == 0) {
1358 		mutex_exit(&timer->tid_lock);
1359 	} else {
1360 		timer->tid = timeout(ilb_sticky_timer, arg,
1361 		    SEC_TO_TICK(ilb_sticky_timeout));
1362 		mutex_exit(&timer->tid_lock);
1363 	}
1364 }
1365 
1366 void
1367 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1368 {
1369 	extern pri_t minclsyspri;
1370 	int i, part;
1371 	char tq_name[TASKQ_NAMELEN];
1372 	ilb_timer_t *tm;
1373 
1374 	if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1375 		for (i = 0; i < 31; i++) {
1376 			if (ilbs->ilbs_sticky_hash_size < (1 << i))
1377 				break;
1378 		}
1379 		ilbs->ilbs_sticky_hash_size = 1 << i;
1380 	}
1381 
1382 	ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1383 	    ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1384 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1385 		mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1386 		    MUTEX_DEFAULT, NULL);
1387 		list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1388 		    sizeof (ilb_sticky_t),
1389 		    offsetof(ilb_sticky_t, list));
1390 	}
1391 
1392 	if (ilb_sticky_cache == NULL)
1393 		ilb_sticky_cache_init();
1394 
1395 	(void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1396 	    (void *)ilbs->ilbs_netstack);
1397 	ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1398 	ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1399 	    ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1400 	    ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1401 
1402 	ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1403 	ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1404 	    ilb_sticky_timer_size, KM_SLEEP);
1405 	part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1406 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1407 		tm = ilbs->ilbs_sticky_timer_list + i;
1408 		tm->start = i * part;
1409 		tm->end = i * part + part;
1410 		if (tm->end > ilbs->ilbs_sticky_hash_size)
1411 			tm->end = ilbs->ilbs_sticky_hash_size;
1412 		tm->ilbs = ilbs;
1413 		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1414 		/* Spread out the starting execution time of all the timers. */
1415 		tm->tid = timeout(ilb_sticky_timer, tm,
1416 		    SEC_TO_TICK(ilb_sticky_timeout + i));
1417 	}
1418 }
1419 
1420 void
1421 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1422 {
1423 	int i;
1424 	ilb_sticky_t *s;
1425 
1426 	if (ilbs->ilbs_sticky_hash == NULL)
1427 		return;
1428 
1429 	/* Stop all the timers first. */
1430 	for (i = 0; i < ilb_sticky_timer_size; i++) {
1431 		timeout_id_t tid;
1432 
1433 		/* Setting tid to 0 tells the timer handler not to restart. */
1434 		mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1435 		tid = ilbs->ilbs_sticky_timer_list[i].tid;
1436 		ilbs->ilbs_sticky_timer_list[i].tid = 0;
1437 		mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1438 		(void) untimeout(tid);
1439 	}
1440 	kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1441 	    ilb_sticky_timer_size);
1442 	taskq_destroy(ilbs->ilbs_sticky_taskq);
1443 	ilbs->ilbs_sticky_taskq = NULL;
1444 
1445 	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1446 		while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1447 		    != NULL) {
1448 			list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1449 			ILB_SERVER_REFRELE(s->server);
1450 			kmem_free(s, sizeof (ilb_sticky_t));
1451 		}
1452 	}
1453 	kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1454 	    sizeof (ilb_sticky_hash_t));
1455 }
1456 
1457 /*
1458  * This routine sends up the sticky hash table to user land.  Refer to
1459  * the comments before ilb_list_nat().  Both routines assume similar
1460  * conditions.
1461  *
1462  * It is assumed that the caller has checked the size of st so that it
1463  * can hold num entries.
1464  */
1465 /* ARGSUSED */
1466 int
1467 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1468     uint32_t *num, uint32_t *flags)
1469 {
1470 	ilb_sticky_hash_t *hash;
1471 	ilb_sticky_t *curp;
1472 	uint32_t i, j;
1473 	int ret = 0;
1474 
1475 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1476 	while (ilbs->ilbs_sticky_list_busy) {
1477 		if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1478 		    &ilbs->ilbs_sticky_list_lock) == 0) {
1479 			mutex_exit(&ilbs->ilbs_sticky_list_lock);
1480 			return (EINTR);
1481 		}
1482 	}
1483 	if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1484 		mutex_exit(&ilbs->ilbs_sticky_list_lock);
1485 		*num = 0;
1486 		*flags |= ILB_LIST_END;
1487 		return (0);
1488 	}
1489 	ilbs->ilbs_sticky_list_busy = B_TRUE;
1490 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1491 
1492 	if (*flags & ILB_LIST_BEGIN) {
1493 		i = 0;
1494 		mutex_enter(&hash[0].sticky_lock);
1495 		curp = list_head(&hash[0].sticky_head);
1496 	} else if (*flags & ILB_LIST_CONT) {
1497 		if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1498 			*num = 0;
1499 			*flags |= ILB_LIST_END;
1500 			goto done;
1501 		}
1502 		i = ilbs->ilbs_sticky_list_cur;
1503 		mutex_enter(&hash[i].sticky_lock);
1504 		curp = ilbs->ilbs_sticky_list_curp;
1505 	} else {
1506 		ret = EINVAL;
1507 		goto done;
1508 	}
1509 
1510 	j = 0;
1511 	while (j < *num) {
1512 		if (curp == NULL) {
1513 			mutex_exit(&hash[i].sticky_lock);
1514 			if (++i == ilbs->ilbs_sticky_hash_size) {
1515 				*flags |= ILB_LIST_END;
1516 				break;
1517 			}
1518 			mutex_enter(&hash[i].sticky_lock);
1519 			curp = list_head(&hash[i].sticky_head);
1520 			continue;
1521 		}
1522 		(void) strcpy(st[j].rule_name, curp->rule_name);
1523 		st[j].req_addr = curp->src;
1524 		st[j].srv_addr = curp->server->iser_addr_v6;
1525 		st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1526 		j++;
1527 		curp = list_next(&hash[i].sticky_head, curp);
1528 	}
1529 	ilbs->ilbs_sticky_list_curp = curp;
1530 	if (j == *num)
1531 		mutex_exit(&hash[i].sticky_lock);
1532 
1533 	ilbs->ilbs_sticky_list_cur = i;
1534 
1535 	*num = j;
1536 done:
1537 	mutex_enter(&ilbs->ilbs_sticky_list_lock);
1538 	ilbs->ilbs_sticky_list_busy = B_FALSE;
1539 	cv_signal(&ilbs->ilbs_sticky_list_cv);
1540 	mutex_exit(&ilbs->ilbs_sticky_list_lock);
1541 
1542 	return (ret);
1543 }
1544