1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2014 Joyent, Inc. All rights reserved.
26 */
27
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/time.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/sdt.h>
35 #include <sys/atomic.h>
36 #include <netinet/in.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <inet/tcp.h>
40 #include <inet/udp_impl.h>
41 #include <inet/ilb.h>
42
43 #include "ilb_stack.h"
44 #include "ilb_impl.h"
45 #include "ilb_conn.h"
46 #include "ilb_nat.h"
47
48 /*
49 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
50 *
51 * start: starting index into the hash table to do gc
52 * end: ending index into the hash table to do gc
53 * ilbs: pointer to the ilb_stack_t of the IP stack
54 * tid_lock: mutex to protect the timer id.
55 * tid: timer id of the timer
56 */
57 typedef struct ilb_timer_s {
58 uint32_t start;
59 uint32_t end;
60 ilb_stack_t *ilbs;
61 kmutex_t tid_lock;
62 timeout_id_t tid;
63 } ilb_timer_t;
64
65 /* Hash macro for finding the index to the conn hash table */
66 #define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size) \
67 (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \
68 (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \
69 (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \
70 (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \
71 ((hash_size) - 1))
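/*
 * Note on the macro above: the multipliers are powers of 37 (37,
 * 37^2 = 1369, 37^3 = 50653), so the four bytes pointed at by saddr
 * and daddr (the callers in this file pass the last 32 bits of the
 * possibly IPv4-mapped addresses) are mixed polynomially before the
 * ports are folded in and the result is masked with the power-of-2
 * table size.  ILB_STICKY_HASH below does the same with powers of 31.
 */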
72
73 /* Kmem cache for the conn hash entry */
74 static struct kmem_cache *ilb_conn_cache = NULL;
75
76 /*
77 * There are 60 timers running to do conn cache garbage collection. Each
78 * gc thread is responsible for 1/60 of the conn hash table.
79 */
80 static int ilb_conn_timer_size = 60;
81
82 /* Each of the above gc timers wakes up every 15s to do the gc. */
83 static int ilb_conn_cache_timeout = 15;
84
85 #define ILB_STICKY_HASH(saddr, rule, hash_size) \
86 (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \
87 (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \
88 (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \
89 (*(saddr) ^ (rule))) & ((hash_size) - 1))
90
91 static struct kmem_cache *ilb_sticky_cache = NULL;
92
93 /*
94 * There are 60 timers running to do sticky cache garbage collection. Each
95 * gc thread is responsible for 1/60 of the sticky hash table.
96 */
97 static int ilb_sticky_timer_size = 60;
98
99 /* Each of the above gc timers wakes up every 15s to do the gc. */
100 static int ilb_sticky_timeout = 15;
101
102 #define ILB_STICKY_REFRELE(s) \
103 { \
104 mutex_enter(&(s)->hash->sticky_lock); \
105 (s)->refcnt--; \
106 (s)->atime = ddi_get_lbolt64(); \
107 	mutex_exit(&(s)->hash->sticky_lock); \
108 }
109
110
111 static void
112 ilb_conn_cache_init(void)
113 {
114 ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
115 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
116 ilb_kmem_flags);
117 }
118
119 void
120 ilb_conn_cache_fini(void)
121 {
122 if (ilb_conn_cache != NULL) {
123 kmem_cache_destroy(ilb_conn_cache);
124 ilb_conn_cache = NULL;
125 }
126 }
127
128 static void
129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
130 {
131 ilb_conn_hash_t *hash;
132 ilb_conn_t **next, **prev;
133 ilb_conn_t **next_prev, **prev_next;
134
135 next_prev = NULL;
136 prev_next = NULL;
137
138 if (c2s) {
139 hash = connp->conn_c2s_hash;
140 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
141 next = &connp->conn_c2s_next;
142 prev = &connp->conn_c2s_prev;
143 if (*next != NULL)
144 next_prev = &(*next)->conn_c2s_prev;
145 if (*prev != NULL)
146 prev_next = &(*prev)->conn_c2s_next;
147 } else {
148 hash = connp->conn_s2c_hash;
149 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
150 next = &connp->conn_s2c_next;
151 prev = &connp->conn_s2c_prev;
152 if (*next != NULL)
153 next_prev = &(*next)->conn_s2c_prev;
154 if (*prev != NULL)
155 prev_next = &(*prev)->conn_s2c_next;
156 }
157
158 if (hash->ilb_connp == connp) {
159 hash->ilb_connp = *next;
160 if (*next != NULL)
161 *next_prev = NULL;
162 } else {
163 if (*prev != NULL)
164 *prev_next = *next;
165 if (*next != NULL)
166 *next_prev = *prev;
167 }
168 ASSERT(hash->ilb_conn_cnt > 0);
169 hash->ilb_conn_cnt--;
170
171 *next = NULL;
172 *prev = NULL;
173 }
174
175 static void
176 ilb_conn_remove(ilb_conn_t *connp)
177 {
178 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
179 ilb_conn_remove_common(connp, B_TRUE);
180 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
181 ilb_conn_remove_common(connp, B_FALSE);
182
183 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
184 in_port_t port;
185
186 port = ntohs(connp->conn_rule_cache.info.nat_sport);
187 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
188 (void *)(uintptr_t)port, 1);
189 }
190
191 if (connp->conn_sticky != NULL)
192 ILB_STICKY_REFRELE(connp->conn_sticky);
193 ILB_SERVER_REFRELE(connp->conn_server);
194 kmem_cache_free(ilb_conn_cache, connp);
195 }
196
197 /*
198 * Routine to do periodic garbage collection of conn hash entries. When
199 * a conn hash timer fires, it dispatches a taskq to call this function
200 * to do the gc. Note that each taskq is responsible for a portion of
201 * the table. The portion is stored in timer->start, timer->end.
202 */
203 static void
204 ilb_conn_cleanup(void *arg)
205 {
206 ilb_timer_t *timer = (ilb_timer_t *)arg;
207 uint32_t i;
208 ilb_stack_t *ilbs;
209 ilb_conn_hash_t *c2s_hash, *s2c_hash;
210 ilb_conn_t *connp, *nxt_connp;
211 int64_t now;
212 int64_t expiry;
213 boolean_t die_now;
214
215 ilbs = timer->ilbs;
216 c2s_hash = ilbs->ilbs_c2s_conn_hash;
217 ASSERT(c2s_hash != NULL);
218
219 now = ddi_get_lbolt64();
220 for (i = timer->start; i < timer->end; i++) {
221 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
222 if ((connp = c2s_hash[i].ilb_connp) == NULL) {
223 ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
224 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
225 continue;
226 }
227 do {
228 ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
229 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
230 nxt_connp = connp->conn_c2s_next;
231 expiry = now - SEC_TO_TICK(connp->conn_expiry);
232 if (connp->conn_server->iser_die_time != 0 &&
233 connp->conn_server->iser_die_time < now)
234 die_now = B_TRUE;
235 else
236 die_now = B_FALSE;
237 s2c_hash = connp->conn_s2c_hash;
238 mutex_enter(&s2c_hash->ilb_conn_hash_lock);
239
240 if (connp->conn_gc || die_now ||
241 (connp->conn_c2s_atime < expiry &&
242 connp->conn_s2c_atime < expiry)) {
243 /* Need to update the nat list cur_connp */
244 if (connp == ilbs->ilbs_conn_list_connp) {
245 ilbs->ilbs_conn_list_connp =
246 connp->conn_c2s_next;
247 }
248 ilb_conn_remove(connp);
249 goto nxt_connp;
250 }
251
252 if (connp->conn_l4 != IPPROTO_TCP)
253 goto nxt_connp;
254
255 /* Update and check TCP related conn info */
256 if (connp->conn_c2s_tcp_fin_sent &&
257 SEQ_GT(connp->conn_s2c_tcp_ack,
258 connp->conn_c2s_tcp_fss)) {
259 connp->conn_c2s_tcp_fin_acked = B_TRUE;
260 }
261 if (connp->conn_s2c_tcp_fin_sent &&
262 SEQ_GT(connp->conn_c2s_tcp_ack,
263 connp->conn_s2c_tcp_fss)) {
264 connp->conn_s2c_tcp_fin_acked = B_TRUE;
265 }
266 if (connp->conn_c2s_tcp_fin_acked &&
267 connp->conn_s2c_tcp_fin_acked) {
268 ilb_conn_remove(connp);
269 }
270 nxt_connp:
271 mutex_exit(&s2c_hash->ilb_conn_hash_lock);
272 connp = nxt_connp;
273 } while (connp != NULL);
274 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
275 }
276 }
277
278 /* Conn hash timer routine. It dispatches a taskq and restarts the timer. */
279 static void
280 ilb_conn_timer(void *arg)
281 {
282 ilb_timer_t *timer = (ilb_timer_t *)arg;
283
284 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
285 arg, TQ_SLEEP);
286 mutex_enter(&timer->tid_lock);
287 if (timer->tid == 0) {
288 mutex_exit(&timer->tid_lock);
289 } else {
290 timer->tid = timeout(ilb_conn_timer, arg,
291 SEC_TO_TICK(ilb_conn_cache_timeout));
292 mutex_exit(&timer->tid_lock);
293 }
294 }
295
296 void
297 ilb_conn_hash_init(ilb_stack_t *ilbs)
298 {
299 extern pri_t minclsyspri;
300 int i, part;
301 ilb_timer_t *tm;
302 char tq_name[TASKQ_NAMELEN];
303
304 /*
305 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
306 * the next power of 2.
307 */
308 if (!ISP2(ilbs->ilbs_conn_hash_size)) {
309 for (i = 0; i < 31; i++) {
310 if (ilbs->ilbs_conn_hash_size < (1 << i))
311 break;
312 }
313 ilbs->ilbs_conn_hash_size = 1 << i;
314 }
315
316 /*
317 * Can sleep since this should be called when a rule is being added,
318 * hence we are not in interrupt context.
319 */
320 ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
321 ilbs->ilbs_conn_hash_size, KM_SLEEP);
322 ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
323 ilbs->ilbs_conn_hash_size, KM_SLEEP);
324
325 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
326 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
327 NULL, MUTEX_DEFAULT, NULL);
328 }
329 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
330 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
331 NULL, MUTEX_DEFAULT, NULL);
332 }
333
334 if (ilb_conn_cache == NULL)
335 ilb_conn_cache_init();
336
337 (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
338 (void *)ilbs->ilbs_netstack);
339 ASSERT(ilbs->ilbs_conn_taskq == NULL);
340 ilbs->ilbs_conn_taskq = taskq_create(tq_name,
341 ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
342 ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
343
344 ASSERT(ilbs->ilbs_conn_timer_list == NULL);
345 ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
346 ilb_conn_timer_size, KM_SLEEP);
347
348 /*
349 * The hash table is divided into equal partitions for those timers
350 * to do garbage collection.
351 */
352 part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
353 for (i = 0; i < ilb_conn_timer_size; i++) {
354 tm = ilbs->ilbs_conn_timer_list + i;
355 tm->start = i * part;
356 tm->end = i * part + part;
357 if (tm->end > ilbs->ilbs_conn_hash_size)
358 tm->end = ilbs->ilbs_conn_hash_size;
359 tm->ilbs = ilbs;
360 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
361 /* Spread out the starting execution time of all the timers. */
362 tm->tid = timeout(ilb_conn_timer, tm,
363 SEC_TO_TICK(ilb_conn_cache_timeout + i));
364 }
365 }
366
367 void
368 ilb_conn_hash_fini(ilb_stack_t *ilbs)
369 {
370 uint32_t i;
371 ilb_conn_t *connp;
372 ilb_conn_hash_t *hash;
373
374 if (ilbs->ilbs_c2s_conn_hash == NULL) {
375 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
376 return;
377 }
378
379 /* Stop all the timers first. */
380 for (i = 0; i < ilb_conn_timer_size; i++) {
381 timeout_id_t tid;
382
383 /* Setting tid to 0 tells the timer handler not to restart. */
384 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
385 tid = ilbs->ilbs_conn_timer_list[i].tid;
386 ilbs->ilbs_conn_timer_list[i].tid = 0;
387 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
388 (void) untimeout(tid);
389 }
390 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
391 ilb_conn_timer_size);
392 taskq_destroy(ilbs->ilbs_conn_taskq);
393 ilbs->ilbs_conn_taskq = NULL;
394
395 /* Then remove all the conns. */
396 hash = ilbs->ilbs_s2c_conn_hash;
397 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
398 while ((connp = hash[i].ilb_connp) != NULL) {
399 hash[i].ilb_connp = connp->conn_s2c_next;
400 ILB_SERVER_REFRELE(connp->conn_server);
401 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
402 ilb_nat_src_entry_t *ent;
403 in_port_t port;
404
405 /*
406 * src_ent will be freed in ilb_nat_src_fini().
407 */
408 port = ntohs(
409 connp->conn_rule_cache.info.nat_sport);
410 ent = connp->conn_rule_cache.info.src_ent;
411 vmem_free(ent->nse_port_arena,
412 (void *)(uintptr_t)port, 1);
413 }
414 kmem_cache_free(ilb_conn_cache, connp);
415 }
416 }
417 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
418 ilbs->ilbs_conn_hash_size);
419 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
420 ilbs->ilbs_conn_hash_size);
421 }
422
423 /*
424 * Internet checksum adjustment calculation routines. We pre-calculate
425 * checksum adjustment so that we don't need to compute the checksum on
426 * the whole packet when we change address/port in the packet.
427 */
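/*
 * Illustrative sketch: a hypothetical helper showing how a pre-computed
 * adjustment of the form built below (one's complement of the sum of the
 * old 16-bit words plus the sum of the new 16-bit words) is typically
 * folded into an existing 16-bit Internet checksum (RFC 1624 style
 * incremental update).  The names here are made up for illustration;
 * in this file the pre-computed sums are passed to ilb_full_nat() and
 * ilb_half_nat().
 *
 *	static uint16_t
 *	cksum_adjust_apply(uint16_t old_cksum, uint32_t adj_sum)
 *	{
 *		uint32_t sum;
 *
 *		sum = (uint16_t)~old_cksum + adj_sum;
 *		while ((sum >> 16) != 0)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ((uint16_t)~sum);
 *	}
 */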
428
429 static void
430 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
431 in_port_t new_port, uint32_t *adj_sum)
432 {
433 uint32_t sum;
434
435 sum = *oaddr + *(oaddr + 1) + old_port;
436 while ((sum >> 16) != 0)
437 sum = (sum & 0xffff) + (sum >> 16);
438 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
439 }
440
441 static void
442 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
443 in_port_t new_port, uint32_t *adj_sum)
444 {
445 uint32_t sum = 0;
446
447 sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
448 *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
449 old_port;
450 while ((sum >> 16) != 0)
451 sum = (sum & 0xffff) + (sum >> 16);
452 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
453 *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
454 *(naddr + 6) + *(naddr + 7) + new_port;
455 }
456
457 static void
458 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
459 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
460 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
461 {
462 uint32_t sum;
463
464 sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
465 old_port2;
466 while ((sum >> 16) != 0)
467 sum = (sum & 0xffff) + (sum >> 16);
468 *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
469 *naddr2 + *(naddr2 + 1) + new_port2;
470 }
471
472 static void
473 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
474 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
475 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
476 {
477 uint32_t sum = 0;
478
479 sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
480 *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
481 old_port1;
482 sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
483 *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
484 old_port2;
485 while ((sum >> 16) != 0)
486 sum = (sum & 0xffff) + (sum >> 16);
487 sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
488 *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
489 *(naddr1 + 7) + new_port1;
490 *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
491 *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
492 *(naddr2 + 7) + new_port2;
493 }
494
495 /*
496 * Add a conn hash entry to the tables. Note that a conn hash entry
497 * (ilb_conn_t) contains info on both directions. And there are two hash
498 * tables, one for client to server and the other for server to client.
499 * So the same entry is added to both tables and can be accessed by two
500 * threads simultaneously. But each thread will only access data on one
501 * direction, so there is no conflict.
502 */
503 int
504 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
505 in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
506 ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
507 {
508 ilb_conn_t *connp;
509 ilb_conn_hash_t *hash;
510 int i;
511
512 connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
513 if (connp == NULL) {
514 if (s != NULL) {
515 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
516 ilb_nat_src_entry_t **entry;
517
518 entry = s->server->iser_nat_src->src_list;
519 vmem_free(entry[s->nat_src_idx]->nse_port_arena,
520 (void *)(uintptr_t)ntohs(info->nat_sport),
521 1);
522 }
523 ILB_STICKY_REFRELE(s);
524 }
525 return (ENOMEM);
526 }
527
528 connp->conn_l4 = rule->ir_proto;
529
530 connp->conn_server = server;
531 ILB_SERVER_REFHOLD(server);
532 connp->conn_sticky = s;
533
534 connp->conn_rule_cache.topo = rule->ir_topo;
535 connp->conn_rule_cache.info = *info;
536
537 connp->conn_gc = B_FALSE;
538
539 connp->conn_expiry = rule->ir_nat_expiry;
540 connp->conn_cr_time = ddi_get_lbolt64();
541
542 /* Client to server info. */
543 connp->conn_c2s_saddr = *src;
544 connp->conn_c2s_sport = sport;
545 connp->conn_c2s_daddr = *dst;
546 connp->conn_c2s_dport = dport;
547
548 connp->conn_c2s_atime = ddi_get_lbolt64();
549 	/* The packet that triggers this creation should be counted */
550 connp->conn_c2s_pkt_cnt = 1;
551 connp->conn_c2s_tcp_fin_sent = B_FALSE;
552 connp->conn_c2s_tcp_fin_acked = B_FALSE;
553
554 /* Server to client info, before NAT */
555 switch (rule->ir_topo) {
556 case ILB_TOPO_IMPL_HALF_NAT:
557 connp->conn_s2c_saddr = info->nat_dst;
558 connp->conn_s2c_sport = info->nat_dport;
559 connp->conn_s2c_daddr = *src;
560 connp->conn_s2c_dport = sport;
561
562 /* Pre-calculate checksum changes for both directions */
563 if (rule->ir_ipver == IPPROTO_IP) {
564 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
565 (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
566 &connp->conn_c2s_ip_sum);
567 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
568 (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
569 info->nat_dport, &connp->conn_c2s_tp_sum);
570 *ip_sum = connp->conn_c2s_ip_sum;
571 *tp_sum = connp->conn_c2s_tp_sum;
572
573 hnat_cksum_v4(
574 (uint16_t *)&info->nat_dst.s6_addr32[3],
575 (uint16_t *)&dst->s6_addr32[3], 0, 0,
576 &connp->conn_s2c_ip_sum);
577 hnat_cksum_v4(
578 (uint16_t *)&info->nat_dst.s6_addr32[3],
579 (uint16_t *)&dst->s6_addr32[3],
580 info->nat_dport, dport,
581 &connp->conn_s2c_tp_sum);
582 } else {
583 connp->conn_c2s_ip_sum = 0;
584 hnat_cksum_v6((uint16_t *)dst,
585 (uint16_t *)&info->nat_dst, dport,
586 info->nat_dport, &connp->conn_c2s_tp_sum);
587 *ip_sum = 0;
588 *tp_sum = connp->conn_c2s_tp_sum;
589
590 connp->conn_s2c_ip_sum = 0;
591 hnat_cksum_v6((uint16_t *)&info->nat_dst,
592 (uint16_t *)dst, info->nat_dport, dport,
593 &connp->conn_s2c_tp_sum);
594 }
595 break;
596 case ILB_TOPO_IMPL_NAT:
597 connp->conn_s2c_saddr = info->nat_dst;
598 connp->conn_s2c_sport = info->nat_dport;
599 connp->conn_s2c_daddr = info->nat_src;
600 connp->conn_s2c_dport = info->nat_sport;
601
602 if (rule->ir_ipver == IPPROTO_IP) {
603 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
604 (uint16_t *)&dst->s6_addr32[3],
605 (uint16_t *)&info->nat_src.s6_addr32[3],
606 (uint16_t *)&info->nat_dst.s6_addr32[3],
607 0, 0, 0, 0, &connp->conn_c2s_ip_sum);
608 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
609 (uint16_t *)&dst->s6_addr32[3],
610 (uint16_t *)&info->nat_src.s6_addr32[3],
611 (uint16_t *)&info->nat_dst.s6_addr32[3],
612 sport, dport, info->nat_sport,
613 info->nat_dport, &connp->conn_c2s_tp_sum);
614 *ip_sum = connp->conn_c2s_ip_sum;
615 *tp_sum = connp->conn_c2s_tp_sum;
616
617 fnat_cksum_v4(
618 (uint16_t *)&info->nat_src.s6_addr32[3],
619 (uint16_t *)&info->nat_dst.s6_addr32[3],
620 (uint16_t *)&src->s6_addr32[3],
621 (uint16_t *)&dst->s6_addr32[3],
622 0, 0, 0, 0, &connp->conn_s2c_ip_sum);
623 fnat_cksum_v4(
624 (uint16_t *)&info->nat_src.s6_addr32[3],
625 (uint16_t *)&info->nat_dst.s6_addr32[3],
626 (uint16_t *)&src->s6_addr32[3],
627 (uint16_t *)&dst->s6_addr32[3],
628 info->nat_sport, info->nat_dport,
629 sport, dport, &connp->conn_s2c_tp_sum);
630 } else {
631 fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
632 (uint16_t *)&info->nat_src,
633 (uint16_t *)&info->nat_dst,
634 sport, dport, info->nat_sport,
635 info->nat_dport, &connp->conn_c2s_tp_sum);
636 connp->conn_c2s_ip_sum = 0;
637 *ip_sum = 0;
638 *tp_sum = connp->conn_c2s_tp_sum;
639
640 fnat_cksum_v6((uint16_t *)&info->nat_src,
641 (uint16_t *)&info->nat_dst, (uint16_t *)src,
642 (uint16_t *)dst, info->nat_sport,
643 info->nat_dport, sport, dport,
644 &connp->conn_s2c_tp_sum);
645 connp->conn_s2c_ip_sum = 0;
646 }
647 break;
648 }
649
650 connp->conn_s2c_atime = ddi_get_lbolt64();
651 connp->conn_s2c_pkt_cnt = 1;
652 connp->conn_s2c_tcp_fin_sent = B_FALSE;
653 connp->conn_s2c_tcp_fin_acked = B_FALSE;
654
655 /* Add it to the s2c hash table. */
656 hash = ilbs->ilbs_s2c_conn_hash;
657 i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
658 ntohs(connp->conn_s2c_sport),
659 (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
660 ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
661 connp->conn_s2c_hash = &hash[i];
662 DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
663
664 mutex_enter(&hash[i].ilb_conn_hash_lock);
665 hash[i].ilb_conn_cnt++;
666 connp->conn_s2c_next = hash[i].ilb_connp;
667 if (hash[i].ilb_connp != NULL)
668 hash[i].ilb_connp->conn_s2c_prev = connp;
669 connp->conn_s2c_prev = NULL;
670 hash[i].ilb_connp = connp;
671 mutex_exit(&hash[i].ilb_conn_hash_lock);
672
673 /* Add it to the c2s hash table. */
674 hash = ilbs->ilbs_c2s_conn_hash;
675 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
676 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
677 ilbs->ilbs_conn_hash_size);
678 connp->conn_c2s_hash = &hash[i];
679 DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
680
681 mutex_enter(&hash[i].ilb_conn_hash_lock);
682 hash[i].ilb_conn_cnt++;
683 connp->conn_c2s_next = hash[i].ilb_connp;
684 if (hash[i].ilb_connp != NULL)
685 hash[i].ilb_connp->conn_c2s_prev = connp;
686 connp->conn_c2s_prev = NULL;
687 hash[i].ilb_connp = connp;
688 mutex_exit(&hash[i].ilb_conn_hash_lock);
689
690 return (0);
691 }
692
693 /*
694 * If a connection is using TCP, we keep track of simple TCP state transitions
695 * so that we know when to clean up an entry.
696 */
697 static boolean_t
698 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
699 boolean_t c2s)
700 {
701 uint32_t ack, seq;
702 int32_t seg_len;
703
704 ack = 0;
705 if (tcpha->tha_flags & TH_RST)
706 return (B_FALSE);
707
708 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
709 TCP_HDR_LENGTH((tcph_t *)tcpha);
710
711 if (tcpha->tha_flags & TH_ACK)
712 ack = ntohl(tcpha->tha_ack);
713 seq = ntohl(tcpha->tha_seq);
714 if (c2s) {
715 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
716 if (tcpha->tha_flags & TH_FIN) {
717 connp->conn_c2s_tcp_fss = seq + seg_len;
718 connp->conn_c2s_tcp_fin_sent = B_TRUE;
719 }
720 connp->conn_c2s_tcp_ack = ack;
721
722 /* Port reuse by the client, restart the conn. */
723 if (connp->conn_c2s_tcp_fin_sent &&
724 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
725 connp->conn_c2s_tcp_fin_sent = B_FALSE;
726 connp->conn_c2s_tcp_fin_acked = B_FALSE;
727 }
728 } else {
729 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
730 if (tcpha->tha_flags & TH_FIN) {
731 connp->conn_s2c_tcp_fss = seq + seg_len;
732 connp->conn_s2c_tcp_fin_sent = B_TRUE;
733 }
734 connp->conn_s2c_tcp_ack = ack;
735
736 /* Port reuse by the client, restart the conn. */
737 if (connp->conn_s2c_tcp_fin_sent &&
738 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
739 connp->conn_s2c_tcp_fin_sent = B_FALSE;
740 connp->conn_s2c_tcp_fin_acked = B_FALSE;
741 }
742 }
743
744 return (B_TRUE);
745 }
746
747 /*
748 * Helper routine to find a conn hash entry given some packet information and
749 * the traffic direction (c2s, client to server?)
750 */
751 static boolean_t
752 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
753 in_port_t sport, in6_addr_t *dst, in_port_t dport,
754 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
755 int32_t pkt_len, boolean_t c2s)
756 {
757 ilb_conn_hash_t *hash;
758 uint_t i;
759 ilb_conn_t *connp;
760 boolean_t tcp_alive;
761 boolean_t ret = B_FALSE;
762
763 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
764 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
765 ilbs->ilbs_conn_hash_size);
766 if (c2s) {
767 hash = ilbs->ilbs_c2s_conn_hash;
768 mutex_enter(&hash[i].ilb_conn_hash_lock);
769 for (connp = hash[i].ilb_connp; connp != NULL;
770 connp = connp->conn_c2s_next) {
771 if (connp->conn_l4 == l4 &&
772 connp->conn_c2s_dport == dport &&
773 connp->conn_c2s_sport == sport &&
774 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
775 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
776 connp->conn_c2s_atime = ddi_get_lbolt64();
777 connp->conn_c2s_pkt_cnt++;
778 *rule_cache = connp->conn_rule_cache;
779 *ip_sum = connp->conn_c2s_ip_sum;
780 *tp_sum = connp->conn_c2s_tp_sum;
781 ret = B_TRUE;
782 break;
783 }
784 }
785 } else {
786 hash = ilbs->ilbs_s2c_conn_hash;
787 mutex_enter(&hash[i].ilb_conn_hash_lock);
788 for (connp = hash[i].ilb_connp; connp != NULL;
789 connp = connp->conn_s2c_next) {
790 if (connp->conn_l4 == l4 &&
791 connp->conn_s2c_dport == dport &&
792 connp->conn_s2c_sport == sport &&
793 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
794 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
795 connp->conn_s2c_atime = ddi_get_lbolt64();
796 connp->conn_s2c_pkt_cnt++;
797 *rule_cache = connp->conn_rule_cache;
798 *ip_sum = connp->conn_s2c_ip_sum;
799 *tp_sum = connp->conn_s2c_tp_sum;
800 ret = B_TRUE;
801 break;
802 }
803 }
804 }
805 if (ret) {
806 ILB_S_KSTAT(connp->conn_server, pkt_processed);
807 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
808 pkt_len);
809
810 switch (l4) {
811 case (IPPROTO_TCP):
812 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
813 c2s);
814 if (!tcp_alive) {
815 connp->conn_gc = B_TRUE;
816 }
817 break;
818 default:
819 break;
820 }
821 }
822 mutex_exit(&hash[i].ilb_conn_hash_lock);
823
824 return (ret);
825 }
826
827 /*
828 * To check if a given packet matches an existing conn hash entry. If it
829 * does, return the information about this entry so that the caller can
830 * do the proper NAT.
831 */
832 boolean_t
833 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
834 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
835 uint32_t pkt_len, in6_addr_t *lb_dst)
836 {
837 ilb_rule_info_t rule_cache;
838 uint32_t adj_ip_sum, adj_tp_sum;
839 boolean_t ret;
840
841 /* Check the incoming hash table. */
842 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
843 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
844 switch (rule_cache.topo) {
845 case ILB_TOPO_IMPL_NAT:
846 *lb_dst = rule_cache.info.nat_dst;
847 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
848 adj_ip_sum, adj_tp_sum, B_TRUE);
849 ret = B_TRUE;
850 break;
851 case ILB_TOPO_IMPL_HALF_NAT:
852 *lb_dst = rule_cache.info.nat_dst;
853 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
854 adj_ip_sum, adj_tp_sum, B_TRUE);
855 ret = B_TRUE;
856 break;
857 default:
858 ret = B_FALSE;
859 break;
860 }
861 return (ret);
862 }
863 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
864 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
865 switch (rule_cache.topo) {
866 case ILB_TOPO_IMPL_NAT:
867 *lb_dst = rule_cache.info.src;
868 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
869 adj_ip_sum, adj_tp_sum, B_FALSE);
870 ret = B_TRUE;
871 break;
872 case ILB_TOPO_IMPL_HALF_NAT:
873 *lb_dst = *dst;
874 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
875 adj_ip_sum, adj_tp_sum, B_FALSE);
876 ret = B_TRUE;
877 break;
878 default:
879 ret = B_FALSE;
880 break;
881 }
882 return (ret);
883 }
884
885 return (B_FALSE);
886 }
887
888 /*
889 * To check if an ICMP packet belongs to a connection in one of the conn
890 * hash entries.
891 */
892 boolean_t
893 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
894 void *icmph, in6_addr_t *lb_dst)
895 {
896 ilb_conn_hash_t *hash;
897 ipha_t *in_iph4;
898 ip6_t *in_iph6;
899 icmph_t *icmph4;
900 icmp6_t *icmph6;
901 in6_addr_t *in_src_p, *in_dst_p;
902 in_port_t *sport, *dport;
903 int l4;
904 uint_t i;
905 ilb_conn_t *connp;
906 ilb_rule_info_t rule_cache;
907 uint32_t adj_ip_sum;
908 boolean_t full_nat;
909
910 in_iph4 = NULL;
911 in_iph6 = NULL;
912 icmph4 = NULL;
913 icmph6 = NULL;
914
915 if (l3 == IPPROTO_IP) {
916 in6_addr_t in_src, in_dst;
917
918 icmph4 = (icmph_t *)icmph;
919 in_iph4 = (ipha_t *)&icmph4[1];
920
921 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
922 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
923 return (B_FALSE);
924 }
925
926 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
927 in_src_p = &in_src;
928 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
929 in_dst_p = &in_dst;
930
931 l4 = in_iph4->ipha_protocol;
932 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
933 return (B_FALSE);
934
935 sport = (in_port_t *)((char *)in_iph4 +
936 IPH_HDR_LENGTH(in_iph4));
937 dport = sport + 1;
938
939 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
940 in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
941 ntohs(*sport), uint16_t, ntohs(*dport));
942 } else {
943 ASSERT(l3 == IPPROTO_IPV6);
944
945 icmph6 = (icmp6_t *)icmph;
946 in_iph6 = (ip6_t *)&icmph6[1];
947 in_src_p = &in_iph6->ip6_src;
948 in_dst_p = &in_iph6->ip6_dst;
949
950 if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
951 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
952 return (B_FALSE);
953 }
954
955 l4 = in_iph6->ip6_nxt;
956 /* We don't go deep inside an IPv6 packet yet. */
957 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
958 return (B_FALSE);
959
960 sport = (in_port_t *)&in_iph6[1];
961 dport = sport + 1;
962
963 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
964 &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
965 uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
966 }
967
968 i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
969 (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
970 ilbs->ilbs_conn_hash_size);
971 hash = ilbs->ilbs_c2s_conn_hash;
972
973 mutex_enter(&hash[i].ilb_conn_hash_lock);
974 for (connp = hash[i].ilb_connp; connp != NULL;
975 connp = connp->conn_c2s_next) {
976 if (connp->conn_l4 == l4 &&
977 connp->conn_c2s_dport == *sport &&
978 connp->conn_c2s_sport == *dport &&
979 IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
980 IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
981 connp->conn_c2s_atime = ddi_get_lbolt64();
982 connp->conn_c2s_pkt_cnt++;
983 rule_cache = connp->conn_rule_cache;
984 adj_ip_sum = connp->conn_c2s_ip_sum;
985 break;
986 }
987 }
988 mutex_exit(&hash[i].ilb_conn_hash_lock);
989
990 if (connp == NULL) {
991 DTRACE_PROBE(ilb__chk__icmp__conn__failed);
992 return (B_FALSE);
993 }
994
995 switch (rule_cache.topo) {
996 case ILB_TOPO_IMPL_NAT:
997 full_nat = B_TRUE;
998 break;
999 case ILB_TOPO_IMPL_HALF_NAT:
1000 full_nat = B_FALSE;
1001 break;
1002 default:
1003 return (B_FALSE);
1004 }
1005
1006 *lb_dst = rule_cache.info.nat_dst;
1007 if (l3 == IPPROTO_IP) {
1008 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
1009 &rule_cache.info, adj_ip_sum, full_nat);
1010 } else {
1011 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1012 &rule_cache.info, full_nat);
1013 }
1014 return (B_TRUE);
1015 }
1016
1017 /*
1018 * This routine sends up the conn hash table to user land. Note that the
1019 * request is an ioctl, hence we cannot really differentiate requests
1020 * from different clients. There is no context shared between different
1021 * ioctls. Here we make the assumption that the user land ilbd will
1022 * only allow one client to show the conn hash table at any time.
1023 * Otherwise, the results will be "very" inconsistent.
1024 *
1025 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1026 * to read from the beginning of the table. After a certain number of entries
1027 * are reported, the kernel remembers the position of the last returned
1028 * entry. When the next ioctl comes in with the ILB_LIST_BEGIN flag,
1029 * it will return entries starting from where it was left off. When
1030 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1031 * the client that there are no more entries.
1032 *
1033 * It is assumed that the caller has checked the size of nat so that it
1034 * can hold num entries.
1035 */
1036 /* ARGSUSED */
1037 int
1038 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1039 uint32_t *num, uint32_t *flags)
1040 {
1041 ilb_conn_hash_t *hash;
1042 ilb_conn_t *cur_connp;
1043 uint32_t i, j;
1044 int ret = 0;
1045
1046 mutex_enter(&ilbs->ilbs_conn_list_lock);
1047 while (ilbs->ilbs_conn_list_busy) {
1048 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1049 &ilbs->ilbs_conn_list_lock) == 0) {
1050 mutex_exit(&ilbs->ilbs_conn_list_lock);
1051 return (EINTR);
1052 }
1053 }
1054 if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1055 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1056 mutex_exit(&ilbs->ilbs_conn_list_lock);
1057 *num = 0;
1058 *flags |= ILB_LIST_END;
1059 return (0);
1060 }
1061 ilbs->ilbs_conn_list_busy = B_TRUE;
1062 mutex_exit(&ilbs->ilbs_conn_list_lock);
1063
1064 if (*flags & ILB_LIST_BEGIN) {
1065 i = 0;
1066 mutex_enter(&hash[0].ilb_conn_hash_lock);
1067 cur_connp = hash[0].ilb_connp;
1068 } else if (*flags & ILB_LIST_CONT) {
1069 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1070 *num = 0;
1071 *flags |= ILB_LIST_END;
1072 goto done;
1073 }
1074 i = ilbs->ilbs_conn_list_cur;
1075 mutex_enter(&hash[i].ilb_conn_hash_lock);
1076 cur_connp = ilbs->ilbs_conn_list_connp;
1077 } else {
1078 ret = EINVAL;
1079 goto done;
1080 }
1081
1082 j = 0;
1083 while (j < *num) {
1084 if (cur_connp == NULL) {
1085 mutex_exit(&hash[i].ilb_conn_hash_lock);
1086 if (++i == ilbs->ilbs_conn_hash_size) {
1087 *flags |= ILB_LIST_END;
1088 break;
1089 }
1090 mutex_enter(&hash[i].ilb_conn_hash_lock);
1091 cur_connp = hash[i].ilb_connp;
1092 continue;
1093 }
1094 nat[j].proto = cur_connp->conn_l4;
1095
1096 nat[j].in_global = cur_connp->conn_c2s_daddr;
1097 nat[j].in_global_port = cur_connp->conn_c2s_dport;
1098 nat[j].out_global = cur_connp->conn_c2s_saddr;
1099 nat[j].out_global_port = cur_connp->conn_c2s_sport;
1100
1101 nat[j].in_local = cur_connp->conn_s2c_saddr;
1102 nat[j].in_local_port = cur_connp->conn_s2c_sport;
1103 nat[j].out_local = cur_connp->conn_s2c_daddr;
1104 nat[j].out_local_port = cur_connp->conn_s2c_dport;
1105
1106 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1107 nat[j].last_access_time =
1108 TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1109
1110 /*
1111 * The conn_s2c_pkt_cnt may not be accurate since we are not
1112 * holding the s2c hash lock.
1113 */
1114 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1115 cur_connp->conn_s2c_pkt_cnt;
1116 j++;
1117
1118 cur_connp = cur_connp->conn_c2s_next;
1119 }
1120 ilbs->ilbs_conn_list_connp = cur_connp;
1121 if (j == *num)
1122 mutex_exit(&hash[i].ilb_conn_hash_lock);
1123
1124 ilbs->ilbs_conn_list_cur = i;
1125
1126 *num = j;
1127 done:
1128 mutex_enter(&ilbs->ilbs_conn_list_lock);
1129 ilbs->ilbs_conn_list_busy = B_FALSE;
1130 cv_signal(&ilbs->ilbs_conn_list_cv);
1131 mutex_exit(&ilbs->ilbs_conn_list_lock);
1132
1133 return (ret);
1134 }
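/*
 * Illustrative sketch of the iteration protocol described above, written
 * as a hypothetical in-kernel caller; "buf", "buf_entries" and consume()
 * are made-up names and error handling is elided:
 *
 *	uint32_t flags = ILB_LIST_BEGIN;
 *	uint32_t cnt;
 *
 *	for (;;) {
 *		cnt = buf_entries;
 *		if (ilb_list_nat(ilbs, zoneid, buf, &cnt, &flags) != 0)
 *			break;
 *		consume(buf, cnt);
 *		if (flags & ILB_LIST_END)
 *			break;
 *		flags = ILB_LIST_CONT;
 *	}
 */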
1135
1136
1137 /*
1138 * Stickiness (persistence) handling routines.
1139 */
1140
1141
1142 static void
1143 ilb_sticky_cache_init(void)
1144 {
1145 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1146 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1147 ilb_kmem_flags);
1148 }
1149
1150 void
1151 ilb_sticky_cache_fini(void)
1152 {
1153 if (ilb_sticky_cache != NULL) {
1154 kmem_cache_destroy(ilb_sticky_cache);
1155 ilb_sticky_cache = NULL;
1156 }
1157 }
1158
1159 void
1160 ilb_sticky_refrele(ilb_sticky_t *s)
1161 {
1162 ILB_STICKY_REFRELE(s);
1163 }
1164
1165 static ilb_sticky_t *
1166 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1167 {
1168 ilb_sticky_t *s;
1169
1170 ASSERT(mutex_owned(&hash->sticky_lock));
1171
1172 for (s = list_head(&hash->sticky_head); s != NULL;
1173 s = list_next(&hash->sticky_head, s)) {
1174 if (s->rule_instance == rule->ir_ks_instance) {
1175 if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1176 return (s);
1177 }
1178 }
1179 return (NULL);
1180 }
1181
1182 static ilb_sticky_t *
1183 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1184 in6_addr_t *src)
1185 {
1186 ilb_sticky_t *s;
1187
1188 ASSERT(mutex_owned(&hash->sticky_lock));
1189
1190 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1191 return (NULL);
1192
1193 /*
1194 * The rule instance is for handling the scenario when the same
1195 * client talks to different rules at the same time. Stickiness
1196 * is per rule so we can use the rule instance to differentiate
1197 * the client's request.
1198 */
1199 s->rule_instance = rule->ir_ks_instance;
1200 /*
1201 * Copy the rule name for listing all sticky cache entries. ir_name
1202 * is guaranteed to be NULL terminated.
1203 */
1204 (void) strcpy(s->rule_name, rule->ir_name);
1205 s->server = server;
1206
1207 /*
1208 * Grab a ref cnt on the server so that it won't go away while
1209 * it is still in the sticky table.
1210 */
1211 ILB_SERVER_REFHOLD(server);
1212 s->src = *src;
1213 s->expiry = rule->ir_sticky_expiry;
1214 s->refcnt = 1;
1215 s->hash = hash;
1216
1217 /*
1218 * There is no need to set atime here since the refcnt is not
1219 * zero. A sticky entry is removed only when the refcnt is
1220 * zero. But just set it here for debugging purposes. The
1221 * atime is set when a refrele is done on a sticky entry.
1222 */
1223 s->atime = ddi_get_lbolt64();
1224
1225 list_insert_head(&hash->sticky_head, s);
1226 hash->sticky_cnt++;
1227 return (s);
1228 }
1229
1230 /*
1231 * This routine checks if there is an existing sticky entry which matches
1232 * a given packet. If there is one, return it. If there is not, create
1233 * a sticky entry using the packet's info.
1234 */
1235 ilb_server_t *
1236 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1237 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1238 {
1239 int i;
1240 ilb_sticky_hash_t *hash;
1241 ilb_sticky_t *s;
1242
1243 ASSERT(server != NULL);
1244
1245 *res = NULL;
1246
1247 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1248 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1249 hash = &ilbs->ilbs_sticky_hash[i];
1250
1251 /* First check if there is already an entry. */
1252 mutex_enter(&hash->sticky_lock);
1253 s = ilb_sticky_lookup(hash, rule, src);
1254
1255 /* No sticky entry, add one. */
1256 if (s == NULL) {
1257 add_new_entry:
1258 s = ilb_sticky_add(hash, rule, server, src);
1259 if (s == NULL) {
1260 mutex_exit(&hash->sticky_lock);
1261 return (NULL);
1262 }
1263 /*
1264 * Find a source for this server. All subsequent requests from
1265 * the same client matching this sticky entry will use this
1266 * source address in doing NAT. The current algorithm is
1267 * simple, rotate the source address. Note that the
1268 * source address array does not change after it's created, so
1269 * it is OK to just increment the cur index.
1270 */
1271 if (server->iser_nat_src != NULL) {
1272 /* It is a hint, does not need to be atomic. */
1273 *src_ent_idx = (server->iser_nat_src->cur++ %
1274 server->iser_nat_src->num_src);
1275 s->nat_src_idx = *src_ent_idx;
1276 }
1277 mutex_exit(&hash->sticky_lock);
1278 *res = s;
1279 return (server);
1280 }
1281
1282 /*
1283 * We don't hold any lock accessing iser_enabled. Refer to the
1284 * comment in ilb_server_add() about iser_lock.
1285 */
1286 if (!s->server->iser_enabled) {
1287 /*
1288 * s->server == server can only happen if there is a race in
1289 * toggling the iser_enabled flag (we don't hold a lock doing
1290 * that) so that the load balance algorithm still returns a
1291 * disabled server. In this case, just drop the packet...
1292 */
1293 if (s->server == server) {
1294 mutex_exit(&hash->sticky_lock);
1295 return (NULL);
1296 }
1297
1298 /*
1299 * The old server is disabled and there is a new server, use
1300 * the new one to create a sticky entry. Since we will
1301 * add the entry at the beginning, subsequent lookup will
1302 * find this new entry instead of the old one.
1303 */
1304 goto add_new_entry;
1305 }
1306
1307 s->refcnt++;
1308 *res = s;
1309 mutex_exit(&hash->sticky_lock);
1310 if (server->iser_nat_src != NULL)
1311 *src_ent_idx = s->nat_src_idx;
1312 return (s->server);
1313 }
1314
1315 static void
1316 ilb_sticky_cleanup(void *arg)
1317 {
1318 ilb_timer_t *timer = (ilb_timer_t *)arg;
1319 uint32_t i;
1320 ilb_stack_t *ilbs;
1321 ilb_sticky_hash_t *hash;
1322 ilb_sticky_t *s, *nxt_s;
1323 int64_t now, expiry;
1324
1325 ilbs = timer->ilbs;
1326 hash = ilbs->ilbs_sticky_hash;
1327 ASSERT(hash != NULL);
1328
1329 now = ddi_get_lbolt64();
1330 for (i = timer->start; i < timer->end; i++) {
1331 mutex_enter(&hash[i].sticky_lock);
1332 for (s = list_head(&hash[i].sticky_head); s != NULL;
1333 s = nxt_s) {
1334 nxt_s = list_next(&hash[i].sticky_head, s);
1335 if (s->refcnt != 0)
1336 continue;
1337 expiry = now - SEC_TO_TICK(s->expiry);
1338 if (s->atime < expiry) {
1339 ILB_SERVER_REFRELE(s->server);
1340 list_remove(&hash[i].sticky_head, s);
1341 kmem_cache_free(ilb_sticky_cache, s);
1342 hash[i].sticky_cnt--;
1343 }
1344 }
1345 mutex_exit(&hash[i].sticky_lock);
1346 }
1347 }
1348
1349 static void
1350 ilb_sticky_timer(void *arg)
1351 {
1352 ilb_timer_t *timer = (ilb_timer_t *)arg;
1353
1354 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1355 ilb_sticky_cleanup, arg, TQ_SLEEP);
1356 mutex_enter(&timer->tid_lock);
1357 if (timer->tid == 0) {
1358 mutex_exit(&timer->tid_lock);
1359 } else {
1360 timer->tid = timeout(ilb_sticky_timer, arg,
1361 SEC_TO_TICK(ilb_sticky_timeout));
1362 mutex_exit(&timer->tid_lock);
1363 }
1364 }
1365
1366 void
1367 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1368 {
1369 extern pri_t minclsyspri;
1370 int i, part;
1371 char tq_name[TASKQ_NAMELEN];
1372 ilb_timer_t *tm;
1373
1374 if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1375 for (i = 0; i < 31; i++) {
1376 if (ilbs->ilbs_sticky_hash_size < (1 << i))
1377 break;
1378 }
1379 ilbs->ilbs_sticky_hash_size = 1 << i;
1380 }
1381
1382 ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1383 ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1384 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1385 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1386 MUTEX_DEFAULT, NULL);
1387 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1388 sizeof (ilb_sticky_t),
1389 offsetof(ilb_sticky_t, list));
1390 }
1391
1392 if (ilb_sticky_cache == NULL)
1393 ilb_sticky_cache_init();
1394
1395 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1396 (void *)ilbs->ilbs_netstack);
1397 ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1398 ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1399 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1400 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1401
1402 ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1403 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1404 ilb_sticky_timer_size, KM_SLEEP);
1405 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1406 for (i = 0; i < ilb_sticky_timer_size; i++) {
1407 tm = ilbs->ilbs_sticky_timer_list + i;
1408 tm->start = i * part;
1409 tm->end = i * part + part;
1410 if (tm->end > ilbs->ilbs_sticky_hash_size)
1411 tm->end = ilbs->ilbs_sticky_hash_size;
1412 tm->ilbs = ilbs;
1413 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1414 /* Spread out the starting execution time of all the timers. */
1415 tm->tid = timeout(ilb_sticky_timer, tm,
1416 SEC_TO_TICK(ilb_sticky_timeout + i));
1417 }
1418 }
1419
1420 void
1421 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1422 {
1423 int i;
1424 ilb_sticky_t *s;
1425
1426 if (ilbs->ilbs_sticky_hash == NULL)
1427 return;
1428
1429 /* Stop all the timers first. */
1430 for (i = 0; i < ilb_sticky_timer_size; i++) {
1431 timeout_id_t tid;
1432
1433 /* Setting tid to 0 tells the timer handler not to restart. */
1434 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1435 tid = ilbs->ilbs_sticky_timer_list[i].tid;
1436 ilbs->ilbs_sticky_timer_list[i].tid = 0;
1437 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1438 (void) untimeout(tid);
1439 }
1440 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1441 ilb_sticky_timer_size);
1442 taskq_destroy(ilbs->ilbs_sticky_taskq);
1443 ilbs->ilbs_sticky_taskq = NULL;
1444
1445 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1446 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1447 != NULL) {
1448 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1449 ILB_SERVER_REFRELE(s->server);
1450 			kmem_cache_free(ilb_sticky_cache, s);
1451 }
1452 }
1453 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1454 sizeof (ilb_sticky_hash_t));
1455 }
1456
1457 /*
1458 * This routine sends up the sticky hash table to user land. Refer to
1459 * the comments before ilb_list_nat(). Both routines assume similar
1460 * conditions.
1461 *
1462 * It is assumed that the caller has checked the size of st so that it
1463 * can hold num entries.
1464 */
1465 /* ARGSUSED */
1466 int
1467 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1468 uint32_t *num, uint32_t *flags)
1469 {
1470 ilb_sticky_hash_t *hash;
1471 ilb_sticky_t *curp;
1472 uint32_t i, j;
1473 int ret = 0;
1474
1475 mutex_enter(&ilbs->ilbs_sticky_list_lock);
1476 while (ilbs->ilbs_sticky_list_busy) {
1477 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1478 &ilbs->ilbs_sticky_list_lock) == 0) {
1479 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1480 return (EINTR);
1481 }
1482 }
1483 if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1484 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1485 *num = 0;
1486 *flags |= ILB_LIST_END;
1487 return (0);
1488 }
1489 ilbs->ilbs_sticky_list_busy = B_TRUE;
1490 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1491
1492 if (*flags & ILB_LIST_BEGIN) {
1493 i = 0;
1494 mutex_enter(&hash[0].sticky_lock);
1495 curp = list_head(&hash[0].sticky_head);
1496 } else if (*flags & ILB_LIST_CONT) {
1497 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1498 *num = 0;
1499 *flags |= ILB_LIST_END;
1500 goto done;
1501 }
1502 i = ilbs->ilbs_sticky_list_cur;
1503 mutex_enter(&hash[i].sticky_lock);
1504 curp = ilbs->ilbs_sticky_list_curp;
1505 } else {
1506 ret = EINVAL;
1507 goto done;
1508 }
1509
1510 j = 0;
1511 while (j < *num) {
1512 if (curp == NULL) {
1513 mutex_exit(&hash[i].sticky_lock);
1514 if (++i == ilbs->ilbs_sticky_hash_size) {
1515 *flags |= ILB_LIST_END;
1516 break;
1517 }
1518 mutex_enter(&hash[i].sticky_lock);
1519 curp = list_head(&hash[i].sticky_head);
1520 continue;
1521 }
1522 (void) strcpy(st[j].rule_name, curp->rule_name);
1523 st[j].req_addr = curp->src;
1524 st[j].srv_addr = curp->server->iser_addr_v6;
1525 st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1526 j++;
1527 curp = list_next(&hash[i].sticky_head, curp);
1528 }
1529 ilbs->ilbs_sticky_list_curp = curp;
1530 if (j == *num)
1531 mutex_exit(&hash[i].sticky_lock);
1532
1533 ilbs->ilbs_sticky_list_cur = i;
1534
1535 *num = j;
1536 done:
1537 mutex_enter(&ilbs->ilbs_sticky_list_lock);
1538 ilbs->ilbs_sticky_list_busy = B_FALSE;
1539 cv_signal(&ilbs->ilbs_sticky_list_cv);
1540 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1541
1542 return (ret);
1543 }
1544