1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2014 Joyent, Inc. All rights reserved. 26 */ 27 28 #include <sys/sysmacros.h> 29 #include <sys/types.h> 30 #include <sys/conf.h> 31 #include <sys/time.h> 32 #include <sys/taskq.h> 33 #include <sys/cmn_err.h> 34 #include <sys/sdt.h> 35 #include <sys/atomic.h> 36 #include <netinet/in.h> 37 #include <inet/ip.h> 38 #include <inet/ip6.h> 39 #include <inet/tcp.h> 40 #include <inet/udp_impl.h> 41 #include <inet/ilb.h> 42 43 #include "ilb_stack.h" 44 #include "ilb_impl.h" 45 #include "ilb_conn.h" 46 #include "ilb_nat.h" 47 48 /* 49 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection 50 * 51 * start: starting index into the hash table to do gc 52 * end: ending index into the hash table to do gc 53 * ilbs: pointer to the ilb_stack_t of the IP stack 54 * tid_lock: mutex to protect the timer id. 
 * tid: timer id of the timer
 */
typedef struct ilb_timer_s {
    uint32_t start;
    uint32_t end;
    ilb_stack_t *ilbs;
    kmutex_t tid_lock;
    timeout_id_t tid;
} ilb_timer_t;

/*
 * Hash macro for finding the index to the conn hash table.
 *
 * Mixes the four bytes of the last 32-bit word of the source and
 * destination addresses (callers pass a pointer to s6_addr32[3], i.e. the
 * IPv4 address for v4-mapped addresses) using powers of 37 (50653 = 37^3,
 * 1369 = 37^2), plus the two ports.  Because the result is masked with
 * (hash_size - 1), hash_size must be a power of 2 (ilb_conn_hash_init()
 * rounds it up).
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
    (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \
    (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \
    (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \
    (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \
    ((hash_size) - 1))

/* Kmem cache for the conn hash entry */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the conn hash table.
 */
static int ilb_conn_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;

/*
 * Hash macro for the sticky table: mixes the last four address bytes with
 * the bytes of the 32-bit rule instance id, using powers of 31
 * (29791 = 31^3, 961 = 31^2).  As above, hash_size must be a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
    (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \
    (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \
    (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \
    (*(saddr) ^ (rule))) & ((hash_size) - 1))

static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc.
*/ 100 static int ilb_sticky_timeout = 15; 101 102 #define ILB_STICKY_REFRELE(s) \ 103 { \ 104 mutex_enter(&(s)->hash->sticky_lock); \ 105 (s)->refcnt--; \ 106 (s)->atime = ddi_get_lbolt64(); \ 107 mutex_exit(&s->hash->sticky_lock); \ 108 } 109 110 111 static void 112 ilb_conn_cache_init(void) 113 { 114 ilb_conn_cache = kmem_cache_create("ilb_conn_cache", 115 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL, 116 ilb_kmem_flags); 117 } 118 119 void 120 ilb_conn_cache_fini(void) 121 { 122 if (ilb_conn_cache != NULL) { 123 kmem_cache_destroy(ilb_conn_cache); 124 ilb_conn_cache = NULL; 125 } 126 } 127 128 static void 129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s) 130 { 131 ilb_conn_hash_t *hash; 132 ilb_conn_t **next, **prev; 133 ilb_conn_t **next_prev, **prev_next; 134 135 next_prev = NULL; 136 prev_next = NULL; 137 138 if (c2s) { 139 hash = connp->conn_c2s_hash; 140 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 141 next = &connp->conn_c2s_next; 142 prev = &connp->conn_c2s_prev; 143 if (*next != NULL) 144 next_prev = &(*next)->conn_c2s_prev; 145 if (*prev != NULL) 146 prev_next = &(*prev)->conn_c2s_next; 147 } else { 148 hash = connp->conn_s2c_hash; 149 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 150 next = &connp->conn_s2c_next; 151 prev = &connp->conn_s2c_prev; 152 if (*next != NULL) 153 next_prev = &(*next)->conn_s2c_prev; 154 if (*prev != NULL) 155 prev_next = &(*prev)->conn_s2c_next; 156 } 157 158 if (hash->ilb_connp == connp) { 159 hash->ilb_connp = *next; 160 if (*next != NULL) 161 *next_prev = NULL; 162 } else { 163 if (*prev != NULL) 164 *prev_next = *next; 165 if (*next != NULL) 166 *next_prev = *prev; 167 } 168 ASSERT(hash->ilb_conn_cnt > 0); 169 hash->ilb_conn_cnt--; 170 171 *next = NULL; 172 *prev = NULL; 173 } 174 175 static void 176 ilb_conn_remove(ilb_conn_t *connp) 177 { 178 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 179 ilb_conn_remove_common(connp, B_TRUE); 180 
ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 181 ilb_conn_remove_common(connp, B_FALSE); 182 183 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 184 in_port_t port; 185 186 port = ntohs(connp->conn_rule_cache.info.nat_sport); 187 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena, 188 (void *)(uintptr_t)port, 1); 189 } 190 191 if (connp->conn_sticky != NULL) 192 ILB_STICKY_REFRELE(connp->conn_sticky); 193 ILB_SERVER_REFRELE(connp->conn_server); 194 kmem_cache_free(ilb_conn_cache, connp); 195 } 196 197 /* 198 * Routine to do periodic garbage collection of conn hash entries. When 199 * a conn hash timer fires, it dispatches a taskq to call this function 200 * to do the gc. Note that each taskq is responisble for a portion of 201 * the table. The portion is stored in timer->start, timer->end. 202 */ 203 static void 204 ilb_conn_cleanup(void *arg) 205 { 206 ilb_timer_t *timer = (ilb_timer_t *)arg; 207 uint32_t i; 208 ilb_stack_t *ilbs; 209 ilb_conn_hash_t *c2s_hash, *s2c_hash; 210 ilb_conn_t *connp, *nxt_connp; 211 int64_t now; 212 int64_t expiry; 213 boolean_t die_now; 214 215 ilbs = timer->ilbs; 216 c2s_hash = ilbs->ilbs_c2s_conn_hash; 217 ASSERT(c2s_hash != NULL); 218 219 now = ddi_get_lbolt64(); 220 for (i = timer->start; i < timer->end; i++) { 221 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock); 222 if ((connp = c2s_hash[i].ilb_connp) == NULL) { 223 ASSERT(c2s_hash[i].ilb_conn_cnt == 0); 224 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 225 continue; 226 } 227 do { 228 ASSERT(c2s_hash[i].ilb_conn_cnt > 0); 229 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]); 230 nxt_connp = connp->conn_c2s_next; 231 expiry = now - SEC_TO_TICK(connp->conn_expiry); 232 if (connp->conn_server->iser_die_time != 0 && 233 connp->conn_server->iser_die_time < now) 234 die_now = B_TRUE; 235 else 236 die_now = B_FALSE; 237 s2c_hash = connp->conn_s2c_hash; 238 mutex_enter(&s2c_hash->ilb_conn_hash_lock); 239 240 if (connp->conn_gc || die_now || 241 
(connp->conn_c2s_atime < expiry && 242 connp->conn_s2c_atime < expiry)) { 243 /* Need to update the nat list cur_connp */ 244 if (connp == ilbs->ilbs_conn_list_connp) { 245 ilbs->ilbs_conn_list_connp = 246 connp->conn_c2s_next; 247 } 248 ilb_conn_remove(connp); 249 goto nxt_connp; 250 } 251 252 if (connp->conn_l4 != IPPROTO_TCP) 253 goto nxt_connp; 254 255 /* Update and check TCP related conn info */ 256 if (connp->conn_c2s_tcp_fin_sent && 257 SEQ_GT(connp->conn_s2c_tcp_ack, 258 connp->conn_c2s_tcp_fss)) { 259 connp->conn_c2s_tcp_fin_acked = B_TRUE; 260 } 261 if (connp->conn_s2c_tcp_fin_sent && 262 SEQ_GT(connp->conn_c2s_tcp_ack, 263 connp->conn_s2c_tcp_fss)) { 264 connp->conn_s2c_tcp_fin_acked = B_TRUE; 265 } 266 if (connp->conn_c2s_tcp_fin_acked && 267 connp->conn_s2c_tcp_fin_acked) { 268 ilb_conn_remove(connp); 269 } 270 nxt_connp: 271 mutex_exit(&s2c_hash->ilb_conn_hash_lock); 272 connp = nxt_connp; 273 } while (connp != NULL); 274 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 275 } 276 } 277 278 /* Conn hash timer routine. It dispatches a taskq and restart the timer */ 279 static void 280 ilb_conn_timer(void *arg) 281 { 282 ilb_timer_t *timer = (ilb_timer_t *)arg; 283 284 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup, 285 arg, TQ_SLEEP); 286 mutex_enter(&timer->tid_lock); 287 if (timer->tid == 0) { 288 mutex_exit(&timer->tid_lock); 289 } else { 290 timer->tid = timeout(ilb_conn_timer, arg, 291 SEC_TO_TICK(ilb_conn_cache_timeout)); 292 mutex_exit(&timer->tid_lock); 293 } 294 } 295 296 void 297 ilb_conn_hash_init(ilb_stack_t *ilbs) 298 { 299 extern pri_t minclsyspri; 300 int i, part; 301 ilb_timer_t *tm; 302 char tq_name[TASKQ_NAMELEN]; 303 304 /* 305 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to 306 * the next power of 2. 
307 */ 308 if (!ISP2(ilbs->ilbs_conn_hash_size)) { 309 for (i = 0; i < 31; i++) { 310 if (ilbs->ilbs_conn_hash_size < (1 << i)) 311 break; 312 } 313 ilbs->ilbs_conn_hash_size = 1 << i; 314 } 315 316 /* 317 * Can sleep since this should be called when a rule is being added, 318 * hence we are not in interrupt context. 319 */ 320 ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 321 ilbs->ilbs_conn_hash_size, KM_SLEEP); 322 ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 323 ilbs->ilbs_conn_hash_size, KM_SLEEP); 324 325 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 326 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock, 327 NULL, MUTEX_DEFAULT, NULL); 328 } 329 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 330 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock, 331 NULL, MUTEX_DEFAULT, NULL); 332 } 333 334 if (ilb_conn_cache == NULL) 335 ilb_conn_cache_init(); 336 337 (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p", 338 (void *)ilbs->ilbs_netstack); 339 ASSERT(ilbs->ilbs_conn_taskq == NULL); 340 ilbs->ilbs_conn_taskq = taskq_create(tq_name, 341 ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size, 342 ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 343 344 ASSERT(ilbs->ilbs_conn_timer_list == NULL); 345 ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 346 ilb_conn_timer_size, KM_SLEEP); 347 348 /* 349 * The hash table is divided in equal partition for those timers 350 * to do garbage collection. 351 */ 352 part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1; 353 for (i = 0; i < ilb_conn_timer_size; i++) { 354 tm = ilbs->ilbs_conn_timer_list + i; 355 tm->start = i * part; 356 tm->end = i * part + part; 357 if (tm->end > ilbs->ilbs_conn_hash_size) 358 tm->end = ilbs->ilbs_conn_hash_size; 359 tm->ilbs = ilbs; 360 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 361 /* Spread out the starting execution time of all the timers. 
*/ 362 tm->tid = timeout(ilb_conn_timer, tm, 363 SEC_TO_TICK(ilb_conn_cache_timeout + i)); 364 } 365 } 366 367 void 368 ilb_conn_hash_fini(ilb_stack_t *ilbs) 369 { 370 uint32_t i; 371 ilb_conn_t *connp; 372 ilb_conn_hash_t *hash; 373 374 if (ilbs->ilbs_c2s_conn_hash == NULL) { 375 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 376 return; 377 } 378 379 /* Stop all the timers first. */ 380 for (i = 0; i < ilb_conn_timer_size; i++) { 381 timeout_id_t tid; 382 383 /* Setting tid to 0 tells the timer handler not to restart. */ 384 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock); 385 tid = ilbs->ilbs_conn_timer_list[i].tid; 386 ilbs->ilbs_conn_timer_list[i].tid = 0; 387 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock); 388 (void) untimeout(tid); 389 } 390 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) * 391 ilb_conn_timer_size); 392 taskq_destroy(ilbs->ilbs_conn_taskq); 393 ilbs->ilbs_conn_taskq = NULL; 394 395 /* Then remove all the conns. */ 396 hash = ilbs->ilbs_s2c_conn_hash; 397 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 398 while ((connp = hash[i].ilb_connp) != NULL) { 399 hash[i].ilb_connp = connp->conn_s2c_next; 400 ILB_SERVER_REFRELE(connp->conn_server); 401 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 402 ilb_nat_src_entry_t *ent; 403 in_port_t port; 404 405 /* 406 * src_ent will be freed in ilb_nat_src_fini(). 407 */ 408 port = ntohs( 409 connp->conn_rule_cache.info.nat_sport); 410 ent = connp->conn_rule_cache.info.src_ent; 411 vmem_free(ent->nse_port_arena, 412 (void *)(uintptr_t)port, 1); 413 } 414 kmem_cache_free(ilb_conn_cache, connp); 415 } 416 } 417 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) * 418 ilbs->ilbs_conn_hash_size); 419 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) * 420 ilbs->ilbs_conn_hash_size); 421 } 422 423 /* 424 * Internet checksum adjustment calculation routines. 
 * We pre-calculate
 * checksum adjustment so that we don't need to compute the checksum on
 * the whole packet when we change address/port in the packet.
 *
 * The helpers below keep 16-bit one's-complement partial sums (note the
 * carry-folding loops and the final (uint16_t)~sum) in the style of
 * incremental checksum update (cf. RFC 1624).
 */

/*
 * Half-NAT, IPv4: adjustment for replacing one address (2 x 16-bit words
 * at oaddr) and one port.  Result stored in *adj_sum.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
    uint32_t sum;

    sum = *oaddr + *(oaddr + 1) + old_port;
    while ((sum >> 16) != 0)
        sum = (sum & 0xffff) + (sum >> 16);
    *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
}

/*
 * Half-NAT, IPv6: same as above but over a full 128-bit address
 * (8 x 16-bit words).
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
    uint32_t sum = 0;

    sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
        *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
        old_port;
    while ((sum >> 16) != 0)
        sum = (sum & 0xffff) + (sum >> 16);
    *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
        *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
        *(naddr + 6) + *(naddr + 7) + new_port;
}

/*
 * Full-NAT, IPv4: adjustment for replacing both addresses and both ports
 * in one step.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
    uint32_t sum;

    sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
        old_port2;
    while ((sum >> 16) != 0)
        sum = (sum & 0xffff) + (sum >> 16);
    *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
        *naddr2 + *(naddr2 + 1) + new_port2;
}

/*
 * Full-NAT, IPv6: adjustment for replacing both 128-bit addresses and
 * both ports.
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
    uint32_t sum = 0;

    sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
        *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
        old_port1;
    sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
        *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
        old_port2;
    while ((sum >> 16) != 0)
        sum = (sum & 0xffff) + (sum >> 16);
    sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
        *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
        *(naddr1 + 7) + new_port1;
    *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
        *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
        *(naddr2 + 7) + new_port2;
}

/*
 * Add a conn hash entry to the tables.  Note that a conn hash entry
 * (ilb_conn_t) contains info on both directions.  And there are two hash
 * tables, one for client to server and the other for server to client.
 * So the same entry is added to both tables and can be accessed by two
 * threads simultaneously.  But each thread will only access data on one
 * direction, so there is no conflict.
 *
 * On allocation failure the caller's sticky ref (and, for full NAT, the
 * NAT source port already reserved in info) are released here and ENOMEM
 * is returned.
 */
int
ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
    in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
{
    ilb_conn_t *connp;
    ilb_conn_hash_t *hash;
    int i;

    connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
    if (connp == NULL) {
        if (s != NULL) {
            if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
                ilb_nat_src_entry_t **entry;

                /* Return the reserved NAT source port. */
                entry = s->server->iser_nat_src->src_list;
                vmem_free(entry[s->nat_src_idx]->nse_port_arena,
                    (void *)(uintptr_t)ntohs(info->nat_sport),
                    1);
            }
            ILB_STICKY_REFRELE(s);
        }
        return (ENOMEM);
    }

    connp->conn_l4 = rule->ir_proto;

    connp->conn_server = server;
    ILB_SERVER_REFHOLD(server);
    connp->conn_sticky = s;

    connp->conn_rule_cache.topo = rule->ir_topo;
    connp->conn_rule_cache.info = *info;

    connp->conn_gc = B_FALSE;

    connp->conn_expiry = rule->ir_nat_expiry;
    connp->conn_cr_time = ddi_get_lbolt64();

    /* Client to server info. */
    connp->conn_c2s_saddr = *src;
    connp->conn_c2s_sport = sport;
    connp->conn_c2s_daddr = *dst;
    connp->conn_c2s_dport = dport;

    connp->conn_c2s_atime = ddi_get_lbolt64();
    /* The packet that triggers this creation should be counted */
    connp->conn_c2s_pkt_cnt = 1;
    connp->conn_c2s_tcp_fin_sent = B_FALSE;
    connp->conn_c2s_tcp_fin_acked = B_FALSE;

    /* Server to client info, before NAT */
    switch (rule->ir_topo) {
    case ILB_TOPO_IMPL_HALF_NAT:
        connp->conn_s2c_saddr = info->nat_dst;
        connp->conn_s2c_sport = info->nat_dport;
        connp->conn_s2c_daddr = *src;
        connp->conn_s2c_dport = sport;

        /* Pre-calculate checksum changes for both directions */
        if (rule->ir_ipver == IPPROTO_IP) {
            hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
                &connp->conn_c2s_ip_sum);
            hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
                info->nat_dport, &connp->conn_c2s_tp_sum);
            *ip_sum = connp->conn_c2s_ip_sum;
            *tp_sum = connp->conn_c2s_tp_sum;

            hnat_cksum_v4(
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3], 0, 0,
                &connp->conn_s2c_ip_sum);
            hnat_cksum_v4(
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3],
                info->nat_dport, dport,
                &connp->conn_s2c_tp_sum);
        } else {
            /* IPv6 has no IP header checksum. */
            connp->conn_c2s_ip_sum = 0;
            hnat_cksum_v6((uint16_t *)dst,
                (uint16_t *)&info->nat_dst, dport,
                info->nat_dport, &connp->conn_c2s_tp_sum);
            *ip_sum = 0;
            *tp_sum = connp->conn_c2s_tp_sum;

            connp->conn_s2c_ip_sum = 0;
            hnat_cksum_v6((uint16_t *)&info->nat_dst,
                (uint16_t *)dst, info->nat_dport, dport,
                &connp->conn_s2c_tp_sum);
        }
        break;
    case ILB_TOPO_IMPL_NAT:
        connp->conn_s2c_saddr = info->nat_dst;
        connp->conn_s2c_sport = info->nat_dport;
        connp->conn_s2c_daddr = info->nat_src;
        connp->conn_s2c_dport = info->nat_sport;

        if (rule->ir_ipver == IPPROTO_IP) {
            fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3],
                (uint16_t *)&info->nat_src.s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                0, 0, 0, 0, &connp->conn_c2s_ip_sum);
            fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3],
                (uint16_t *)&info->nat_src.s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                sport, dport, info->nat_sport,
                info->nat_dport, &connp->conn_c2s_tp_sum);
            *ip_sum = connp->conn_c2s_ip_sum;
            *tp_sum = connp->conn_c2s_tp_sum;

            fnat_cksum_v4(
                (uint16_t *)&info->nat_src.s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                (uint16_t *)&src->s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3],
                0, 0, 0, 0, &connp->conn_s2c_ip_sum);
            fnat_cksum_v4(
                (uint16_t *)&info->nat_src.s6_addr32[3],
                (uint16_t *)&info->nat_dst.s6_addr32[3],
                (uint16_t *)&src->s6_addr32[3],
                (uint16_t *)&dst->s6_addr32[3],
                info->nat_sport, info->nat_dport,
                sport, dport, &connp->conn_s2c_tp_sum);
        } else {
            fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
                (uint16_t *)&info->nat_src,
                (uint16_t *)&info->nat_dst,
                sport, dport, info->nat_sport,
                info->nat_dport, &connp->conn_c2s_tp_sum);
            connp->conn_c2s_ip_sum = 0;
            *ip_sum = 0;
            *tp_sum = connp->conn_c2s_tp_sum;

            fnat_cksum_v6((uint16_t *)&info->nat_src,
                (uint16_t *)&info->nat_dst, (uint16_t *)src,
                (uint16_t *)dst, info->nat_sport,
                info->nat_dport, sport, dport,
                &connp->conn_s2c_tp_sum);
            connp->conn_s2c_ip_sum = 0;
        }
        break;
    }

    connp->conn_s2c_atime = ddi_get_lbolt64();
    connp->conn_s2c_pkt_cnt = 1;
    connp->conn_s2c_tcp_fin_sent = B_FALSE;
    connp->conn_s2c_tcp_fin_acked = B_FALSE;

    /* Add it to the s2c hash table. */
    hash = ilbs->ilbs_s2c_conn_hash;
    i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
        ntohs(connp->conn_s2c_sport),
        (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
        ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
    connp->conn_s2c_hash = &hash[i];
    DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);

    mutex_enter(&hash[i].ilb_conn_hash_lock);
    hash[i].ilb_conn_cnt++;
    connp->conn_s2c_next = hash[i].ilb_connp;
    if (hash[i].ilb_connp != NULL)
        hash[i].ilb_connp->conn_s2c_prev = connp;
    connp->conn_s2c_prev = NULL;
    hash[i].ilb_connp = connp;
    mutex_exit(&hash[i].ilb_conn_hash_lock);

    /* Add it to the c2s hash table. */
    hash = ilbs->ilbs_c2s_conn_hash;
    i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
        (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
        ilbs->ilbs_conn_hash_size);
    connp->conn_c2s_hash = &hash[i];
    DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);

    mutex_enter(&hash[i].ilb_conn_hash_lock);
    hash[i].ilb_conn_cnt++;
    connp->conn_c2s_next = hash[i].ilb_connp;
    if (hash[i].ilb_connp != NULL)
        hash[i].ilb_connp->conn_c2s_prev = connp;
    connp->conn_c2s_prev = NULL;
    hash[i].ilb_connp = connp;
    mutex_exit(&hash[i].ilb_conn_hash_lock);

    return (0);
}

/*
 * If a connection is using TCP, we keep track of simple TCP state transition
 * so that we know when to clean up an entry.
 */

/*
 * Record TCP state seen in this segment on the conn entry: FIN sequence
 * numbers and the latest ACK for the given direction.  Returns B_FALSE on
 * RST (caller then marks the conn for gc), B_TRUE otherwise.  The bucket
 * lock for the given direction must be held.
 */
static boolean_t
update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
    boolean_t c2s)
{
    uint32_t ack, seq;
    int32_t seg_len;

    ack = 0;
    if (tcpha->tha_flags & TH_RST)
        return (B_FALSE);

    /* Payload length = packet length - IP header - TCP header. */
    seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
        TCP_HDR_LENGTH((tcph_t *)tcpha);

    if (tcpha->tha_flags & TH_ACK)
        ack = ntohl(tcpha->tha_ack);
    seq = ntohl(tcpha->tha_seq);
    if (c2s) {
        ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
        if (tcpha->tha_flags & TH_FIN) {
            connp->conn_c2s_tcp_fss = seq + seg_len;
            connp->conn_c2s_tcp_fin_sent = B_TRUE;
        }
        connp->conn_c2s_tcp_ack = ack;

        /* Port reuse by the client, restart the conn. */
        if (connp->conn_c2s_tcp_fin_sent &&
            SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
            connp->conn_c2s_tcp_fin_sent = B_FALSE;
            connp->conn_c2s_tcp_fin_acked = B_FALSE;
        }
    } else {
        ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
        if (tcpha->tha_flags & TH_FIN) {
            connp->conn_s2c_tcp_fss = seq + seg_len;
            connp->conn_s2c_tcp_fin_sent = B_TRUE;
        }
        connp->conn_s2c_tcp_ack = ack;

        /* Data past the FIN (server side), restart the conn. */
        if (connp->conn_s2c_tcp_fin_sent &&
            SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
            connp->conn_s2c_tcp_fin_sent = B_FALSE;
            connp->conn_s2c_tcp_fin_acked = B_FALSE;
        }
    }

    return (B_TRUE);
}

/*
 * Helper routine to find conn hash entry given some packet information and
 * the traffic direction (c2s, client to server?).  On a hit it also updates
 * the direction's access time and packet count, bumps the server kstats,
 * runs the TCP state machine (marking the conn for gc on RST) and copies
 * out the cached rule info and checksum adjustments.
 */
static boolean_t
ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
    in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
    int32_t pkt_len, boolean_t c2s)
{
    ilb_conn_hash_t *hash;
    uint_t i;
    ilb_conn_t *connp;
    boolean_t tcp_alive;
    boolean_t ret = B_FALSE;

    i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
        (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
        ilbs->ilbs_conn_hash_size);
    if (c2s) {
        hash = ilbs->ilbs_c2s_conn_hash;
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        for (connp = hash[i].ilb_connp; connp != NULL;
            connp = connp->conn_c2s_next) {
            if (connp->conn_l4 == l4 &&
                connp->conn_c2s_dport == dport &&
                connp->conn_c2s_sport == sport &&
                IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
                IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
                connp->conn_c2s_atime = ddi_get_lbolt64();
                connp->conn_c2s_pkt_cnt++;
                *rule_cache = connp->conn_rule_cache;
                *ip_sum = connp->conn_c2s_ip_sum;
                *tp_sum = connp->conn_c2s_tp_sum;
                ret = B_TRUE;
                break;
            }
        }
    } else {
        hash = ilbs->ilbs_s2c_conn_hash;
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        for (connp = hash[i].ilb_connp; connp != NULL;
            connp = connp->conn_s2c_next) {
            if (connp->conn_l4 == l4 &&
                connp->conn_s2c_dport == dport &&
                connp->conn_s2c_sport == sport &&
                IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
                IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
                connp->conn_s2c_atime = ddi_get_lbolt64();
                connp->conn_s2c_pkt_cnt++;
                *rule_cache = connp->conn_rule_cache;
                *ip_sum = connp->conn_s2c_ip_sum;
                *tp_sum = connp->conn_s2c_tp_sum;
                ret = B_TRUE;
                break;
            }
        }
    }
    if (ret) {
        ILB_S_KSTAT(connp->conn_server, pkt_processed);
        ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
            pkt_len);

        switch (l4) {
        case (IPPROTO_TCP):
            tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
                c2s);
            if (!tcp_alive) {
                /* RST seen; let the gc timer reclaim it. */
                connp->conn_gc = B_TRUE;
            }
            break;
        default:
            break;
        }
    }
    mutex_exit(&hash[i].ilb_conn_hash_lock);

    return (ret);
}

/*
 * To check if a given packet matches an existing conn hash entry.  If it
 * does, return the information about this entry so that the caller can
 * do the proper NAT.
 */
boolean_t
ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
    in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
    uint32_t pkt_len, in6_addr_t *lb_dst)
{
    ilb_rule_info_t rule_cache;
    uint32_t adj_ip_sum, adj_tp_sum;
    boolean_t ret;

    /* Check the incoming (client to server) hash table. */
    if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
        &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
        switch (rule_cache.topo) {
        case ILB_TOPO_IMPL_NAT:
            *lb_dst = rule_cache.info.nat_dst;
            ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
                adj_ip_sum, adj_tp_sum, B_TRUE);
            ret = B_TRUE;
            break;
        case ILB_TOPO_IMPL_HALF_NAT:
            *lb_dst = rule_cache.info.nat_dst;
            ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
                adj_ip_sum, adj_tp_sum, B_TRUE);
            ret = B_TRUE;
            break;
        default:
            ret = B_FALSE;
            break;
        }
        return (ret);
    }
    /* Then the server-to-client table. */
    if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
        &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
        switch (rule_cache.topo) {
        case ILB_TOPO_IMPL_NAT:
            *lb_dst = rule_cache.info.src;
            ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
                adj_ip_sum, adj_tp_sum, B_FALSE);
            ret = B_TRUE;
            break;
        case ILB_TOPO_IMPL_HALF_NAT:
            *lb_dst = *dst;
            ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
                adj_ip_sum, adj_tp_sum, B_FALSE);
            ret = B_TRUE;
            break;
        default:
            ret = B_FALSE;
            break;
        }
        return (ret);
    }

    return (B_FALSE);
}

/*
 * To check if an ICMP packet belongs to a connection in one of the conn
 * hash entries.  The embedded (offending) IP header is parsed and the
 * lookup is done against the c2s table with the embedded addresses and
 * ports swapped, since the embedded packet travelled client-to-server.
 */
boolean_t
ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
    void *icmph, in6_addr_t *lb_dst)
{
    ilb_conn_hash_t *hash;
    ipha_t *in_iph4;
    ip6_t *in_iph6;
    icmph_t *icmph4;
    icmp6_t *icmph6;
    in6_addr_t *in_src_p, *in_dst_p;
    in_port_t *sport, *dport;
    int l4;
    uint_t i;
    ilb_conn_t *connp;
    ilb_rule_info_t rule_cache;
    uint32_t adj_ip_sum;
    boolean_t full_nat;

    in_iph4 = NULL;
    in_iph6 = NULL;
    icmph4 = NULL;
    icmph6 = NULL;

    if (l3 == IPPROTO_IP) {
        in6_addr_t in_src, in_dst;

        icmph4 = (icmph_t *)icmph;
        in_iph4 = (ipha_t *)&icmph4[1];

        /* Make sure the embedded IP header + ports are in the mblk. */
        if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
            ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
            return (B_FALSE);
        }

        IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
        in_src_p = &in_src;
        IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
        in_dst_p = &in_dst;

        l4 = in_iph4->ipha_protocol;
        if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
            return (B_FALSE);

        /* Source and destination ports lead the transport header. */
        sport = (in_port_t *)((char *)in_iph4 +
            IPH_HDR_LENGTH(in_iph4));
        dport = sport + 1;

        DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
            in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
            ntohs(*sport), uint16_t, ntohs(*dport));
    } else {
        ASSERT(l3 == IPPROTO_IPV6);

        icmph6 = (icmp6_t *)icmph;
        in_iph6 = (ip6_t *)&icmph6[1];
        in_src_p = &in_iph6->ip6_src;
        in_dst_p = &in_iph6->ip6_dst;

        if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
            ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
            return (B_FALSE);
        }

        l4 = in_iph6->ip6_nxt;
        /* We don't go deep inside an IPv6 packet yet. */
        if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
            return (B_FALSE);

        sport = (in_port_t *)&in_iph6[1];
        dport = sport + 1;

        DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
            &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
            uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
    }

    /*
     * The embedded packet was sent c2s, so look it up with src/dst and
     * the ports swapped relative to the outer (returning) packet.
     */
    i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
        (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
        ilbs->ilbs_conn_hash_size);
    hash = ilbs->ilbs_c2s_conn_hash;

    mutex_enter(&hash[i].ilb_conn_hash_lock);
    for (connp = hash[i].ilb_connp; connp != NULL;
        connp = connp->conn_c2s_next) {
        if (connp->conn_l4 == l4 &&
            connp->conn_c2s_dport == *sport &&
            connp->conn_c2s_sport == *dport &&
            IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
            IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
            connp->conn_c2s_atime = ddi_get_lbolt64();
            connp->conn_c2s_pkt_cnt++;
            rule_cache = connp->conn_rule_cache;
            adj_ip_sum = connp->conn_c2s_ip_sum;
            break;
        }
    }
    mutex_exit(&hash[i].ilb_conn_hash_lock);

    if (connp == NULL) {
        DTRACE_PROBE(ilb__chk__icmp__conn__failed);
        return (B_FALSE);
    }

    switch (rule_cache.topo) {
    case ILB_TOPO_IMPL_NAT:
        full_nat = B_TRUE;
        break;
    case ILB_TOPO_IMPL_HALF_NAT:
        full_nat = B_FALSE;
        break;
    default:
        return (B_FALSE);
    }

    *lb_dst = rule_cache.info.nat_dst;
    if (l3 == IPPROTO_IP) {
        ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
            &rule_cache.info, adj_ip_sum, full_nat);
    } else {
        ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
            &rule_cache.info, full_nat);
    }
    return (B_TRUE);
}

/*
 * This routine sends up the conn hash table to user land.  Note that the
 * request is an ioctl, hence we cannot really differentiate requests
 * from different clients.  There is no context shared between different
 * ioctls.  Here we make the assumption that the user land ilbd will
 * only allow one client to show the conn hash table at any time.
 * Otherwise, the results will be "very" inconsistent.
 *
 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
 * to read from the beginning of the table.  After a certain number of
 * entries are reported, the kernel remembers the position of the last
 * returned entry.  When the next ioctl comes in with the ILB_LIST_CONT
 * flag, it will return entries starting from where it was left off.  When
 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
 * the client that there is no more entry.
 *
 * It is assumed that the caller has checked the size of nat so that it
 * can hold num entries.
 */
/* ARGSUSED */
int
ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
    uint32_t *num, uint32_t *flags)
{
    ilb_conn_hash_t *hash;
    ilb_conn_t *cur_connp;
    uint32_t i, j;
    int ret = 0;

    /* Serialize against other listers; interruptible wait. */
    mutex_enter(&ilbs->ilbs_conn_list_lock);
    while (ilbs->ilbs_conn_list_busy) {
        if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
            &ilbs->ilbs_conn_list_lock) == 0) {
            mutex_exit(&ilbs->ilbs_conn_list_lock);
            return (EINTR);
        }
    }
    if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
        ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
        mutex_exit(&ilbs->ilbs_conn_list_lock);
        *num = 0;
        *flags |= ILB_LIST_END;
        return (0);
    }
    ilbs->ilbs_conn_list_busy = B_TRUE;
    mutex_exit(&ilbs->ilbs_conn_list_lock);

    if (*flags & ILB_LIST_BEGIN) {
        i = 0;
        mutex_enter(&hash[0].ilb_conn_hash_lock);
        cur_connp = hash[0].ilb_connp;
    } else if (*flags & ILB_LIST_CONT) {
        if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
            *num = 0;
            *flags |= ILB_LIST_END;
            goto done;
        }
        i = ilbs->ilbs_conn_list_cur;
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        cur_connp = ilbs->ilbs_conn_list_connp;
    } else {
        ret = EINVAL;
        goto done;
    }

    j = 0;
    while (j < *num) {
        if (cur_connp == NULL) {
            /* End of this bucket; move to the next one. */
            mutex_exit(&hash[i].ilb_conn_hash_lock);
            if (++i == ilbs->ilbs_conn_hash_size) {
                *flags |= ILB_LIST_END;
                break;
            }
            mutex_enter(&hash[i].ilb_conn_hash_lock);
            cur_connp = hash[i].ilb_connp;
            continue;
        }
        nat[j].proto = cur_connp->conn_l4;

        nat[j].in_global = cur_connp->conn_c2s_daddr;
        nat[j].in_global_port = cur_connp->conn_c2s_dport;
        nat[j].out_global = cur_connp->conn_c2s_saddr;
        nat[j].out_global_port = cur_connp->conn_c2s_sport;

        nat[j].in_local = cur_connp->conn_s2c_saddr;
        nat[j].in_local_port = cur_connp->conn_s2c_sport;
        nat[j].out_local = cur_connp->conn_s2c_daddr;
        nat[j].out_local_port = cur_connp->conn_s2c_dport;

        nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
        nat[j].last_access_time =
            TICK_TO_MSEC(cur_connp->conn_c2s_atime);

        /*
         * The conn_s2c_pkt_cnt may not be accurate since we are not
         * holding the s2c hash lock.
         */
        nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
            cur_connp->conn_s2c_pkt_cnt;
        j++;

        cur_connp = cur_connp->conn_c2s_next;
    }
    ilbs->ilbs_conn_list_connp = cur_connp;
    /* Buffer filled mid-bucket; the bucket lock is still held. */
    if (j == *num)
        mutex_exit(&hash[i].ilb_conn_hash_lock);

    ilbs->ilbs_conn_list_cur = i;

    *num = j;
done:
    mutex_enter(&ilbs->ilbs_conn_list_lock);
    ilbs->ilbs_conn_list_busy = B_FALSE;
    cv_signal(&ilbs->ilbs_conn_list_cv);
    mutex_exit(&ilbs->ilbs_conn_list_lock);

    return (ret);
}


/*
 * Stickiness (persistence) handling routines.
1139 */ 1140 1141 1142 static void 1143 ilb_sticky_cache_init(void) 1144 { 1145 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache", 1146 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL, 1147 ilb_kmem_flags); 1148 } 1149 1150 void 1151 ilb_sticky_cache_fini(void) 1152 { 1153 if (ilb_sticky_cache != NULL) { 1154 kmem_cache_destroy(ilb_sticky_cache); 1155 ilb_sticky_cache = NULL; 1156 } 1157 } 1158 1159 void 1160 ilb_sticky_refrele(ilb_sticky_t *s) 1161 { 1162 ILB_STICKY_REFRELE(s); 1163 } 1164 1165 static ilb_sticky_t * 1166 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src) 1167 { 1168 ilb_sticky_t *s; 1169 1170 ASSERT(mutex_owned(&hash->sticky_lock)); 1171 1172 for (s = list_head(&hash->sticky_head); s != NULL; 1173 s = list_next(&hash->sticky_head, s)) { 1174 if (s->rule_instance == rule->ir_ks_instance) { 1175 if (IN6_ARE_ADDR_EQUAL(src, &s->src)) 1176 return (s); 1177 } 1178 } 1179 return (NULL); 1180 } 1181 1182 static ilb_sticky_t * 1183 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server, 1184 in6_addr_t *src) 1185 { 1186 ilb_sticky_t *s; 1187 1188 ASSERT(mutex_owned(&hash->sticky_lock)); 1189 1190 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL) 1191 return (NULL); 1192 1193 /* 1194 * The rule instance is for handling the scenario when the same 1195 * client talks to different rules at the same time. Stickiness 1196 * is per rule so we can use the rule instance to differentiate 1197 * the client's request. 1198 */ 1199 s->rule_instance = rule->ir_ks_instance; 1200 /* 1201 * Copy the rule name for listing all sticky cache entry. ir_name 1202 * is guaranteed to be NULL terminated. 1203 */ 1204 (void) strcpy(s->rule_name, rule->ir_name); 1205 s->server = server; 1206 1207 /* 1208 * Grab a ref cnt on the server so that it won't go away while 1209 * it is still in the sticky table. 
1210 */ 1211 ILB_SERVER_REFHOLD(server); 1212 s->src = *src; 1213 s->expiry = rule->ir_sticky_expiry; 1214 s->refcnt = 1; 1215 s->hash = hash; 1216 1217 /* 1218 * There is no need to set atime here since the refcnt is not 1219 * zero. A sticky entry is removed only when the refcnt is 1220 * zero. But just set it here for debugging purpose. The 1221 * atime is set when a refrele is done on a sticky entry. 1222 */ 1223 s->atime = ddi_get_lbolt64(); 1224 1225 list_insert_head(&hash->sticky_head, s); 1226 hash->sticky_cnt++; 1227 return (s); 1228 } 1229 1230 /* 1231 * This routine checks if there is an existing sticky entry which matches 1232 * a given packet. If there is one, return it. If there is not, create 1233 * a sticky entry using the packet's info. 1234 */ 1235 ilb_server_t * 1236 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src, 1237 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx) 1238 { 1239 int i; 1240 ilb_sticky_hash_t *hash; 1241 ilb_sticky_t *s; 1242 1243 ASSERT(server != NULL); 1244 1245 *res = NULL; 1246 1247 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3], 1248 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size); 1249 hash = &ilbs->ilbs_sticky_hash[i]; 1250 1251 /* First check if there is already an entry. */ 1252 mutex_enter(&hash->sticky_lock); 1253 s = ilb_sticky_lookup(hash, rule, src); 1254 1255 /* No sticky entry, add one. */ 1256 if (s == NULL) { 1257 add_new_entry: 1258 s = ilb_sticky_add(hash, rule, server, src); 1259 if (s == NULL) { 1260 mutex_exit(&hash->sticky_lock); 1261 return (NULL); 1262 } 1263 /* 1264 * Find a source for this server. All subseqent requests from 1265 * the same client matching this sticky entry will use this 1266 * source address in doing NAT. The current algorithm is 1267 * simple, rotate the source address. Note that the 1268 * source address array does not change after it's created, so 1269 * it is OK to just increment the cur index. 
1270 */ 1271 if (server->iser_nat_src != NULL) { 1272 /* It is a hint, does not need to be atomic. */ 1273 *src_ent_idx = (server->iser_nat_src->cur++ % 1274 server->iser_nat_src->num_src); 1275 s->nat_src_idx = *src_ent_idx; 1276 } 1277 mutex_exit(&hash->sticky_lock); 1278 *res = s; 1279 return (server); 1280 } 1281 1282 /* 1283 * We don't hold any lock accessing iser_enabled. Refer to the 1284 * comment in ilb_server_add() about iser_lock. 1285 */ 1286 if (!s->server->iser_enabled) { 1287 /* 1288 * s->server == server can only happen if there is a race in 1289 * toggling the iser_enabled flag (we don't hold a lock doing 1290 * that) so that the load balance algorithm still returns a 1291 * disabled server. In this case, just drop the packet... 1292 */ 1293 if (s->server == server) { 1294 mutex_exit(&hash->sticky_lock); 1295 return (NULL); 1296 } 1297 1298 /* 1299 * The old server is disabled and there is a new server, use 1300 * the new one to create a sticky entry. Since we will 1301 * add the entry at the beginning, subsequent lookup will 1302 * find this new entry instead of the old one. 
1303 */ 1304 goto add_new_entry; 1305 } 1306 1307 s->refcnt++; 1308 *res = s; 1309 mutex_exit(&hash->sticky_lock); 1310 if (server->iser_nat_src != NULL) 1311 *src_ent_idx = s->nat_src_idx; 1312 return (s->server); 1313 } 1314 1315 static void 1316 ilb_sticky_cleanup(void *arg) 1317 { 1318 ilb_timer_t *timer = (ilb_timer_t *)arg; 1319 uint32_t i; 1320 ilb_stack_t *ilbs; 1321 ilb_sticky_hash_t *hash; 1322 ilb_sticky_t *s, *nxt_s; 1323 int64_t now, expiry; 1324 1325 ilbs = timer->ilbs; 1326 hash = ilbs->ilbs_sticky_hash; 1327 ASSERT(hash != NULL); 1328 1329 now = ddi_get_lbolt64(); 1330 for (i = timer->start; i < timer->end; i++) { 1331 mutex_enter(&hash[i].sticky_lock); 1332 for (s = list_head(&hash[i].sticky_head); s != NULL; 1333 s = nxt_s) { 1334 nxt_s = list_next(&hash[i].sticky_head, s); 1335 if (s->refcnt != 0) 1336 continue; 1337 expiry = now - SEC_TO_TICK(s->expiry); 1338 if (s->atime < expiry) { 1339 ILB_SERVER_REFRELE(s->server); 1340 list_remove(&hash[i].sticky_head, s); 1341 kmem_cache_free(ilb_sticky_cache, s); 1342 hash[i].sticky_cnt--; 1343 } 1344 } 1345 mutex_exit(&hash[i].sticky_lock); 1346 } 1347 } 1348 1349 static void 1350 ilb_sticky_timer(void *arg) 1351 { 1352 ilb_timer_t *timer = (ilb_timer_t *)arg; 1353 1354 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq, 1355 ilb_sticky_cleanup, arg, TQ_SLEEP); 1356 mutex_enter(&timer->tid_lock); 1357 if (timer->tid == 0) { 1358 mutex_exit(&timer->tid_lock); 1359 } else { 1360 timer->tid = timeout(ilb_sticky_timer, arg, 1361 SEC_TO_TICK(ilb_sticky_timeout)); 1362 mutex_exit(&timer->tid_lock); 1363 } 1364 } 1365 1366 void 1367 ilb_sticky_hash_init(ilb_stack_t *ilbs) 1368 { 1369 extern pri_t minclsyspri; 1370 int i, part; 1371 char tq_name[TASKQ_NAMELEN]; 1372 ilb_timer_t *tm; 1373 1374 if (!ISP2(ilbs->ilbs_sticky_hash_size)) { 1375 for (i = 0; i < 31; i++) { 1376 if (ilbs->ilbs_sticky_hash_size < (1 << i)) 1377 break; 1378 } 1379 ilbs->ilbs_sticky_hash_size = 1 << i; 1380 } 1381 1382 
ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) * 1383 ilbs->ilbs_sticky_hash_size, KM_SLEEP); 1384 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1385 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL, 1386 MUTEX_DEFAULT, NULL); 1387 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head, 1388 sizeof (ilb_sticky_t), 1389 offsetof(ilb_sticky_t, list)); 1390 } 1391 1392 if (ilb_sticky_cache == NULL) 1393 ilb_sticky_cache_init(); 1394 1395 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p", 1396 (void *)ilbs->ilbs_netstack); 1397 ASSERT(ilbs->ilbs_sticky_taskq == NULL); 1398 ilbs->ilbs_sticky_taskq = taskq_create(tq_name, 1399 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size, 1400 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1401 1402 ASSERT(ilbs->ilbs_sticky_timer_list == NULL); 1403 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 1404 ilb_sticky_timer_size, KM_SLEEP); 1405 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1; 1406 for (i = 0; i < ilb_sticky_timer_size; i++) { 1407 tm = ilbs->ilbs_sticky_timer_list + i; 1408 tm->start = i * part; 1409 tm->end = i * part + part; 1410 if (tm->end > ilbs->ilbs_sticky_hash_size) 1411 tm->end = ilbs->ilbs_sticky_hash_size; 1412 tm->ilbs = ilbs; 1413 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 1414 /* Spread out the starting execution time of all the timers. */ 1415 tm->tid = timeout(ilb_sticky_timer, tm, 1416 SEC_TO_TICK(ilb_sticky_timeout + i)); 1417 } 1418 } 1419 1420 void 1421 ilb_sticky_hash_fini(ilb_stack_t *ilbs) 1422 { 1423 int i; 1424 ilb_sticky_t *s; 1425 1426 if (ilbs->ilbs_sticky_hash == NULL) 1427 return; 1428 1429 /* Stop all the timers first. */ 1430 for (i = 0; i < ilb_sticky_timer_size; i++) { 1431 timeout_id_t tid; 1432 1433 /* Setting tid to 0 tells the timer handler not to restart. 
*/ 1434 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1435 tid = ilbs->ilbs_sticky_timer_list[i].tid; 1436 ilbs->ilbs_sticky_timer_list[i].tid = 0; 1437 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1438 (void) untimeout(tid); 1439 } 1440 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) * 1441 ilb_sticky_timer_size); 1442 taskq_destroy(ilbs->ilbs_sticky_taskq); 1443 ilbs->ilbs_sticky_taskq = NULL; 1444 1445 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1446 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head)) 1447 != NULL) { 1448 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s); 1449 ILB_SERVER_REFRELE(s->server); 1450 kmem_free(s, sizeof (ilb_sticky_t)); 1451 } 1452 } 1453 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size * 1454 sizeof (ilb_sticky_hash_t)); 1455 } 1456 1457 /* 1458 * This routine sends up the sticky hash table to user land. Refer to 1459 * the comments before ilb_list_nat(). Both routines assume similar 1460 * conditions. 1461 * 1462 * It is assumed that the caller has checked the size of st so that it 1463 * can hold num entries. 
1464 */ 1465 /* ARGSUSED */ 1466 int 1467 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st, 1468 uint32_t *num, uint32_t *flags) 1469 { 1470 ilb_sticky_hash_t *hash; 1471 ilb_sticky_t *curp; 1472 uint32_t i, j; 1473 int ret = 0; 1474 1475 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1476 while (ilbs->ilbs_sticky_list_busy) { 1477 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv, 1478 &ilbs->ilbs_sticky_list_lock) == 0) { 1479 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1480 return (EINTR); 1481 } 1482 } 1483 if ((hash = ilbs->ilbs_sticky_hash) == NULL) { 1484 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1485 *num = 0; 1486 *flags |= ILB_LIST_END; 1487 return (0); 1488 } 1489 ilbs->ilbs_sticky_list_busy = B_TRUE; 1490 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1491 1492 if (*flags & ILB_LIST_BEGIN) { 1493 i = 0; 1494 mutex_enter(&hash[0].sticky_lock); 1495 curp = list_head(&hash[0].sticky_head); 1496 } else if (*flags & ILB_LIST_CONT) { 1497 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) { 1498 *num = 0; 1499 *flags |= ILB_LIST_END; 1500 goto done; 1501 } 1502 i = ilbs->ilbs_sticky_list_cur; 1503 mutex_enter(&hash[i].sticky_lock); 1504 curp = ilbs->ilbs_sticky_list_curp; 1505 } else { 1506 ret = EINVAL; 1507 goto done; 1508 } 1509 1510 j = 0; 1511 while (j < *num) { 1512 if (curp == NULL) { 1513 mutex_exit(&hash[i].sticky_lock); 1514 if (++i == ilbs->ilbs_sticky_hash_size) { 1515 *flags |= ILB_LIST_END; 1516 break; 1517 } 1518 mutex_enter(&hash[i].sticky_lock); 1519 curp = list_head(&hash[i].sticky_head); 1520 continue; 1521 } 1522 (void) strcpy(st[j].rule_name, curp->rule_name); 1523 st[j].req_addr = curp->src; 1524 st[j].srv_addr = curp->server->iser_addr_v6; 1525 st[j].expiry_time = TICK_TO_MSEC(curp->expiry); 1526 j++; 1527 curp = list_next(&hash[i].sticky_head, curp); 1528 } 1529 ilbs->ilbs_sticky_list_curp = curp; 1530 if (j == *num) 1531 mutex_exit(&hash[i].sticky_lock); 1532 1533 ilbs->ilbs_sticky_list_cur = i; 1534 1535 *num = j; 
1536 done: 1537 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1538 ilbs->ilbs_sticky_list_busy = B_FALSE; 1539 cv_signal(&ilbs->ilbs_sticky_list_cv); 1540 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1541 1542 return (ret); 1543 } 1544