1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2014 Joyent, Inc. All rights reserved. 26 */ 27 28 #include <sys/sysmacros.h> 29 #include <sys/types.h> 30 #include <sys/conf.h> 31 #include <sys/time.h> 32 #include <sys/taskq.h> 33 #include <sys/cmn_err.h> 34 #include <sys/sdt.h> 35 #include <sys/atomic.h> 36 #include <netinet/in.h> 37 #include <inet/ip.h> 38 #include <inet/ip6.h> 39 #include <inet/tcp.h> 40 #include <inet/udp_impl.h> 41 #include <inet/ilb.h> 42 43 #include "ilb_stack.h" 44 #include "ilb_impl.h" 45 #include "ilb_conn.h" 46 #include "ilb_nat.h" 47 48 /* 49 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection 50 * 51 * start: starting index into the hash table to do gc 52 * end: ending index into the hash table to do gc 53 * ilbs: pointer to the ilb_stack_t of the IP stack 54 * tid_lock: mutex to protect the timer id. 
55 * tid: timer id of the timer 56 */ 57 typedef struct ilb_timer_s { 58 uint32_t start; 59 uint32_t end; 60 ilb_stack_t *ilbs; 61 kmutex_t tid_lock; 62 timeout_id_t tid; 63 } ilb_timer_t; 64 65 /* Hash macro for finding the index to the conn hash table */ 66 #define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size) \ 67 (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \ 68 (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \ 69 (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \ 70 (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \ 71 ((hash_size) - 1)) 72 73 /* Kmem cache for the conn hash entry */ 74 static struct kmem_cache *ilb_conn_cache = NULL; 75 76 /* 77 * There are 60 timers running to do conn cache garbage collection. Each 78 * gc thread is responsible for 1/60 of the conn hash table. 79 */ 80 static int ilb_conn_timer_size = 60; 81 82 /* Each of the above gc timers wake up every 15s to do the gc. */ 83 static int ilb_conn_cache_timeout = 15; 84 85 #define ILB_STICKY_HASH(saddr, rule, hash_size) \ 86 (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \ 87 (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \ 88 (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \ 89 (*(saddr) ^ (rule))) & ((hash_size) - 1)) 90 91 static struct kmem_cache *ilb_sticky_cache = NULL; 92 93 /* 94 * There are 60 timers running to do sticky cache garbage collection. Each 95 * gc thread is responsible for 1/60 of the sticky hash table. 96 */ 97 static int ilb_sticky_timer_size = 60; 98 99 /* Each of the above gc timers wake up every 15s to do the gc. 
*/ 100 static int ilb_sticky_timeout = 15; 101 102 #define ILB_STICKY_REFRELE(s) \ 103 { \ 104 mutex_enter(&(s)->hash->sticky_lock); \ 105 (s)->refcnt--; \ 106 (s)->atime = ddi_get_lbolt64(); \ 107 mutex_exit(&s->hash->sticky_lock); \ 108 } 109 110 111 static void 112 ilb_conn_cache_init(void) 113 { 114 ilb_conn_cache = kmem_cache_create("ilb_conn_cache", 115 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL, 116 ilb_kmem_flags); 117 } 118 119 void 120 ilb_conn_cache_fini(void) 121 { 122 if (ilb_conn_cache != NULL) { 123 kmem_cache_destroy(ilb_conn_cache); 124 ilb_conn_cache = NULL; 125 } 126 } 127 128 static void 129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s) 130 { 131 ilb_conn_hash_t *hash; 132 ilb_conn_t **next, **prev; 133 ilb_conn_t **next_prev, **prev_next; 134 135 if (c2s) { 136 hash = connp->conn_c2s_hash; 137 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 138 next = &connp->conn_c2s_next; 139 prev = &connp->conn_c2s_prev; 140 if (*next != NULL) 141 next_prev = &(*next)->conn_c2s_prev; 142 if (*prev != NULL) 143 prev_next = &(*prev)->conn_c2s_next; 144 } else { 145 hash = connp->conn_s2c_hash; 146 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 147 next = &connp->conn_s2c_next; 148 prev = &connp->conn_s2c_prev; 149 if (*next != NULL) 150 next_prev = &(*next)->conn_s2c_prev; 151 if (*prev != NULL) 152 prev_next = &(*prev)->conn_s2c_next; 153 } 154 155 if (hash->ilb_connp == connp) { 156 hash->ilb_connp = *next; 157 if (*next != NULL) 158 *next_prev = NULL; 159 } else { 160 if (*prev != NULL) 161 *prev_next = *next; 162 if (*next != NULL) 163 *next_prev = *prev; 164 } 165 ASSERT(hash->ilb_conn_cnt > 0); 166 hash->ilb_conn_cnt--; 167 168 *next = NULL; 169 *prev = NULL; 170 } 171 172 static void 173 ilb_conn_remove(ilb_conn_t *connp) 174 { 175 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 176 ilb_conn_remove_common(connp, B_TRUE); 177 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 178 
	ilb_conn_remove_common(connp, B_FALSE);

	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/* Return the NAT source port to the per-source vmem arena. */
		port = ntohs(connp->conn_rule_cache.info.nat_sport);
		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
		    (void *)(uintptr_t)port, 1);
	}

	if (connp->conn_sticky != NULL)
		ILB_STICKY_REFRELE(connp->conn_sticky);
	ILB_SERVER_REFRELE(connp->conn_server);
	kmem_cache_free(ilb_conn_cache, connp);
}

/*
 * Routine to do periodic garbage collection of conn hash entries.  When
 * a conn hash timer fires, it dispatches a taskq to call this function
 * to do the gc.  Note that each taskq is responsible for a portion of
 * the table.  The portion is stored in timer->start, timer->end.
 */
static void
ilb_conn_cleanup(void *arg)
{
	ilb_timer_t *timer = (ilb_timer_t *)arg;
	uint32_t i;
	ilb_stack_t *ilbs;
	ilb_conn_hash_t *c2s_hash, *s2c_hash;
	ilb_conn_t *connp, *nxt_connp;
	int64_t now;
	int64_t expiry;
	boolean_t die_now;

	ilbs = timer->ilbs;
	c2s_hash = ilbs->ilbs_c2s_conn_hash;
	ASSERT(c2s_hash != NULL);

	now = ddi_get_lbolt64();
	for (i = timer->start; i < timer->end; i++) {
		mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
		if ((connp = c2s_hash[i].ilb_connp) == NULL) {
			ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
			mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
			continue;
		}
		do {
			ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
			ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
			/* Remember the next entry before we may free connp. */
			nxt_connp = connp->conn_c2s_next;
			expiry = now - SEC_TO_TICK(connp->conn_expiry);
			if (connp->conn_server->iser_die_time != 0 &&
			    connp->conn_server->iser_die_time < now)
				die_now = B_TRUE;
			else
				die_now = B_FALSE;
			/*
			 * Lock order is always c2s bucket then s2c bucket;
			 * ilb_conn_remove() needs both held.
			 */
			s2c_hash = connp->conn_s2c_hash;
			mutex_enter(&s2c_hash->ilb_conn_hash_lock);

			if (connp->conn_gc || die_now ||
			    (connp->conn_c2s_atime < expiry &&
			    connp->conn_s2c_atime < expiry)) {
				/* Need to update the nat list cur_connp */
				if (connp == ilbs->ilbs_conn_list_connp) {
					ilbs->ilbs_conn_list_connp =
					    connp->conn_c2s_next;
				}
				ilb_conn_remove(connp);
				goto nxt_connp;
			}

			if (connp->conn_l4 != IPPROTO_TCP)
				goto nxt_connp;

			/* Update and check TCP related conn info */
			if (connp->conn_c2s_tcp_fin_sent &&
			    SEQ_GT(connp->conn_s2c_tcp_ack,
			    connp->conn_c2s_tcp_fss)) {
				connp->conn_c2s_tcp_fin_acked = B_TRUE;
			}
			if (connp->conn_s2c_tcp_fin_sent &&
			    SEQ_GT(connp->conn_c2s_tcp_ack,
			    connp->conn_s2c_tcp_fss)) {
				connp->conn_s2c_tcp_fin_acked = B_TRUE;
			}
			/* Both FINs acked: the connection is fully closed. */
			if (connp->conn_c2s_tcp_fin_acked &&
			    connp->conn_s2c_tcp_fin_acked) {
				ilb_conn_remove(connp);
			}
nxt_connp:
			mutex_exit(&s2c_hash->ilb_conn_hash_lock);
			connp = nxt_connp;
		} while (connp != NULL);
		mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
	}
}

/* Conn hash timer routine.  It dispatches a taskq and restarts the timer. */
static void
ilb_conn_timer(void *arg)
{
	ilb_timer_t *timer = (ilb_timer_t *)arg;

	(void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
	    arg, TQ_SLEEP);
	mutex_enter(&timer->tid_lock);
	/* tid == 0 means ilb_conn_hash_fini() asked us not to rearm. */
	if (timer->tid == 0) {
		mutex_exit(&timer->tid_lock);
	} else {
		timer->tid = timeout(ilb_conn_timer, arg,
		    SEC_TO_TICK(ilb_conn_cache_timeout));
		mutex_exit(&timer->tid_lock);
	}
}

/*
 * Allocate and initialize both conn hash tables, the gc taskq and the gc
 * timers for the given stack instance.  May sleep.
 */
void
ilb_conn_hash_init(ilb_stack_t *ilbs)
{
	extern pri_t minclsyspri;
	int i, part;
	ilb_timer_t *tm;
	char tq_name[TASKQ_NAMELEN];

	/*
	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
	 * the next power of 2.
	 */
	if (!ISP2(ilbs->ilbs_conn_hash_size)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_conn_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_conn_hash_size = 1 << i;
	}

	/*
	 * Can sleep since this should be called when a rule is being added,
	 * hence we are not in interrupt context.
	 */
	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size, KM_SLEEP);

	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	if (ilb_conn_cache == NULL)
		ilb_conn_cache_init();

	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
	    (void *)ilbs->ilbs_netstack);
	ASSERT(ilbs->ilbs_conn_taskq == NULL);
	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
	    ilb_conn_timer_size, KM_SLEEP);

	/*
	 * The hash table is divided in equal partition for those timers
	 * to do garbage collection.
	 */
	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
	for (i = 0; i < ilb_conn_timer_size; i++) {
		tm = ilbs->ilbs_conn_timer_list + i;
		tm->start = i * part;
		tm->end = i * part + part;
		if (tm->end > ilbs->ilbs_conn_hash_size)
			tm->end = ilbs->ilbs_conn_hash_size;
		tm->ilbs = ilbs;
		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
		/* Spread out the starting execution time of all the timers. */
		tm->tid = timeout(ilb_conn_timer, tm,
		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
	}
}

/*
 * Tear down everything ilb_conn_hash_init() set up: stop the timers,
 * destroy the taskq, then free every remaining conn entry and both tables.
 */
void
ilb_conn_hash_fini(ilb_stack_t *ilbs)
{
	uint32_t i;
	ilb_conn_t *connp;
	ilb_conn_hash_t *hash;

	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		return;
	}

	/* Stop all the timers first. */
	for (i = 0; i < ilb_conn_timer_size; i++) {
		timeout_id_t tid;

		/* Setting tid to 0 tells the timer handler not to restart. */
		mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
		tid = ilbs->ilbs_conn_timer_list[i].tid;
		ilbs->ilbs_conn_timer_list[i].tid = 0;
		mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
		(void) untimeout(tid);
	}
	kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
	    ilb_conn_timer_size);
	taskq_destroy(ilbs->ilbs_conn_taskq);
	ilbs->ilbs_conn_taskq = NULL;

	/* Then remove all the conns. */
	hash = ilbs->ilbs_s2c_conn_hash;
	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
		while ((connp = hash[i].ilb_connp) != NULL) {
			hash[i].ilb_connp = connp->conn_s2c_next;
			ILB_SERVER_REFRELE(connp->conn_server);
			if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
				ilb_nat_src_entry_t *ent;
				in_port_t port;

				/*
				 * src_ent will be freed in ilb_nat_src_fini().
				 */
				port = ntohs(
				    connp->conn_rule_cache.info.nat_sport);
				ent = connp->conn_rule_cache.info.src_ent;
				vmem_free(ent->nse_port_arena,
				    (void *)(uintptr_t)port, 1);
			}
			kmem_cache_free(ilb_conn_cache, connp);
		}
	}
	kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size);
	kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size);
}

/*
 * Internet checksum adjustment calculation routines.
 * We pre-calculate
 * checksum adjustment so that we don't need to compute the checksum on
 * the whole packet when we change address/port in the packet.
 */

/*
 * Half-NAT adjustment, IPv4: fold the old address/port out of and the new
 * address/port into a 16-bit one's complement partial sum (RFC 1624 style).
 * oaddr/naddr point at the 32-bit IPv4 address as two 16-bit words.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t sum;

	sum = *oaddr + *(oaddr + 1) + old_port;
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	*adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
}

/* Half-NAT adjustment, IPv6: same as above over eight 16-bit words. */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t sum = 0;

	sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
	    *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
	    old_port;
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	*adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
	    *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
	    *(naddr + 6) + *(naddr + 7) + new_port;
}

/* Full-NAT adjustment, IPv4: both source and destination are rewritten. */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t sum;

	sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
	    old_port2;
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	*adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
	    *naddr2 + *(naddr2 + 1) + new_port2;
}

/* Full-NAT adjustment, IPv6: both source and destination are rewritten. */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t sum = 0;

	sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
	    *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
	    old_port1;
	sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
	    *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
	    old_port2;
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
	    *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
	    *(naddr1 + 7) + new_port1;
	*adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
	    *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
	    *(naddr2 + 7) + new_port2;
}

/*
 * Add a conn hash entry to the tables.  Note that a conn hash entry
 * (ilb_conn_t) contains info on both directions.  And there are two hash
 * tables, one for client to server and the other for server to client.
 * So the same entry is added to both tables and can be accessed by two
 * threads simultaneously.  But each thread will only access data on one
 * direction, so there is no conflict.
 *
 * On allocation failure the caller's sticky reference (and, for full NAT,
 * the already-allocated NAT source port) is released here; returns ENOMEM.
 * On success the entry takes a reference on 'server' and ownership of 's'.
 */
int
ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
    in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
{
	ilb_conn_t *connp;
	ilb_conn_hash_t *hash;
	int i;

	connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
	if (connp == NULL) {
		if (s != NULL) {
			if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
				ilb_nat_src_entry_t **entry;

				entry = s->server->iser_nat_src->src_list;
				vmem_free(entry[s->nat_src_idx]->nse_port_arena,
				    (void *)(uintptr_t)ntohs(info->nat_sport),
				    1);
			}
			ILB_STICKY_REFRELE(s);
		}
		return (ENOMEM);
	}

	connp->conn_l4 = rule->ir_proto;

	connp->conn_server = server;
	ILB_SERVER_REFHOLD(server);
	connp->conn_sticky = s;

	connp->conn_rule_cache.topo = rule->ir_topo;
	connp->conn_rule_cache.info = *info;

	connp->conn_gc = B_FALSE;

	connp->conn_expiry = rule->ir_nat_expiry;
	connp->conn_cr_time = ddi_get_lbolt64();

	/* Client to server info. */
	connp->conn_c2s_saddr = *src;
	connp->conn_c2s_sport = sport;
	connp->conn_c2s_daddr = *dst;
	connp->conn_c2s_dport = dport;

	connp->conn_c2s_atime = ddi_get_lbolt64();
	/* The packet that triggers this creation should be counted */
	connp->conn_c2s_pkt_cnt = 1;
	connp->conn_c2s_tcp_fin_sent = B_FALSE;
	connp->conn_c2s_tcp_fin_acked = B_FALSE;

	/* Server to client info, before NAT */
	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_HALF_NAT:
		connp->conn_s2c_saddr = info->nat_dst;
		connp->conn_s2c_sport = info->nat_dport;
		connp->conn_s2c_daddr = *src;
		connp->conn_s2c_dport = sport;

		/* Pre-calculate checksum changes for both directions */
		if (rule->ir_ipver == IPPROTO_IP) {
			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
			    &connp->conn_c2s_ip_sum);
			hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = connp->conn_c2s_ip_sum;
			*tp_sum = connp->conn_c2s_tp_sum;

			hnat_cksum_v4(
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3], 0, 0,
			    &connp->conn_s2c_ip_sum);
			hnat_cksum_v4(
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    info->nat_dport, dport,
			    &connp->conn_s2c_tp_sum);
		} else {
			/* IPv6 has no header checksum. */
			connp->conn_c2s_ip_sum = 0;
			hnat_cksum_v6((uint16_t *)dst,
			    (uint16_t *)&info->nat_dst, dport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = 0;
			*tp_sum = connp->conn_c2s_tp_sum;

			connp->conn_s2c_ip_sum = 0;
			hnat_cksum_v6((uint16_t *)&info->nat_dst,
			    (uint16_t *)dst, info->nat_dport, dport,
			    &connp->conn_s2c_tp_sum);
		}
		break;
	case ILB_TOPO_IMPL_NAT:
		connp->conn_s2c_saddr = info->nat_dst;
		connp->conn_s2c_sport = info->nat_dport;
		connp->conn_s2c_daddr = info->nat_src;
		connp->conn_s2c_dport = info->nat_sport;

		if (rule->ir_ipver == IPPROTO_IP) {
			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    0, 0, 0, 0, &connp->conn_c2s_ip_sum);
			fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    sport, dport, info->nat_sport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			*ip_sum = connp->conn_c2s_ip_sum;
			*tp_sum = connp->conn_c2s_tp_sum;

			fnat_cksum_v4(
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    0, 0, 0, 0, &connp->conn_s2c_ip_sum);
			fnat_cksum_v4(
			    (uint16_t *)&info->nat_src.s6_addr32[3],
			    (uint16_t *)&info->nat_dst.s6_addr32[3],
			    (uint16_t *)&src->s6_addr32[3],
			    (uint16_t *)&dst->s6_addr32[3],
			    info->nat_sport, info->nat_dport,
			    sport, dport, &connp->conn_s2c_tp_sum);
		} else {
			fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
			    (uint16_t *)&info->nat_src,
			    (uint16_t *)&info->nat_dst,
			    sport, dport, info->nat_sport,
			    info->nat_dport, &connp->conn_c2s_tp_sum);
			connp->conn_c2s_ip_sum = 0;
			*ip_sum = 0;
			*tp_sum = connp->conn_c2s_tp_sum;

			fnat_cksum_v6((uint16_t *)&info->nat_src,
			    (uint16_t *)&info->nat_dst, (uint16_t *)src,
			    (uint16_t *)dst, info->nat_sport,
			    info->nat_dport, sport, dport,
			    &connp->conn_s2c_tp_sum);
			connp->conn_s2c_ip_sum = 0;
		}
		break;
	}

	connp->conn_s2c_atime = ddi_get_lbolt64();
	connp->conn_s2c_pkt_cnt = 1;
	connp->conn_s2c_tcp_fin_sent = B_FALSE;
	connp->conn_s2c_tcp_fin_acked = B_FALSE;

	/* Add it to the s2c hash table. */
	hash = ilbs->ilbs_s2c_conn_hash;
	i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
	    ntohs(connp->conn_s2c_sport),
	    (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
	    ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
	connp->conn_s2c_hash = &hash[i];
	DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);

	mutex_enter(&hash[i].ilb_conn_hash_lock);
	hash[i].ilb_conn_cnt++;
	connp->conn_s2c_next = hash[i].ilb_connp;
	if (hash[i].ilb_connp != NULL)
		hash[i].ilb_connp->conn_s2c_prev = connp;
	connp->conn_s2c_prev = NULL;
	hash[i].ilb_connp = connp;
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	/* Add it to the c2s hash table. */
	hash = ilbs->ilbs_c2s_conn_hash;
	i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
	    (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
	    ilbs->ilbs_conn_hash_size);
	connp->conn_c2s_hash = &hash[i];
	DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);

	mutex_enter(&hash[i].ilb_conn_hash_lock);
	hash[i].ilb_conn_cnt++;
	connp->conn_c2s_next = hash[i].ilb_connp;
	if (hash[i].ilb_connp != NULL)
		hash[i].ilb_connp->conn_c2s_prev = connp;
	connp->conn_c2s_prev = NULL;
	hash[i].ilb_connp = connp;
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	return (0);
}

/*
 * If a connection is using TCP, we keep track of simple TCP state transition
 * so that we know when to clean up an entry.
693 */ 694 static boolean_t 695 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len, 696 boolean_t c2s) 697 { 698 uint32_t ack, seq; 699 int32_t seg_len; 700 701 if (tcpha->tha_flags & TH_RST) 702 return (B_FALSE); 703 704 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) - 705 TCP_HDR_LENGTH((tcph_t *)tcpha); 706 707 if (tcpha->tha_flags & TH_ACK) 708 ack = ntohl(tcpha->tha_ack); 709 seq = ntohl(tcpha->tha_seq); 710 if (c2s) { 711 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 712 if (tcpha->tha_flags & TH_FIN) { 713 connp->conn_c2s_tcp_fss = seq + seg_len; 714 connp->conn_c2s_tcp_fin_sent = B_TRUE; 715 } 716 connp->conn_c2s_tcp_ack = ack; 717 718 /* Port reuse by the client, restart the conn. */ 719 if (connp->conn_c2s_tcp_fin_sent && 720 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) { 721 connp->conn_c2s_tcp_fin_sent = B_FALSE; 722 connp->conn_c2s_tcp_fin_acked = B_FALSE; 723 } 724 } else { 725 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 726 if (tcpha->tha_flags & TH_FIN) { 727 connp->conn_s2c_tcp_fss = seq + seg_len; 728 connp->conn_s2c_tcp_fin_sent = B_TRUE; 729 } 730 connp->conn_s2c_tcp_ack = ack; 731 732 /* Port reuse by the client, restart the conn. */ 733 if (connp->conn_s2c_tcp_fin_sent && 734 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) { 735 connp->conn_s2c_tcp_fin_sent = B_FALSE; 736 connp->conn_s2c_tcp_fin_acked = B_FALSE; 737 } 738 } 739 740 return (B_TRUE); 741 } 742 743 /* 744 * Helper routint to find conn hash entry given some packet information and 745 * the traffic direction (c2s, client to server?) 
746 */ 747 static boolean_t 748 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src, 749 in_port_t sport, in6_addr_t *dst, in_port_t dport, 750 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum, 751 int32_t pkt_len, boolean_t c2s) 752 { 753 ilb_conn_hash_t *hash; 754 uint_t i; 755 ilb_conn_t *connp; 756 boolean_t tcp_alive; 757 boolean_t ret = B_FALSE; 758 759 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), 760 (uint8_t *)&dst->s6_addr32[3], ntohs(dport), 761 ilbs->ilbs_conn_hash_size); 762 if (c2s) { 763 hash = ilbs->ilbs_c2s_conn_hash; 764 mutex_enter(&hash[i].ilb_conn_hash_lock); 765 for (connp = hash[i].ilb_connp; connp != NULL; 766 connp = connp->conn_c2s_next) { 767 if (connp->conn_l4 == l4 && 768 connp->conn_c2s_dport == dport && 769 connp->conn_c2s_sport == sport && 770 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) && 771 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) { 772 connp->conn_c2s_atime = ddi_get_lbolt64(); 773 connp->conn_c2s_pkt_cnt++; 774 *rule_cache = connp->conn_rule_cache; 775 *ip_sum = connp->conn_c2s_ip_sum; 776 *tp_sum = connp->conn_c2s_tp_sum; 777 ret = B_TRUE; 778 break; 779 } 780 } 781 } else { 782 hash = ilbs->ilbs_s2c_conn_hash; 783 mutex_enter(&hash[i].ilb_conn_hash_lock); 784 for (connp = hash[i].ilb_connp; connp != NULL; 785 connp = connp->conn_s2c_next) { 786 if (connp->conn_l4 == l4 && 787 connp->conn_s2c_dport == dport && 788 connp->conn_s2c_sport == sport && 789 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) && 790 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) { 791 connp->conn_s2c_atime = ddi_get_lbolt64(); 792 connp->conn_s2c_pkt_cnt++; 793 *rule_cache = connp->conn_rule_cache; 794 *ip_sum = connp->conn_s2c_ip_sum; 795 *tp_sum = connp->conn_s2c_tp_sum; 796 ret = B_TRUE; 797 break; 798 } 799 } 800 } 801 if (ret) { 802 ILB_S_KSTAT(connp->conn_server, pkt_processed); 803 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed, 804 pkt_len); 805 806 switch (l4) { 807 
case (IPPROTO_TCP): 808 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len, 809 c2s); 810 if (!tcp_alive) { 811 connp->conn_gc = B_TRUE; 812 } 813 break; 814 default: 815 break; 816 } 817 } 818 mutex_exit(&hash[i].ilb_conn_hash_lock); 819 820 return (ret); 821 } 822 823 /* 824 * To check if a give packet matches an existing conn hash entry. If it 825 * does, return the information about this entry so that the caller can 826 * do the proper NAT. 827 */ 828 boolean_t 829 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph, 830 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport, 831 uint32_t pkt_len, in6_addr_t *lb_dst) 832 { 833 ilb_rule_info_t rule_cache; 834 uint32_t adj_ip_sum, adj_tp_sum; 835 boolean_t ret; 836 837 /* Check the incoming hash table. */ 838 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 839 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) { 840 switch (rule_cache.topo) { 841 case ILB_TOPO_IMPL_NAT: 842 *lb_dst = rule_cache.info.nat_dst; 843 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 844 adj_ip_sum, adj_tp_sum, B_TRUE); 845 ret = B_TRUE; 846 break; 847 case ILB_TOPO_IMPL_HALF_NAT: 848 *lb_dst = rule_cache.info.nat_dst; 849 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 850 adj_ip_sum, adj_tp_sum, B_TRUE); 851 ret = B_TRUE; 852 break; 853 default: 854 ret = B_FALSE; 855 break; 856 } 857 return (ret); 858 } 859 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 860 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) { 861 switch (rule_cache.topo) { 862 case ILB_TOPO_IMPL_NAT: 863 *lb_dst = rule_cache.info.src; 864 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 865 adj_ip_sum, adj_tp_sum, B_FALSE); 866 ret = B_TRUE; 867 break; 868 case ILB_TOPO_IMPL_HALF_NAT: 869 *lb_dst = *dst; 870 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 871 adj_ip_sum, adj_tp_sum, B_FALSE); 872 ret = B_TRUE; 873 break; 874 default: 875 ret = B_FALSE; 876 break; 877 } 878 return (ret); 879 
	}

	return (B_FALSE);
}

/*
 * To check if an ICMP packet belongs to a connection in one of the conn
 * hash entries.
 */
boolean_t
ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
    void *icmph, in6_addr_t *lb_dst)
{
	ilb_conn_hash_t *hash;
	ipha_t *in_iph4;
	ip6_t *in_iph6;
	icmph_t *icmph4;
	icmp6_t *icmph6;
	in6_addr_t *in_src_p, *in_dst_p;
	in_port_t *sport, *dport;
	int l4;
	uint_t i;
	ilb_conn_t *connp;
	ilb_rule_info_t rule_cache;
	uint32_t adj_ip_sum;
	boolean_t full_nat;

	if (l3 == IPPROTO_IP) {
		in6_addr_t in_src, in_dst;

		/* The embedded packet follows the ICMP header. */
		icmph4 = (icmph_t *)icmph;
		in_iph4 = (ipha_t *)&icmph4[1];

		/* Ensure the embedded IP header + ports are in this mblk. */
		if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
			return (B_FALSE);
		}

		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
		in_src_p = &in_src;
		IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
		in_dst_p = &in_dst;

		l4 = in_iph4->ipha_protocol;
		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
			return (B_FALSE);

		sport = (in_port_t *)((char *)in_iph4 +
		    IPH_HDR_LENGTH(in_iph4));
		dport = sport + 1;

		DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
		    in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
		    ntohs(*sport), uint16_t, ntohs(*dport));
	} else {
		ASSERT(l3 == IPPROTO_IPV6);

		icmph6 = (icmp6_t *)icmph;
		in_iph6 = (ip6_t *)&icmph6[1];
		in_src_p = &in_iph6->ip6_src;
		in_dst_p = &in_iph6->ip6_dst;

		if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
		    ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
			return (B_FALSE);
		}

		l4 = in_iph6->ip6_nxt;
		/* We don't go deep inside an IPv6 packet yet. */
		if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
			return (B_FALSE);

		sport = (in_port_t *)&in_iph6[1];
		dport = sport + 1;

		DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
		    &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
		    uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
	}

	/*
	 * The embedded packet is the one we originally sent out, so its
	 * src/dst and ports are swapped relative to the c2s key.
	 */
	i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
	    (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
	    ilbs->ilbs_conn_hash_size);
	hash = ilbs->ilbs_c2s_conn_hash;

	mutex_enter(&hash[i].ilb_conn_hash_lock);
	for (connp = hash[i].ilb_connp; connp != NULL;
	    connp = connp->conn_c2s_next) {
		if (connp->conn_l4 == l4 &&
		    connp->conn_c2s_dport == *sport &&
		    connp->conn_c2s_sport == *dport &&
		    IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
		    IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
			connp->conn_c2s_atime = ddi_get_lbolt64();
			connp->conn_c2s_pkt_cnt++;
			/* Copy out what we need before dropping the lock. */
			rule_cache = connp->conn_rule_cache;
			adj_ip_sum = connp->conn_c2s_ip_sum;
			break;
		}
	}
	mutex_exit(&hash[i].ilb_conn_hash_lock);

	if (connp == NULL) {
		DTRACE_PROBE(ilb__chk__icmp__conn__failed);
		return (B_FALSE);
	}

	switch (rule_cache.topo) {
	case ILB_TOPO_IMPL_NAT:
		full_nat = B_TRUE;
		break;
	case ILB_TOPO_IMPL_HALF_NAT:
		full_nat = B_FALSE;
		break;
	default:
		return (B_FALSE);
	}

	*lb_dst = rule_cache.info.nat_dst;
	if (l3 == IPPROTO_IP) {
		ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
		    &rule_cache.info, adj_ip_sum, full_nat);
	} else {
		ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
		    &rule_cache.info, full_nat);
	}
	return (B_TRUE);
}

/*
 * This routine sends up the conn hash table to user land.  Note that the
 * request is an ioctl, hence we cannot really differentiate requests
 * from different clients.
 * There is no context shared between different
 * ioctls.  Here we make the assumption that the user land ilbd will
 * only allow one client to show the conn hash table at any time.
 * Otherwise, the results will be "very" inconsistent.
 *
 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
 * to read from the beginning of the table.  After a certain entries
 * are reported, the kernel remembers the position of the last returned
 * entry.  When the next ioctl comes in with the ILB_LIST_BEGIN flag,
 * it will return entries starting from where it was left off.  When
 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
 * the client that there is no more entry.
 *
 * It is assumed that the caller has checked the size of nat so that it
 * can hold num entries.
 */
/* ARGSUSED */
int
ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
    uint32_t *num, uint32_t *flags)
{
	ilb_conn_hash_t *hash;
	ilb_conn_t *cur_connp;
	uint32_t i, j;
	int ret = 0;

	/* Serialize with other listers; wait interruptibly for our turn. */
	mutex_enter(&ilbs->ilbs_conn_list_lock);
	while (ilbs->ilbs_conn_list_busy) {
		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
		    &ilbs->ilbs_conn_list_lock) == 0) {
			mutex_exit(&ilbs->ilbs_conn_list_lock);
			return (EINTR);
		}
	}
	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		mutex_exit(&ilbs->ilbs_conn_list_lock);
		*num = 0;
		*flags |= ILB_LIST_END;
		return (0);
	}
	ilbs->ilbs_conn_list_busy = B_TRUE;
	mutex_exit(&ilbs->ilbs_conn_list_lock);

	if (*flags & ILB_LIST_BEGIN) {
		i = 0;
		mutex_enter(&hash[0].ilb_conn_hash_lock);
		cur_connp = hash[0].ilb_connp;
	} else if (*flags & ILB_LIST_CONT) {
		/* Resume from where the previous ioctl left off. */
		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
			*num = 0;
			*flags |= ILB_LIST_END;
			goto done;
		}
		i = ilbs->ilbs_conn_list_cur;
		mutex_enter(&hash[i].ilb_conn_hash_lock);
		cur_connp = ilbs->ilbs_conn_list_connp;
	} else {
		ret = EINVAL;
		goto done;
	}

	j = 0;
	while (j < *num) {
		if (cur_connp == NULL) {
			mutex_exit(&hash[i].ilb_conn_hash_lock);
			if (++i == ilbs->ilbs_conn_hash_size) {
				*flags |= ILB_LIST_END;
				break;
			}
			mutex_enter(&hash[i].ilb_conn_hash_lock);
			cur_connp = hash[i].ilb_connp;
			continue;
		}
		nat[j].proto = cur_connp->conn_l4;

		nat[j].in_global = cur_connp->conn_c2s_daddr;
		nat[j].in_global_port = cur_connp->conn_c2s_dport;
		nat[j].out_global = cur_connp->conn_c2s_saddr;
		nat[j].out_global_port = cur_connp->conn_c2s_sport;

		nat[j].in_local = cur_connp->conn_s2c_saddr;
		nat[j].in_local_port = cur_connp->conn_s2c_sport;
		nat[j].out_local = cur_connp->conn_s2c_daddr;
		nat[j].out_local_port = cur_connp->conn_s2c_dport;

		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
		nat[j].last_access_time =
		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);

		/*
		 * The conn_s2c_pkt_cnt may not be accurate since we are not
		 * holding the s2c hash lock.
		 */
		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
		    cur_connp->conn_s2c_pkt_cnt;
		j++;

		cur_connp = cur_connp->conn_c2s_next;
	}
	ilbs->ilbs_conn_list_connp = cur_connp;
	/* If we filled the buffer mid-bucket, the bucket lock is still held. */
	if (j == *num)
		mutex_exit(&hash[i].ilb_conn_hash_lock);

	ilbs->ilbs_conn_list_cur = i;

	*num = j;
done:
	mutex_enter(&ilbs->ilbs_conn_list_lock);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	cv_signal(&ilbs->ilbs_conn_list_cv);
	mutex_exit(&ilbs->ilbs_conn_list_lock);

	return (ret);
}


/*
 * Stickiness (persistence) handling routines.
1130 */ 1131 1132 1133 static void 1134 ilb_sticky_cache_init(void) 1135 { 1136 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache", 1137 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL, 1138 ilb_kmem_flags); 1139 } 1140 1141 void 1142 ilb_sticky_cache_fini(void) 1143 { 1144 if (ilb_sticky_cache != NULL) { 1145 kmem_cache_destroy(ilb_sticky_cache); 1146 ilb_sticky_cache = NULL; 1147 } 1148 } 1149 1150 void 1151 ilb_sticky_refrele(ilb_sticky_t *s) 1152 { 1153 ILB_STICKY_REFRELE(s); 1154 } 1155 1156 static ilb_sticky_t * 1157 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src) 1158 { 1159 ilb_sticky_t *s; 1160 1161 ASSERT(mutex_owned(&hash->sticky_lock)); 1162 1163 for (s = list_head(&hash->sticky_head); s != NULL; 1164 s = list_next(&hash->sticky_head, s)) { 1165 if (s->rule_instance == rule->ir_ks_instance) { 1166 if (IN6_ARE_ADDR_EQUAL(src, &s->src)) 1167 return (s); 1168 } 1169 } 1170 return (NULL); 1171 } 1172 1173 static ilb_sticky_t * 1174 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server, 1175 in6_addr_t *src) 1176 { 1177 ilb_sticky_t *s; 1178 1179 ASSERT(mutex_owned(&hash->sticky_lock)); 1180 1181 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL) 1182 return (NULL); 1183 1184 /* 1185 * The rule instance is for handling the scenario when the same 1186 * client talks to different rules at the same time. Stickiness 1187 * is per rule so we can use the rule instance to differentiate 1188 * the client's request. 1189 */ 1190 s->rule_instance = rule->ir_ks_instance; 1191 /* 1192 * Copy the rule name for listing all sticky cache entry. ir_name 1193 * is guaranteed to be NULL terminated. 1194 */ 1195 (void) strcpy(s->rule_name, rule->ir_name); 1196 s->server = server; 1197 1198 /* 1199 * Grab a ref cnt on the server so that it won't go away while 1200 * it is still in the sticky table. 
1201 */ 1202 ILB_SERVER_REFHOLD(server); 1203 s->src = *src; 1204 s->expiry = rule->ir_sticky_expiry; 1205 s->refcnt = 1; 1206 s->hash = hash; 1207 1208 /* 1209 * There is no need to set atime here since the refcnt is not 1210 * zero. A sticky entry is removed only when the refcnt is 1211 * zero. But just set it here for debugging purpose. The 1212 * atime is set when a refrele is done on a sticky entry. 1213 */ 1214 s->atime = ddi_get_lbolt64(); 1215 1216 list_insert_head(&hash->sticky_head, s); 1217 hash->sticky_cnt++; 1218 return (s); 1219 } 1220 1221 /* 1222 * This routine checks if there is an existing sticky entry which matches 1223 * a given packet. If there is one, return it. If there is not, create 1224 * a sticky entry using the packet's info. 1225 */ 1226 ilb_server_t * 1227 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src, 1228 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx) 1229 { 1230 int i; 1231 ilb_sticky_hash_t *hash; 1232 ilb_sticky_t *s; 1233 1234 ASSERT(server != NULL); 1235 1236 *res = NULL; 1237 1238 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3], 1239 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size); 1240 hash = &ilbs->ilbs_sticky_hash[i]; 1241 1242 /* First check if there is already an entry. */ 1243 mutex_enter(&hash->sticky_lock); 1244 s = ilb_sticky_lookup(hash, rule, src); 1245 1246 /* No sticky entry, add one. */ 1247 if (s == NULL) { 1248 add_new_entry: 1249 s = ilb_sticky_add(hash, rule, server, src); 1250 if (s == NULL) { 1251 mutex_exit(&hash->sticky_lock); 1252 return (NULL); 1253 } 1254 /* 1255 * Find a source for this server. All subseqent requests from 1256 * the same client matching this sticky entry will use this 1257 * source address in doing NAT. The current algorithm is 1258 * simple, rotate the source address. Note that the 1259 * source address array does not change after it's created, so 1260 * it is OK to just increment the cur index. 
1261 */ 1262 if (server->iser_nat_src != NULL) { 1263 /* It is a hint, does not need to be atomic. */ 1264 *src_ent_idx = (server->iser_nat_src->cur++ % 1265 server->iser_nat_src->num_src); 1266 s->nat_src_idx = *src_ent_idx; 1267 } 1268 mutex_exit(&hash->sticky_lock); 1269 *res = s; 1270 return (server); 1271 } 1272 1273 /* 1274 * We don't hold any lock accessing iser_enabled. Refer to the 1275 * comment in ilb_server_add() about iser_lock. 1276 */ 1277 if (!s->server->iser_enabled) { 1278 /* 1279 * s->server == server can only happen if there is a race in 1280 * toggling the iser_enabled flag (we don't hold a lock doing 1281 * that) so that the load balance algorithm still returns a 1282 * disabled server. In this case, just drop the packet... 1283 */ 1284 if (s->server == server) { 1285 mutex_exit(&hash->sticky_lock); 1286 return (NULL); 1287 } 1288 1289 /* 1290 * The old server is disabled and there is a new server, use 1291 * the new one to create a sticky entry. Since we will 1292 * add the entry at the beginning, subsequent lookup will 1293 * find this new entry instead of the old one. 
1294 */ 1295 goto add_new_entry; 1296 } 1297 1298 s->refcnt++; 1299 *res = s; 1300 mutex_exit(&hash->sticky_lock); 1301 if (server->iser_nat_src != NULL) 1302 *src_ent_idx = s->nat_src_idx; 1303 return (s->server); 1304 } 1305 1306 static void 1307 ilb_sticky_cleanup(void *arg) 1308 { 1309 ilb_timer_t *timer = (ilb_timer_t *)arg; 1310 uint32_t i; 1311 ilb_stack_t *ilbs; 1312 ilb_sticky_hash_t *hash; 1313 ilb_sticky_t *s, *nxt_s; 1314 int64_t now, expiry; 1315 1316 ilbs = timer->ilbs; 1317 hash = ilbs->ilbs_sticky_hash; 1318 ASSERT(hash != NULL); 1319 1320 now = ddi_get_lbolt64(); 1321 for (i = timer->start; i < timer->end; i++) { 1322 mutex_enter(&hash[i].sticky_lock); 1323 for (s = list_head(&hash[i].sticky_head); s != NULL; 1324 s = nxt_s) { 1325 nxt_s = list_next(&hash[i].sticky_head, s); 1326 if (s->refcnt != 0) 1327 continue; 1328 expiry = now - SEC_TO_TICK(s->expiry); 1329 if (s->atime < expiry) { 1330 ILB_SERVER_REFRELE(s->server); 1331 list_remove(&hash[i].sticky_head, s); 1332 kmem_cache_free(ilb_sticky_cache, s); 1333 hash[i].sticky_cnt--; 1334 } 1335 } 1336 mutex_exit(&hash[i].sticky_lock); 1337 } 1338 } 1339 1340 static void 1341 ilb_sticky_timer(void *arg) 1342 { 1343 ilb_timer_t *timer = (ilb_timer_t *)arg; 1344 1345 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq, 1346 ilb_sticky_cleanup, arg, TQ_SLEEP); 1347 mutex_enter(&timer->tid_lock); 1348 if (timer->tid == 0) { 1349 mutex_exit(&timer->tid_lock); 1350 } else { 1351 timer->tid = timeout(ilb_sticky_timer, arg, 1352 SEC_TO_TICK(ilb_sticky_timeout)); 1353 mutex_exit(&timer->tid_lock); 1354 } 1355 } 1356 1357 void 1358 ilb_sticky_hash_init(ilb_stack_t *ilbs) 1359 { 1360 extern pri_t minclsyspri; 1361 int i, part; 1362 char tq_name[TASKQ_NAMELEN]; 1363 ilb_timer_t *tm; 1364 1365 if (!ISP2(ilbs->ilbs_sticky_hash_size)) { 1366 for (i = 0; i < 31; i++) { 1367 if (ilbs->ilbs_sticky_hash_size < (1 << i)) 1368 break; 1369 } 1370 ilbs->ilbs_sticky_hash_size = 1 << i; 1371 } 1372 1373 
ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) * 1374 ilbs->ilbs_sticky_hash_size, KM_SLEEP); 1375 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1376 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL, 1377 MUTEX_DEFAULT, NULL); 1378 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head, 1379 sizeof (ilb_sticky_t), 1380 offsetof(ilb_sticky_t, list)); 1381 } 1382 1383 if (ilb_sticky_cache == NULL) 1384 ilb_sticky_cache_init(); 1385 1386 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p", 1387 (void *)ilbs->ilbs_netstack); 1388 ASSERT(ilbs->ilbs_sticky_taskq == NULL); 1389 ilbs->ilbs_sticky_taskq = taskq_create(tq_name, 1390 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size, 1391 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1392 1393 ASSERT(ilbs->ilbs_sticky_timer_list == NULL); 1394 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 1395 ilb_sticky_timer_size, KM_SLEEP); 1396 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1; 1397 for (i = 0; i < ilb_sticky_timer_size; i++) { 1398 tm = ilbs->ilbs_sticky_timer_list + i; 1399 tm->start = i * part; 1400 tm->end = i * part + part; 1401 if (tm->end > ilbs->ilbs_sticky_hash_size) 1402 tm->end = ilbs->ilbs_sticky_hash_size; 1403 tm->ilbs = ilbs; 1404 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 1405 /* Spread out the starting execution time of all the timers. */ 1406 tm->tid = timeout(ilb_sticky_timer, tm, 1407 SEC_TO_TICK(ilb_sticky_timeout + i)); 1408 } 1409 } 1410 1411 void 1412 ilb_sticky_hash_fini(ilb_stack_t *ilbs) 1413 { 1414 int i; 1415 ilb_sticky_t *s; 1416 1417 if (ilbs->ilbs_sticky_hash == NULL) 1418 return; 1419 1420 /* Stop all the timers first. */ 1421 for (i = 0; i < ilb_sticky_timer_size; i++) { 1422 timeout_id_t tid; 1423 1424 /* Setting tid to 0 tells the timer handler not to restart. 
*/ 1425 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1426 tid = ilbs->ilbs_sticky_timer_list[i].tid; 1427 ilbs->ilbs_sticky_timer_list[i].tid = 0; 1428 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1429 (void) untimeout(tid); 1430 } 1431 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) * 1432 ilb_sticky_timer_size); 1433 taskq_destroy(ilbs->ilbs_sticky_taskq); 1434 ilbs->ilbs_sticky_taskq = NULL; 1435 1436 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1437 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head)) 1438 != NULL) { 1439 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s); 1440 ILB_SERVER_REFRELE(s->server); 1441 kmem_free(s, sizeof (ilb_sticky_t)); 1442 } 1443 } 1444 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size * 1445 sizeof (ilb_sticky_hash_t)); 1446 } 1447 1448 /* 1449 * This routine sends up the sticky hash table to user land. Refer to 1450 * the comments before ilb_list_nat(). Both routines assume similar 1451 * conditions. 1452 * 1453 * It is assumed that the caller has checked the size of st so that it 1454 * can hold num entries. 
1455 */ 1456 /* ARGSUSED */ 1457 int 1458 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st, 1459 uint32_t *num, uint32_t *flags) 1460 { 1461 ilb_sticky_hash_t *hash; 1462 ilb_sticky_t *curp; 1463 uint32_t i, j; 1464 int ret = 0; 1465 1466 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1467 while (ilbs->ilbs_sticky_list_busy) { 1468 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv, 1469 &ilbs->ilbs_sticky_list_lock) == 0) { 1470 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1471 return (EINTR); 1472 } 1473 } 1474 if ((hash = ilbs->ilbs_sticky_hash) == NULL) { 1475 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1476 *num = 0; 1477 *flags |= ILB_LIST_END; 1478 return (0); 1479 } 1480 ilbs->ilbs_sticky_list_busy = B_TRUE; 1481 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1482 1483 if (*flags & ILB_LIST_BEGIN) { 1484 i = 0; 1485 mutex_enter(&hash[0].sticky_lock); 1486 curp = list_head(&hash[0].sticky_head); 1487 } else if (*flags & ILB_LIST_CONT) { 1488 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) { 1489 *num = 0; 1490 *flags |= ILB_LIST_END; 1491 goto done; 1492 } 1493 i = ilbs->ilbs_sticky_list_cur; 1494 mutex_enter(&hash[i].sticky_lock); 1495 curp = ilbs->ilbs_sticky_list_curp; 1496 } else { 1497 ret = EINVAL; 1498 goto done; 1499 } 1500 1501 j = 0; 1502 while (j < *num) { 1503 if (curp == NULL) { 1504 mutex_exit(&hash[i].sticky_lock); 1505 if (++i == ilbs->ilbs_sticky_hash_size) { 1506 *flags |= ILB_LIST_END; 1507 break; 1508 } 1509 mutex_enter(&hash[i].sticky_lock); 1510 curp = list_head(&hash[i].sticky_head); 1511 continue; 1512 } 1513 (void) strcpy(st[j].rule_name, curp->rule_name); 1514 st[j].req_addr = curp->src; 1515 st[j].srv_addr = curp->server->iser_addr_v6; 1516 st[j].expiry_time = TICK_TO_MSEC(curp->expiry); 1517 j++; 1518 curp = list_next(&hash[i].sticky_head, curp); 1519 } 1520 ilbs->ilbs_sticky_list_curp = curp; 1521 if (j == *num) 1522 mutex_exit(&hash[i].sticky_lock); 1523 1524 ilbs->ilbs_sticky_list_cur = i; 1525 1526 *num = j; 
1527 done: 1528 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1529 ilbs->ilbs_sticky_list_busy = B_FALSE; 1530 cv_signal(&ilbs->ilbs_sticky_list_cv); 1531 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1532 1533 return (ret); 1534 } 1535