/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/crc32.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/ilb.h>

#include "ilb_impl.h"
#include "ilb_stack.h"
#include "ilb_nat.h"

/*
 * NAT source entry garbage collection timeout.  The actual timeout value
 * includes a random jitter bounded by ILB_NAT_SRC_TIMEOUT_JITTER.
 */
#define	ILB_NAT_SRC_TIMEOUT		30
#define	ILB_NAT_SRC_TIMEOUT_JITTER	5

/* key1/2 are assumed to be uint32_t. */
#define	ILB_NAT_SRC_HASH(hash, key1, key2, hash_size)			\
{									\
	CRC32((hash), (key1), sizeof (uint32_t), -1U, crc32_table);	\
	CRC32((hash), (key2), sizeof (uint32_t), (hash), crc32_table);	\
	(hash) %= (hash_size);						\
}

/* NAT source port space instance number. */
static uint32_t ilb_nat_src_instance = 0;

/*
 * Increment the given IPv6 address by one, with carry.  For a v4 mapped
 * address, only the v4 part (the last 32 bits) is incremented.
 */
static void
incr_addr(in6_addr_t *a)
{
	uint32_t i;

	i = ntohl(a->s6_addr32[3]);
	if (IN6_IS_ADDR_V4MAPPED(a)) {
		a->s6_addr32[3] = htonl(++i);
		ASSERT(i != 0);
		return;
	}

	if (++i != 0) {
		a->s6_addr32[3] = htonl(i);
		return;
	}
	a->s6_addr32[3] = 0;
	i = ntohl(a->s6_addr32[2]);
	if (++i != 0) {
		a->s6_addr32[2] = htonl(i);
		return;
	}
	a->s6_addr32[2] = 0;
	i = ntohl(a->s6_addr32[1]);
	if (++i != 0) {
		a->s6_addr32[1] = htonl(i);
		return;
	}
	a->s6_addr32[1] = 0;
	i = ntohl(a->s6_addr32[0]);
	a->s6_addr32[0] = htonl(++i);
	ASSERT(i != 0);
}

/*
 * When ILB does full NAT, it first picks one source address from the rule's
 * specified NAT source address list (currently done in round robin fashion).
 * Then it needs to allocate a port.  This source port must make the tuple
 * (source address:source port:destination address:destination port)
 * unique.  The destination part of the tuple is determined by the back
 * end server, and cannot be changed.
 *
 * To handle the above source port allocation, ILB sets up a table of
 * entries identified by the source address:back end server address:server
 * port tuple.  This table is used by all rules for NAT source port
 * allocation.  Each tuple has an associated vmem arena used for managing
 * the NAT source port space between the source address and back end
 * server address/port.  Each back end server (ilb_server_t) has an array
 * of pointers (iser_nat_src) to the different entries in this table for
 * NAT source port allocation.  When ILB needs to allocate a NAT source
 * address and port to talk to a back end server, it picks a source address
 * and uses the array pointer to get to an entry.  Then it calls
 * vmem_alloc() on the associated vmem arena to find an unused port.
 *
 * When a back end server is added, ILB sets up the aforementioned array.
 * For each source address specified in the rule, ILB checks if there is any
 * existing entry which matches this source address:back end server address:
 * port tuple.  The server port is either a specific port or 0 (meaning wild
 * card port).  Normally, a back end server uses the same port as in the
 * rule.  If a back end server is used to serve two different rules, there
 * will be two different ports.  Source port allocation for these two rules
 * does not conflict, hence we can use two vmem arenas (two different
 * entries in the table).  But if a server uses a port range in one rule, we
 * will treat it as a wild card port.  A wild card port matches any port.
 * If this server is used to serve more than one rule and those rules use
 * the same set of NAT source addresses, they must share the same set of
 * vmem arenas (source port spaces).  We do this for simplicity.  If not,
 * we would need to partition the port range so that we could identify the
 * different forms of source port collision.
 */
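/*
 * Illustrative example (hypothetical addresses and ports, not from the
 * original design notes): suppose rules R1 and R2 both list NAT source
 * address A and back end server S, with R1 using server port 80 and R2
 * using server port 8080.  The table then holds two entries, (A, S, 80)
 * and (A, S, 8080), each with its own vmem arena, so a source port
 * allocated for R1 traffic can never collide with one allocated for R2
 * traffic.  If R1 instead used a port range on S, its entry would be
 * created with port 0 (wild card); a later lookup for (A, S, 8080) would
 * match that wild card entry, and R1 and R2 would share the one arena as
 * described above.
 */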

/*
 * NAT source address initialization routine.
 */
void
ilb_nat_src_init(ilb_stack_t *ilbs)
{
	int i;

	ilbs->ilbs_nat_src_tid = timeout(ilb_nat_src_timer, ilbs,
	    SEC_TO_TICK(ILB_NAT_SRC_TIMEOUT +
	    gethrtime() % ILB_NAT_SRC_TIMEOUT_JITTER));
	ilbs->ilbs_nat_src = kmem_zalloc(sizeof (ilb_nat_src_hash_t) *
	    ilbs->ilbs_nat_src_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
		list_create(&ilbs->ilbs_nat_src[i].nsh_head,
		    sizeof (ilb_nat_src_entry_t),
		    offsetof(ilb_nat_src_entry_t, nse_link));
		mutex_init(&ilbs->ilbs_nat_src[i].nsh_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/*
 * NAT source address clean up routine.
 */
void
ilb_nat_src_fini(ilb_stack_t *ilbs)
{
	ilb_nat_src_entry_t *cur;
	timeout_id_t tid;
	int i;

	/*
	 * By setting ilbs_nat_src_tid to 0, the timer handler will not
	 * restart the timer.
	 */
	mutex_enter(&ilbs->ilbs_nat_src_lock);
	tid = ilbs->ilbs_nat_src_tid;
	ilbs->ilbs_nat_src_tid = 0;
	mutex_exit(&ilbs->ilbs_nat_src_lock);
	if (tid != 0)
		(void) untimeout(tid);

	mutex_destroy(&ilbs->ilbs_nat_src_lock);

	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
		while ((cur = list_remove_head(
		    &ilbs->ilbs_nat_src[i].nsh_head)) != NULL) {
			vmem_destroy(cur->nse_port_arena);
			kmem_free(cur, sizeof (ilb_nat_src_entry_t));
		}
		mutex_destroy(&ilbs->ilbs_nat_src[i].nsh_lock);
	}

	kmem_free(ilbs->ilbs_nat_src, sizeof (ilb_nat_src_hash_t) *
	    ilbs->ilbs_nat_src_hash_size);
	ilbs->ilbs_nat_src = NULL;
}

/* An arena name is "ilb_ns" + "_xxxxxxxxxx" */
#define	ARENA_NAMESZ	18
#define	NAT_PORT_START	4096
#define	NAT_PORT_SIZE	(65535 - NAT_PORT_START)

/*
 * Check if an ilb_nat_src_entry_t for the given NAT source and back end
 * server pair exists.  If it does, increment its refcnt and return it.
 * If not, create one and return it.
 */
static ilb_nat_src_entry_t *
ilb_find_nat_src(ilb_stack_t *ilbs, const in6_addr_t *nat_src,
    const in6_addr_t *serv_addr, in_port_t port)
{
	ilb_nat_src_entry_t *tmp;
	uint32_t idx;
	char arena_name[ARENA_NAMESZ];
	list_t *head;

	ILB_NAT_SRC_HASH(idx, &nat_src->s6_addr32[3], &serv_addr->s6_addr32[3],
	    ilbs->ilbs_nat_src_hash_size);
	mutex_enter(&ilbs->ilbs_nat_src[idx].nsh_lock);
	head = &ilbs->ilbs_nat_src[idx].nsh_head;
	for (tmp = list_head(head); tmp != NULL; tmp = list_next(head, tmp)) {
		if (IN6_ARE_ADDR_EQUAL(&tmp->nse_src_addr, nat_src) &&
		    IN6_ARE_ADDR_EQUAL(&tmp->nse_serv_addr, serv_addr) &&
		    (port == tmp->nse_port || port == 0 ||
		    tmp->nse_port == 0)) {
			break;
		}
	}
	/* Found one, return it. */
	if (tmp != NULL) {
		tmp->nse_refcnt++;
		mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
		return (tmp);
	}

	tmp = kmem_alloc(sizeof (ilb_nat_src_entry_t), KM_NOSLEEP);
	if (tmp == NULL) {
		mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
		return (NULL);
	}
	tmp->nse_src_addr = *nat_src;
	tmp->nse_serv_addr = *serv_addr;
	tmp->nse_port = port;
	tmp->nse_nsh_lock = &ilbs->ilbs_nat_src[idx].nsh_lock;
	tmp->nse_refcnt = 1;

	(void) snprintf(arena_name, ARENA_NAMESZ, "ilb_ns_%u",
	    atomic_add_32_nv(&ilb_nat_src_instance, 1));
	if ((tmp->nse_port_arena = vmem_create(arena_name,
	    (void *)NAT_PORT_START, NAT_PORT_SIZE, 1, NULL, NULL, NULL, 1,
	    VM_SLEEP | VMC_IDENTIFIER)) == NULL) {
		mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
		kmem_free(tmp, sizeof (*tmp));
		return (NULL);
	}

	list_insert_tail(head, tmp);
	mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);

	return (tmp);
}

/*
 * Create an ilb_nat_src_t struct for an ilb_server_t struct.
 */
int
ilb_create_nat_src(ilb_stack_t *ilbs, ilb_nat_src_t **nat_src,
    const in6_addr_t *srv_addr, in_port_t port, const in6_addr_t *start,
    int num)
{
	ilb_nat_src_t *src;
	in6_addr_t cur_addr;
	int i;

	if ((src = kmem_zalloc(sizeof (ilb_nat_src_t), KM_NOSLEEP)) == NULL) {
		*nat_src = NULL;
		return (ENOMEM);
	}
	cur_addr = *start;
	for (i = 0; i < num && i < ILB_MAX_NAT_SRC; i++) {
		src->src_list[i] = ilb_find_nat_src(ilbs, &cur_addr, srv_addr,
		    port);
		if (src->src_list[i] == NULL) {
			ilb_destroy_nat_src(&src);
			*nat_src = NULL;
			return (ENOMEM);
		}
		incr_addr(&cur_addr);
		/*
		 * Increment num_src here so that we can call
		 * ilb_destroy_nat_src() when we need to do cleanup.
		 */
		src->num_src++;
	}
	*nat_src = src;
	return (0);
}
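
/*
 * Usage sketch (hypothetical variable names): when adding a back end
 * server, the caller would typically populate the server's iser_nat_src
 * array with something like
 *
 *	if (ilb_create_nat_src(ilbs, &server->iser_nat_src, &server_addr,
 *	    server_port, &rule_nat_src_start, rule_nat_src_num) != 0)
 *		(fail the server addition)
 *
 * On success, each NAT source address (starting from rule_nat_src_start
 * and incremented rule_nat_src_num - 1 times) holds a reference on a
 * shared ilb_nat_src_entry_t for this server.  On failure, the references
 * taken so far have already been released via ilb_destroy_nat_src().
 */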

/*
 * Timer routine for garbage collecting unneeded NAT source entries.  We
 * don't use a taskq for this since the table should be relatively small
 * and should be OK for a timer to handle.
 */
void
ilb_nat_src_timer(void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_nat_src_entry_t *cur, *tmp;
	list_t *head;
	int i;

	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
		mutex_enter(&ilbs->ilbs_nat_src[i].nsh_lock);
		head = &ilbs->ilbs_nat_src[i].nsh_head;
		cur = list_head(head);
		while (cur != NULL) {
			/*
			 * When a server is removed, it releases its
			 * reference on an entry.  But there may still be
			 * connections using some ports.  So check the
			 * arena size also.
			 */
			if (cur->nse_refcnt != 0 ||
			    vmem_size(cur->nse_port_arena, VMEM_ALLOC) != 0) {
				cur = list_next(head, cur);
				continue;
			}
			tmp = cur;
			cur = list_next(head, cur);
			list_remove(head, tmp);
			vmem_destroy(tmp->nse_port_arena);
			kmem_free(tmp, sizeof (ilb_nat_src_entry_t));
		}
		mutex_exit(&ilbs->ilbs_nat_src[i].nsh_lock);
	}

	mutex_enter(&ilbs->ilbs_nat_src_lock);
	if (ilbs->ilbs_nat_src_tid == 0) {
		mutex_exit(&ilbs->ilbs_nat_src_lock);
	} else {
		ilbs->ilbs_nat_src_tid = timeout(ilb_nat_src_timer, ilbs,
		    SEC_TO_TICK(ILB_NAT_SRC_TIMEOUT +
		    gethrtime() % ILB_NAT_SRC_TIMEOUT_JITTER));
		mutex_exit(&ilbs->ilbs_nat_src_lock);
	}
}

/*
 * Destroy a given ilb_nat_src_t struct.  It also releases the references
 * held on all its ilb_nat_src_entry_t.
 */
void
ilb_destroy_nat_src(ilb_nat_src_t **nat_src)
{
	int i, size;
	ilb_nat_src_t *src;
	ilb_nat_src_entry_t *entry;

	src = *nat_src;
	if (src == NULL)
		return;
	size = src->num_src;
	/*
	 * Release the reference on each entry; the garbage collector will
	 * clean up the unused ones.
	 */
	for (i = 0; i < size; i++) {
		entry = src->src_list[i];
		mutex_enter(entry->nse_nsh_lock);
		entry->nse_refcnt--;
		mutex_exit(entry->nse_nsh_lock);
	}
	kmem_free(src, sizeof (ilb_nat_src_t));
	*nat_src = NULL;
}

/*
 * Given a back end server address and its ilb_nat_src_t, allocate a source
 * address and port for NAT usage.
 */
ilb_nat_src_entry_t *
ilb_alloc_nat_addr(ilb_nat_src_t *src, in6_addr_t *addr, in_port_t *port,
    uint16_t *nat_src_idx)
{
	int i, try, size;
	in_port_t p;

	size = src->num_src;
	/* Increment of cur does not need to be atomic.  It is just a hint. */
	if (nat_src_idx == NULL)
		i = (++src->cur) % size;
	else
		i = *nat_src_idx;

	for (try = 0; try < size; try++) {
		p = (in_port_t)(uintptr_t)vmem_alloc(
		    src->src_list[i]->nse_port_arena, 1, VM_NOSLEEP);
		if (p != 0)
			break;
		/*
		 * If an index is given and we cannot allocate a port using
		 * that entry, return NULL.
		 */
		if (nat_src_idx != NULL)
			return (NULL);
		i = (i + 1) % size;
	}
	if (try == size)
		return (NULL);
	*addr = src->src_list[i]->nse_src_addr;
	*port = htons(p);
	return (src->src_list[i]);
}

/*
 * Use the pre-calculated checksum adjustment to update the checksum of a
 * packet after NAT.
 */
static void
adj_cksum(uint16_t *chksum, uint32_t adj_sum)
{
	adj_sum += (uint16_t)~(*chksum);
	while ((adj_sum >> 16) != 0)
		adj_sum = (adj_sum & 0xffff) + (adj_sum >> 16);
	*chksum = (uint16_t)~adj_sum;
}
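
/*
 * Worked example (hypothetical numbers): suppose a 16-bit field in the
 * packet changes from m = 0x1234 to m' = 0xabcd and the current checksum
 * field is 0xbeef.  The caller pre-computes adj_sum = ~m + m' = 0xedcb +
 * 0xabcd = 0x19998.  adj_cksum() adds ~0xbeef = 0x4110 to get 0x1daa8,
 * folds the carry (0xdaa8 + 0x1 = 0xdaa9), and stores the one's
 * complement, 0x2556, as the updated checksum.  This is the incremental
 * checksum update HC' = ~(~HC + ~m + m') of RFC 1624.
 */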

/* Do full NAT (replace both source and destination info) on a packet. */
void
ilb_full_nat(int l3, void *iph, int l4, void *tph, ilb_nat_info_t *info,
    uint32_t adj_ip_sum, uint32_t adj_tp_sum, boolean_t c2s)
{
	in_port_t *orig_sport, *orig_dport;
	uint16_t *tp_cksum;

	switch (l4) {
	case IPPROTO_TCP:
		orig_sport = &((tcpha_t *)tph)->tha_lport;
		orig_dport = &((tcpha_t *)tph)->tha_fport;
		tp_cksum = &((tcpha_t *)tph)->tha_sum;
		break;
	case IPPROTO_UDP:
		orig_sport = &((udpha_t *)tph)->uha_src_port;
		orig_dport = &((udpha_t *)tph)->uha_dst_port;
		tp_cksum = &((udpha_t *)tph)->uha_checksum;
		break;
	default:
		ASSERT(0);
		return;
	}

	switch (l3) {
	case IPPROTO_IP: {
		ipha_t *ipha;

		ipha = iph;
		if (c2s) {
			IN6_V4MAPPED_TO_IPADDR(&info->nat_src,
			    ipha->ipha_src);
			IN6_V4MAPPED_TO_IPADDR(&info->nat_dst,
			    ipha->ipha_dst);
			*orig_sport = info->nat_sport;
			*orig_dport = info->nat_dport;
		} else {
			IN6_V4MAPPED_TO_IPADDR(&info->vip, ipha->ipha_src);
			IN6_V4MAPPED_TO_IPADDR(&info->src, ipha->ipha_dst);
			*orig_sport = info->dport;
			*orig_dport = info->sport;
		}
		adj_cksum(&ipha->ipha_hdr_checksum, adj_ip_sum);
		adj_cksum(tp_cksum, adj_tp_sum);
		break;
	}
	case IPPROTO_IPV6: {
		ip6_t *ip6h;

		ip6h = iph;
		if (c2s) {
			ip6h->ip6_src = info->nat_src;
			ip6h->ip6_dst = info->nat_dst;
			*orig_sport = info->nat_sport;
			*orig_dport = info->nat_dport;
		} else {
			ip6h->ip6_src = info->vip;
			ip6h->ip6_dst = info->src;
			*orig_sport = info->dport;
			*orig_dport = info->sport;
		}
		/* No checksum for IPv6 header */
		adj_cksum(tp_cksum, adj_tp_sum);
		break;
	}
	default:
		ASSERT(0);
		break;
	}
}

/* Do half NAT (only replace the destination info) on a packet. */
void
ilb_half_nat(int l3, void *iph, int l4, void *tph, ilb_nat_info_t *info,
    uint32_t adj_ip_sum, uint32_t adj_tp_sum, boolean_t c2s)
{
	in_port_t *orig_port;
	uint16_t *tp_cksum;

	switch (l4) {
	case IPPROTO_TCP:
		if (c2s)
			orig_port = &((tcpha_t *)tph)->tha_fport;
		else
			orig_port = &((tcpha_t *)tph)->tha_lport;
		tp_cksum = &((tcpha_t *)tph)->tha_sum;
		break;
	case IPPROTO_UDP:
		if (c2s)
			orig_port = &((udpha_t *)tph)->uha_dst_port;
		else
			orig_port = &((udpha_t *)tph)->uha_src_port;
		tp_cksum = &((udpha_t *)tph)->uha_checksum;
		break;
	default:
		ASSERT(0);
		return;
	}

	switch (l3) {
	case IPPROTO_IP: {
		ipha_t *ipha;

		ipha = iph;
		if (c2s) {
			IN6_V4MAPPED_TO_IPADDR(&info->nat_dst,
			    ipha->ipha_dst);
			*orig_port = info->nat_dport;
		} else {
			IN6_V4MAPPED_TO_IPADDR(&info->vip, ipha->ipha_src);
			*orig_port = info->dport;
		}
		adj_cksum(&ipha->ipha_hdr_checksum, adj_ip_sum);
		adj_cksum(tp_cksum, adj_tp_sum);
		break;
	}
	case IPPROTO_IPV6: {
		ip6_t *ip6h;

		ip6h = iph;
		if (c2s) {
			ip6h->ip6_dst = info->nat_dst;
			*orig_port = info->nat_dport;
		} else {
			ip6h->ip6_src = info->vip;
			*orig_port = info->dport;
		}
		/* No checksum for IPv6 header */
		adj_cksum(tp_cksum, adj_tp_sum);
		break;
	}
	default:
		ASSERT(0);
		break;
	}
}

/* Calculate the IPv6 pseudo checksum, used for ICMPv6 NAT. */
uint32_t
ilb_pseudo_sum_v6(ip6_t *ip6h, uint8_t nxt_hdr)
{
	uint32_t sum;
	uint16_t *cur;

	cur = (uint16_t *)&ip6h->ip6_src;
	sum = cur[0] + cur[1] + cur[2] + cur[3] + cur[4] + cur[5] + cur[6] +
	    cur[7] + cur[8] + cur[9] + cur[10] + cur[11] + cur[12] + cur[13] +
	    cur[14] + cur[15] + htons(nxt_hdr);
	return ((sum & 0xffff) + (sum >> 16));
}
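
/*
 * The full IPv6 pseudo-header checksum also covers the upper-layer
 * payload length.  ilb_pseudo_sum_v6() sums only the addresses and the
 * next header value; the length is folded in by the caller (for example,
 * ilb_nat_icmpv6() below seeds icmp6_cksum with ip6_plen before calling
 * IP_CSUM()).
 */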

/* Do NAT on an ICMPv4 packet. */
void
ilb_nat_icmpv4(mblk_t *mp, ipha_t *out_iph, icmph_t *icmph, ipha_t *in_iph,
    in_port_t *sport, in_port_t *dport, ilb_nat_info_t *info, uint32_t sum,
    boolean_t full_nat)
{
	if (full_nat) {
		IN6_V4MAPPED_TO_IPADDR(&info->nat_src, out_iph->ipha_src);
		IN6_V4MAPPED_TO_IPADDR(&info->nat_src, in_iph->ipha_dst);
		*dport = info->nat_sport;
	}
	IN6_V4MAPPED_TO_IPADDR(&info->nat_dst, out_iph->ipha_dst);
	adj_cksum(&out_iph->ipha_hdr_checksum, sum);
	IN6_V4MAPPED_TO_IPADDR(&info->nat_dst, in_iph->ipha_src);
	*sport = info->nat_dport;

	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(out_iph), 0);
}

/* Do NAT on an ICMPv6 packet. */
void
ilb_nat_icmpv6(mblk_t *mp, ip6_t *out_ip6h, icmp6_t *icmp6h, ip6_t *in_ip6h,
    in_port_t *sport, in_port_t *dport, ilb_nat_info_t *info,
    boolean_t full_nat)
{
	int hdr_len;

	if (full_nat) {
		out_ip6h->ip6_src = info->nat_src;
		in_ip6h->ip6_dst = info->nat_src;
		*dport = info->nat_sport;
	}
	out_ip6h->ip6_dst = info->nat_dst;
	in_ip6h->ip6_src = info->nat_dst;
	*sport = info->nat_dport;

	icmp6h->icmp6_cksum = out_ip6h->ip6_plen;
	hdr_len = (char *)icmp6h - (char *)out_ip6h;
	icmp6h->icmp6_cksum = IP_CSUM(mp, hdr_len,
	    ilb_pseudo_sum_v6(out_ip6h, IPPROTO_ICMPV6));
}