/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2015-2019 Yandex LLC
 * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
 * Copyright (c) 2016-2019 Andrey V. Elsukov <ae@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/ck.h>
#include <sys/epoch.h>
#include <sys/errno.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_pflog.h>
#include <net/pfil.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip_fw_nat64.h>

#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/pf/pf.h>

#include "nat64lsn.h"

MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");

#define	NAT64LSN_EPOCH_ENTER(et)  NET_EPOCH_ENTER(et)
#define	NAT64LSN_EPOCH_EXIT(et)   NET_EPOCH_EXIT(et)
#define	NAT64LSN_EPOCH_ASSERT()   NET_EPOCH_ASSERT()
#define	NAT64LSN_EPOCH_CALL(c, f) epoch_call(net_epoch_preempt, (c), (f))

static uma_zone_t nat64lsn_host_zone;
static uma_zone_t nat64lsn_pgchunk_zone;
static uma_zone_t nat64lsn_pg_zone;
static uma_zone_t nat64lsn_aliaslink_zone;
static uma_zone_t nat64lsn_state_zone;
static uma_zone_t nat64lsn_job_zone;

static void nat64lsn_periodic(void *data);
#define	PERIODIC_DELAY		4
#define	NAT64_LOOKUP(chain, cmd)	\
	(struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1)

/*
 * Delayed job queue, used to create new hosts
 * and new portgroups.
 */
enum nat64lsn_jtype {
	JTYPE_NEWHOST = 1,
	JTYPE_NEWPORTGROUP,
	JTYPE_DESTROY,
};
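
/*
 * A job item carries everything needed to service a request from the
 * data path: for JTYPE_NEWHOST and JTYPE_NEWPORTGROUP the original
 * mbuf is stashed in the item and reinjected into the translator once
 * the allocation succeeds, while JTYPE_DESTROY items carry expired
 * hosts and portgroups to be freed after an epoch has passed.
 */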
struct nat64lsn_job_item {
	STAILQ_ENTRY(nat64lsn_job_item)	entries;
	enum nat64lsn_jtype	jtype;

	union {
		struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */
			struct mbuf		*m;
			struct nat64lsn_host	*host;
			struct nat64lsn_state	*state;
			uint32_t		src6_hval;
			uint32_t		state_hval;
			struct ipfw_flow_id	f_id;
			in_addr_t		faddr;
			uint16_t		port;
			uint8_t			proto;
			uint8_t			done;
		};
		struct { /* used by JTYPE_DESTROY */
			struct nat64lsn_hosts_slist	hosts;
			struct nat64lsn_pg_slist	portgroups;
			struct nat64lsn_pgchunk		*pgchunk;
			struct epoch_context		epoch_ctx;
		};
	};
};

static struct mtx jmtx;
#define	JQUEUE_LOCK_INIT()	mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
#define	JQUEUE_LOCK_DESTROY()	mtx_destroy(&jmtx)
#define	JQUEUE_LOCK()		mtx_lock(&jmtx)
#define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)

static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static struct nat64lsn_job_item *nat64lsn_create_job(
    struct nat64lsn_cfg *cfg, int jtype);
static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static void nat64lsn_job_destroy(epoch_context_t ctx);
static void nat64lsn_destroy_host(struct nat64lsn_host *host);
static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg);

static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
    const struct ipfw_flow_id *f_id, struct mbuf **mp);
static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
    struct ipfw_flow_id *f_id, struct mbuf **mp);
static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg,
    struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags);

#define	NAT64_BIT_TCP_FIN	0	/* FIN was seen */
#define	NAT64_BIT_TCP_SYN	1	/* First syn in->out */
#define	NAT64_BIT_TCP_ESTAB	2	/* Packet with Ack */
#define	NAT64_BIT_READY_IPV4	6	/* state is ready for translate4 */
#define	NAT64_BIT_STALE		7	/* state is going to be expired */

#define	NAT64_FLAG_FIN		(1 << NAT64_BIT_TCP_FIN)
#define	NAT64_FLAG_SYN		(1 << NAT64_BIT_TCP_SYN)
#define	NAT64_FLAG_ESTAB	(1 << NAT64_BIT_TCP_ESTAB)
#define	NAT64_FLAGS_TCP	(NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)

#define	NAT64_FLAG_READY	(1 << NAT64_BIT_READY_IPV4)
#define	NAT64_FLAG_STALE	(1 << NAT64_BIT_STALE)

static inline uint8_t
convert_tcp_flags(uint8_t flags)
{
	uint8_t result;

	result = flags & (TH_FIN|TH_SYN);
	result |= (flags & TH_RST) >> 2;	/* Treat RST as FIN */
	result |= (flags & TH_ACK) >> 2;	/* Treat ACK as estab */

	return (result);
}

static void
nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
    struct nat64lsn_state *state)
{

	memset(plog, 0, sizeof(*plog));
	plog->length = PFLOG_REAL_HDRLEN;
	plog->af = family;
	plog->action = PF_NAT;
	plog->dir = PF_IN;
	plog->rulenr = htonl(state->ip_src);
	plog->subrulenr = htonl((uint32_t)(state->aport << 16) |
	    (state->proto << 8) | (state->ip_dst & 0xff));
	plog->ruleset[0] = '\0';
	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
}

#define	HVAL(p, n, s)	jenkins_hash32((const uint32_t *)(p), (n), (s))
#define	HOST_HVAL(c, a)	HVAL((a),\
    sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed)
#define	HOSTS(c, v)	((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)])
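
/*
 * Hashing notes: HVAL() runs jenkins_hash32() over an array of 32-bit
 * words.  ALIASLINK_HVAL() below hashes both IPv6 addresses of the flow
 * as a single eight-word array, which relies on dst_ip6 and src_ip6
 * being adjacent in struct ipfw_flow_id.
 */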
#define	ALIASLINK_HVAL(c, f)	HVAL(&(f)->dst_ip6,\
    sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed)
#define	ALIAS_BYHASH(c, v)	\
    ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)])
static struct nat64lsn_aliaslink*
nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused,
    struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused)
{

	/*
	 * Different algorithms of selecting an alias address can be
	 * implemented here.
	 * XXX: for now we use the first available.
	 */
	return (CK_SLIST_FIRST(&host->aliases));
}

#define	STATE_HVAL(c, d)	HVAL((d), 2, (c)->hash_seed)
#define	STATE_HASH(h, v)	\
    ((h)->states_hash[(v) & ((h)->states_hashsize - 1)])
#define	STATES_CHUNK(p, v)	\
    ((p)->chunks_count == 1 ? (p)->states : \
	((p)->states_chunk[CHUNK_BY_FADDR(p, v)]))

#ifdef __LP64__
#define	FREEMASK_FFSLL(pg, faddr)	\
    ffsll(*FREEMASK_CHUNK((pg), (faddr)))
#define	FREEMASK_BTR(pg, faddr, bit)	\
    ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_BTS(pg, faddr, bit)	\
    ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_ISSET(pg, faddr, bit)	\
    ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_COPY(pg, n, out)	\
    (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n)))
#else
static inline int
freemask_ffsll(uint32_t *freemask)
{
	int i;

	if ((i = ffsl(freemask[0])) != 0)
		return (i);
	if ((i = ffsl(freemask[1])) != 0)
		return (i + 32);
	return (0);
}
#define	FREEMASK_FFSLL(pg, faddr)	\
    freemask_ffsll(FREEMASK_CHUNK((pg), (faddr)))
#define	FREEMASK_BTR(pg, faddr, bit)	\
    ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
#define	FREEMASK_BTS(pg, faddr, bit)	\
    ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
#define	FREEMASK_ISSET(pg, faddr, bit)	\
    ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32)
#define	FREEMASK_COPY(pg, n, out)	\
    (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \
	((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32)
#endif /* !__LP64__ */

#define	NAT64LSN_TRY_PGCNT	32
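
/*
 * Lock-free PG lookup: start from the last used PG for the protocol
 * (cached in *pgptr/*pgidx) and scan at most NAT64LSN_TRY_PGCNT PGs
 * looking for one with free states.  Shared data is read with ck_pr
 * loads and the cached pointer/index are updated with CAS, so lookups
 * never block each other or the jobs handler.
 */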
static struct nat64lsn_pg*
nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask,
    struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr,
    uint32_t *pgidx, in_addr_t faddr)
{
	struct nat64lsn_pg *pg, *oldpg;
	uint32_t idx, oldidx;
	int cnt;

	cnt = 0;
	/* First try last used PG */
	oldpg = pg = ck_pr_load_ptr(pgptr);
	idx = oldidx = ck_pr_load_32(pgidx);
	/* If pgidx is out of range, reset it to the first pgchunk */
	if (!ISSET32(*chunkmask, idx / 32))
		idx = 0;
	do {
		ck_pr_fence_load();
		if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) {
			/*
			 * This PG has free states; if we had to walk
			 * away from the last used PG, try to update
			 * the cached pointer.
			 * NOTE: it can already be updated by the jobs
			 * handler, thus we use a CAS operation.
			 */
			if (cnt > 0)
				ck_pr_cas_ptr(pgptr, oldpg, pg);
			return (pg);
		}
		/* Stop if idx is out of range */
		if (!ISSET32(*chunkmask, idx / 32))
			break;

		if (ISSET32(pgmask[idx / 32], idx % 32))
			pg = ck_pr_load_ptr(
			    &chunks[idx / 32]->pgptr[idx % 32]);
		else
			pg = NULL;

		idx++;
	} while (++cnt < NAT64LSN_TRY_PGCNT);

	/* If pgidx is out of range, reset it to the first pgchunk */
	if (!ISSET32(*chunkmask, idx / 32))
		idx = 0;
	ck_pr_cas_32(pgidx, oldidx, idx);
	return (NULL);
}

static struct nat64lsn_state*
nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,
    const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr,
    uint16_t port, uint8_t proto)
{
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_state *state;
	struct nat64lsn_pg *pg;
	int i, offset;

	NAT64LSN_EPOCH_ASSERT();

	/* Check whether we already have a state for the given arguments */
	CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) {
		if (state->proto == proto && state->ip_dst == faddr &&
		    state->sport == port && state->dport == f_id->dst_port)
			return (state);
	}

	link = nat64lsn_get_aliaslink(cfg, host, f_id);
	if (link == NULL)
		return (NULL);

	switch (proto) {
	case IPPROTO_TCP:
		pg = nat64lsn_get_pg(
		    &link->alias->tcp_chunkmask, link->alias->tcp_pgmask,
		    link->alias->tcp, &link->alias->tcp_pg,
		    &link->alias->tcp_pgidx, faddr);
		break;
	case IPPROTO_UDP:
		pg = nat64lsn_get_pg(
		    &link->alias->udp_chunkmask, link->alias->udp_pgmask,
		    link->alias->udp, &link->alias->udp_pg,
		    &link->alias->udp_pgidx, faddr);
		break;
	case IPPROTO_ICMP:
		pg = nat64lsn_get_pg(
		    &link->alias->icmp_chunkmask, link->alias->icmp_pgmask,
		    link->alias->icmp, &link->alias->icmp_pg,
		    &link->alias->icmp_pgidx, faddr);
		break;
	default:
		panic("%s: wrong proto %d", __func__, proto);
	}
	if (pg == NULL)
		return (NULL);

	/* Check that PG has some free states */
	state = NULL;
	i = FREEMASK_BITCOUNT(pg, faddr);
	while (i-- > 0) {
		offset = FREEMASK_FFSLL(pg, faddr);
		if (offset == 0) {
			/*
			 * We lost the race.
			 * No more free states in this PG.
			 */
			break;
		}

		/* Let's try to atomically grab the state */
		if (FREEMASK_BTR(pg, faddr, offset - 1)) {
			state = &STATES_CHUNK(pg, faddr)->state[offset - 1];
			/* Initialize */
			state->flags = proto != IPPROTO_TCP ? 0 :
			    convert_tcp_flags(f_id->_flags);
			state->proto = proto;
			state->aport = pg->base_port + offset - 1;
			state->dport = f_id->dst_port;
			state->sport = port;
			state->ip6_dst = f_id->dst_ip6;
			state->ip_dst = faddr;
			state->ip_src = link->alias->addr;
			state->hval = hval;
			state->host = host;
			SET_AGE(state->timestamp);

			/* Insert new state into host's hash table */
			HOST_LOCK(host);
			CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval),
			    state, entries);
			host->states_count++;
			/*
			 * XXX: If the host is about to be expired,
			 * reset the NAT64LSN_DEADHOST flag.
			 */
			host->flags &= ~NAT64LSN_DEADHOST;
			HOST_UNLOCK(host);
			NAT64STAT_INC(&cfg->base.stats, screated);
			/* Mark the state as ready for translate4 */
			ck_pr_fence_store();
			ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4);
			break;
		}
	}
	return (state);
}

/*
 * Inspect ICMP packets to see whether the message contains an inner
 * packet header, in which case @addr and @port must be altered.
 */
static int
inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr,
    uint16_t *port)
{
	struct icmp *icmp;
	struct ip *ip;
	int off;
	uint8_t inner_proto;

	ip = mtod(*mp, struct ip *); /* Outer IP header */
	off = (ip->ip_hl << 2) + ICMP_MINLEN;
	if ((*mp)->m_len < off)
		*mp = m_pullup(*mp, off);
	if (*mp == NULL)
		return (ENOMEM);

	ip = mtod(*mp, struct ip *); /* Outer IP header */
	icmp = L3HDR(ip, struct icmp *);
	switch (icmp->icmp_type) {
	case ICMP_ECHO:
	case ICMP_ECHOREPLY:
		/* Use icmp ID as distinguisher */
		*port = ntohs(icmp->icmp_id);
		return (0);
	case ICMP_UNREACH:
	case ICMP_TIMXCEED:
		break;
	default:
		return (EOPNOTSUPP);
	}
	/*
	 * ICMP_UNREACH and ICMP_TIMXCEED contain an IP header plus
	 * 64 bits of the ULP header.
	 */
	if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
		return (EINVAL);
	if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
		*mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN);
	if (*mp == NULL)
		return (ENOMEM);
	ip = mtodo(*mp, off); /* Inner IP header */
	inner_proto = ip->ip_p;
	off += ip->ip_hl << 2; /* Skip inner IP header */
	*addr = ntohl(ip->ip_src.s_addr);
	if ((*mp)->m_len < off + ICMP_MINLEN)
		*mp = m_pullup(*mp, off + ICMP_MINLEN);
	if (*mp == NULL)
		return (ENOMEM);
	switch (inner_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* Copy source port from the header */
		*port = ntohs(*((uint16_t *)mtodo(*mp, off)));
		*proto = inner_proto;
		return (0);
	case IPPROTO_ICMP:
		/*
		 * We will translate only ICMP errors for our ICMP
		 * echo requests.
		 */
		icmp = mtodo(*mp, off);
		if (icmp->icmp_type != ICMP_ECHO)
			return (EOPNOTSUPP);
		*port = ntohs(icmp->icmp_id);
		return (0);
	}
	return (EOPNOTSUPP);
}

static struct nat64lsn_state*
nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias,
    in_addr_t faddr, uint16_t port, uint8_t proto)
{
	struct nat64lsn_state *state;
	struct nat64lsn_pg *pg;
	int chunk_idx, pg_idx, state_idx;

	NAT64LSN_EPOCH_ASSERT();

	if (port < NAT64_MIN_PORT)
		return (NULL);
	/*
	 * Alias keeps 32 pgchunks for each protocol.
	 * Each pgchunk has 32 pointers to portgroups.
	 * Each portgroup has 64 states for ports.
	 */
	port -= NAT64_MIN_PORT;
	chunk_idx = port / 2048;

	port -= chunk_idx * 2048;
	pg_idx = port / 64;
	state_idx = port % 64;
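
	/*
	 * Worked example, assuming NAT64_MIN_PORT is 1024: external port
	 * 10000 gives port = 8976, chunk_idx = 8976 / 2048 = 4, then
	 * pg_idx = (8976 - 4 * 2048) / 64 = 12 and state_idx = 16, i.e.
	 * state 16 of the PG with base_port 1024 + 64 * (4 * 32 + 12).
	 */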

	/*
	 * First check in proto_chunkmask that we have an allocated PG
	 * chunk.  Then check in proto_pgmask that we have a valid PG
	 * pointer.
	 */
	pg = NULL;
	switch (proto) {
	case IPPROTO_TCP:
		if (ISSET32(alias->tcp_chunkmask, chunk_idx) &&
		    ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->tcp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	case IPPROTO_UDP:
		if (ISSET32(alias->udp_chunkmask, chunk_idx) &&
		    ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->udp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	case IPPROTO_ICMP:
		if (ISSET32(alias->icmp_chunkmask, chunk_idx) &&
		    ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->icmp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	default:
		panic("%s: wrong proto %d", __func__, proto);
	}
	if (pg == NULL)
		return (NULL);

	if (FREEMASK_ISSET(pg, faddr, state_idx))
		return (NULL);

	state = &STATES_CHUNK(pg, faddr)->state[state_idx];
	ck_pr_fence_load();
	if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY)
		return (state);
	return (NULL);
}

static int
nat64lsn_translate4(struct nat64lsn_cfg *cfg,
    const struct ipfw_flow_id *f_id, struct mbuf **mp)
{
	struct pfloghdr loghdr, *logdata;
	struct in6_addr src6;
	struct nat64lsn_state *state;
	struct nat64lsn_alias *alias;
	uint32_t addr, flags;
	uint16_t port, ts;
	int ret;
	uint8_t proto;

	addr = f_id->dst_ip;
	port = f_id->dst_port;
	proto = f_id->proto;
	if (addr < cfg->prefix4 || addr > cfg->pmask4) {
		NAT64STAT_INC(&cfg->base.stats, nomatch4);
		return (cfg->nomatch_verdict);
	}

	/* Check if protocol is supported */
	switch (proto) {
	case IPPROTO_ICMP:
		ret = inspect_icmp_mbuf(mp, &proto, &addr, &port);
		if (ret != 0) {
			if (ret == ENOMEM) {
				NAT64STAT_INC(&cfg->base.stats, nomem);
				return (IP_FW_DENY);
			}
			NAT64STAT_INC(&cfg->base.stats, noproto);
			return (cfg->nomatch_verdict);
		}
		if (addr < cfg->prefix4 || addr > cfg->pmask4) {
			NAT64STAT_INC(&cfg->base.stats, nomatch4);
			return (cfg->nomatch_verdict);
		}
		/* FALLTHROUGH */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		break;
	default:
		NAT64STAT_INC(&cfg->base.stats, noproto);
		return (cfg->nomatch_verdict);
	}

	alias = &ALIAS_BYHASH(cfg, addr);
	MPASS(addr == alias->addr);

	/* Check that we have a state for this port */
	state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip,
	    port, proto);
	if (state == NULL) {
		NAT64STAT_INC(&cfg->base.stats, nomatch4);
		return (cfg->nomatch_verdict);
	}

	/* TODO: Check flags to see if we need to do some static mapping */

	/* Update some state fields if needed */
	SET_AGE(ts);
	if (f_id->proto == IPPROTO_TCP)
		flags = convert_tcp_flags(f_id->_flags);
	else
		flags = 0;
	if (state->timestamp != ts)
		state->timestamp = ts;
	if ((state->flags & flags) != flags)
		state->flags |= flags;

	port = htons(state->sport);
	src6 = state->ip6_dst;

	if (cfg->base.flags & NAT64_LOG) {
		logdata = &loghdr;
		nat64lsn_log(logdata, *mp, AF_INET, state);
	} else
		logdata = NULL;

	/*
	 * We already have src6 with an embedded address, but it is
	 * possible that src_ip differs from state->ip_dst; this is why
	 * we do the embedding again.
	 */
	nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip));
	ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port,
	    &cfg->base, logdata);
	if (ret == NAT64SKIP)
		return (cfg->nomatch_verdict);
	if (ret == NAT64RETURN)
		*mp = NULL;
	return (IP_FW_DENY);
}

/*
 * Check if a particular state is stale and should be deleted.
 * Return 1 if true, 0 otherwise.
 */
static int
nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state)
{
	int age, ttl;

	/* State was marked as stale in previous pass. */
	if (ISSET32(state->flags, NAT64_BIT_STALE))
		return (1);

	/* State is not yet initialized; it is about to become READY */
	if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4))
		return (0);

	age = GET_AGE(state->timestamp);
	switch (state->proto) {
	case IPPROTO_TCP:
		if (ISSET32(state->flags, NAT64_BIT_TCP_FIN))
			ttl = cfg->st_close_ttl;
		else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB))
			ttl = cfg->st_estab_ttl;
		else
			ttl = cfg->st_syn_ttl;
		if (age > ttl)
			return (1);
		break;
	case IPPROTO_UDP:
		if (age > cfg->st_udp_ttl)
			return (1);
		break;
	case IPPROTO_ICMP:
		if (age > cfg->st_icmp_ttl)
			return (1);
		break;
	}
	return (0);
}
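
/*
 * Scan all states of a PG and expire stale ones using a two-pass
 * protocol: a state is first marked STALE and unlinked from the host's
 * hash, and only on the next pass is its freemask bit released.
 * Return 1 if the PG itself has been idle long enough to be deleted.
 */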
static int
nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg)
{
	struct nat64lsn_state *state;
	struct nat64lsn_host *host;
	uint64_t freemask;
	int c, i, update_age;

	update_age = 0;
	for (c = 0; c < pg->chunks_count; c++) {
		FREEMASK_COPY(pg, c, freemask);
		for (i = 0; i < 64; i++) {
			if (ISSET64(freemask, i))
				continue;
			state = &STATES_CHUNK(pg, c)->state[i];
			if (nat64lsn_check_state(cfg, state) == 0) {
				update_age = 1;
				continue;
			}
			/*
			 * Expire state:
			 * 1. Mark as STALE and unlink from host's hash.
			 * 2. Set bit in freemask.
			 */
			if (ISSET32(state->flags, NAT64_BIT_STALE)) {
				/*
				 * State was marked as STALE in previous
				 * pass. Now it is safe to release it.
				 */
				state->flags = 0;
				ck_pr_fence_store();
				FREEMASK_BTS(pg, c, i);
				NAT64STAT_INC(&cfg->base.stats, sdeleted);
				continue;
			}
			MPASS(state->flags & NAT64_FLAG_READY);

			host = state->host;
			HOST_LOCK(host);
			CK_SLIST_REMOVE(&STATE_HASH(host, state->hval),
			    state, nat64lsn_state, entries);
			host->states_count--;
			HOST_UNLOCK(host);

			/* Reset READY flag */
			ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4);
			/* And set STALE flag */
			ck_pr_bts_32(&state->flags, NAT64_BIT_STALE);
			ck_pr_fence_store();
			/*
			 * Now translate6 will not use this state; wait
			 * until it becomes safe for translate4, then mark
			 * the state as free.
			 */
		}
	}

	/*
	 * If we have some alive states, update the timestamp.
	 */
	if (update_age)
		SET_AGE(pg->timestamp);

	if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
		return (0);

	return (1);
}

static void
nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg,
    struct nat64lsn_pg_slist *portgroups)
{
	struct nat64lsn_alias *alias;
	struct nat64lsn_pg *pg, *tpg, *firstpg, **pgptr;
	uint32_t *pgmask, *pgidx;
	int i, idx;

	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
		alias = &cfg->aliases[i];
		CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) {
			if (nat64lsn_maintain_pg(cfg, pg) == 0)
				continue;
			/* Always keep first PG */
			if (pg->base_port == NAT64_MIN_PORT)
				continue;
			/*
			 * PG is expired, unlink it and schedule for
			 * deferred destroying.
			 */
			idx = (pg->base_port - NAT64_MIN_PORT) / 64;
			switch (pg->proto) {
			case IPPROTO_TCP:
				pgmask = alias->tcp_pgmask;
				pgptr = &alias->tcp_pg;
				pgidx = &alias->tcp_pgidx;
				firstpg = alias->tcp[0]->pgptr[0];
				break;
			case IPPROTO_UDP:
				pgmask = alias->udp_pgmask;
				pgptr = &alias->udp_pg;
				pgidx = &alias->udp_pgidx;
				firstpg = alias->udp[0]->pgptr[0];
				break;
			case IPPROTO_ICMP:
				pgmask = alias->icmp_pgmask;
				pgptr = &alias->icmp_pg;
				pgidx = &alias->icmp_pgidx;
				firstpg = alias->icmp[0]->pgptr[0];
				break;
			}
			/* Reset the corresponding bit in pgmask array. */
			ck_pr_btr_32(&pgmask[idx / 32], idx % 32);
			ck_pr_fence_store();
			/* If last used PG points to this PG, reset it. */
			ck_pr_cas_ptr(pgptr, pg, firstpg);
			ck_pr_cas_32(pgidx, idx, 0);
			/* Unlink PG from alias's chain */
			ALIAS_LOCK(alias);
			CK_SLIST_REMOVE(&alias->portgroups, pg,
			    nat64lsn_pg, entries);
			alias->portgroups_count--;
			ALIAS_UNLOCK(alias);
			/* And link to job's chain for deferred destroying */
			NAT64STAT_INC(&cfg->base.stats, spgdeleted);
			CK_SLIST_INSERT_HEAD(portgroups, pg, entries);
		}
	}
}
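
/*
 * Expire idle hosts, also in two passes: a host is first marked with
 * NAT64LSN_DEADHOST and is unlinked on the next pass only if no new
 * states were created for it in the meantime.
 */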
static void
nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg,
    struct nat64lsn_hosts_slist *hosts)
{
	struct nat64lsn_host *host, *tmp;
	int i;

	for (i = 0; i < cfg->hosts_hashsize; i++) {
		CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i],
		    entries, tmp) {
			/* Was the host marked dead in a previous pass? */
			if (host->flags & NAT64LSN_DEADHOST) {
				if (host->states_count > 0) {
					host->flags &= ~NAT64LSN_DEADHOST;
					continue;
				}
				/*
				 * Unlink host from hash table and schedule
				 * it for deferred destroying.
				 */
				CFG_LOCK(cfg);
				CK_SLIST_REMOVE(&cfg->hosts_hash[i], host,
				    nat64lsn_host, entries);
				cfg->hosts_count--;
				CFG_UNLOCK(cfg);
				CK_SLIST_INSERT_HEAD(hosts, host, entries);
				continue;
			}
			if (GET_AGE(host->timestamp) < cfg->host_delete_delay)
				continue;
			if (host->states_count > 0)
				continue;
			/* Mark host as going to be expired in next pass */
			host->flags |= NAT64LSN_DEADHOST;
			ck_pr_fence_store();
		}
	}
}

static struct nat64lsn_pgchunk*
nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg)
{
#if 0
	struct nat64lsn_alias *alias;
	struct nat64lsn_pgchunk *chunk;
	uint32_t pgmask;
	int i, c;

	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
		alias = &cfg->aliases[i];
		if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay)
			continue;
		/* Always keep single chunk allocated */
		for (c = 1; c < 32; c++) {
			if ((alias->tcp_chunkmask & (1 << c)) == 0)
				break;
			chunk = ck_pr_load_ptr(&alias->tcp[c]);
			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
				continue;
			ck_pr_btr_32(&alias->tcp_chunkmask, c);
			ck_pr_fence_load();
			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
				continue;
		}
	}
#endif
	return (NULL);
}

#if 0
static void
nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg)
{
	struct nat64lsn_host *h;
	struct nat64lsn_states_slist *hash;
	int i, j, hsize;

	for (i = 0; i < cfg->hosts_hashsize; i++) {
		CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) {
			if (h->states_count / 2 < h->states_hashsize ||
			    h->states_hashsize >= NAT64LSN_MAX_HSIZE)
				continue;
			hsize = h->states_hashsize * 2;
			hash = malloc(sizeof(*hash) * hsize, M_NAT64LSN,
			    M_NOWAIT);
			if (hash == NULL)
				continue;
			for (j = 0; j < hsize; j++)
				CK_SLIST_INIT(&hash[j]);

			ck_pr_bts_32(&h->flags, NAT64LSN_GROWHASH);
		}
	}
}
#endif

/*
 * This procedure is used to perform various maintenance
 * of the dynamic hash lists. Currently it is called every 4 seconds.
 */
static void
nat64lsn_periodic(void *data)
{
	struct nat64lsn_job_item *ji;
	struct nat64lsn_cfg *cfg;

	cfg = (struct nat64lsn_cfg *)data;
	CURVNET_SET(cfg->vp);
	if (cfg->hosts_count > 0) {
		ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);
		if (ji != NULL) {
			ji->jtype = JTYPE_DESTROY;
			CK_SLIST_INIT(&ji->hosts);
			CK_SLIST_INIT(&ji->portgroups);
			nat64lsn_expire_hosts(cfg, &ji->hosts);
			nat64lsn_expire_portgroups(cfg, &ji->portgroups);
			ji->pgchunk = nat64lsn_expire_pgchunk(cfg);
			NAT64LSN_EPOCH_CALL(&ji->epoch_ctx,
			    nat64lsn_job_destroy);
		} else
			NAT64STAT_INC(&cfg->base.stats, jnomem);
	}
	callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
	CURVNET_RESTORE();
}

#define	ALLOC_ERROR(stage, type)	((stage) ? 10 * (type) + (stage) : 0)
#define	HOST_ERROR(stage)	ALLOC_ERROR(stage, 1)
#define	PG_ERROR(stage)		ALLOC_ERROR(stage, 2)
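
/*
 * HOST_ERROR(n) and PG_ERROR(n), used by the allocation functions
 * below, encode the failed allocation stage as 10 * type + stage,
 * e.g. HOST_ERROR(2) means host allocation failed while creating the
 * states hash.  Stage 0 means success.
 */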
static int
nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
{
	char a[INET6_ADDRSTRLEN];
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_host *host;
	struct nat64lsn_state *state;
	uint32_t hval, data[2];
	int i;

	/* Check that host was not yet added. */
	NAT64LSN_EPOCH_ASSERT();
	CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) {
		if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) {
			/* The host was allocated in a previous call. */
			ji->host = host;
			goto get_state;
		}
	}

	host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
	if (ji->host == NULL)
		return (HOST_ERROR(1));

	host->states_hashsize = NAT64LSN_HSIZE;
	host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) *
	    host->states_hashsize, M_NAT64LSN, M_NOWAIT);
	if (host->states_hash == NULL) {
		uma_zfree(nat64lsn_host_zone, host);
		return (HOST_ERROR(2));
	}

	link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT);
	if (link == NULL) {
		free(host->states_hash, M_NAT64LSN);
		uma_zfree(nat64lsn_host_zone, host);
		return (HOST_ERROR(3));
	}

	/* Initialize */
	HOST_LOCK_INIT(host);
	SET_AGE(host->timestamp);
	host->addr = ji->f_id.src_ip6;
	host->hval = ji->src6_hval;
	host->flags = 0;
	host->states_count = 0;
	host->states_hashsize = NAT64LSN_HSIZE;
	CK_SLIST_INIT(&host->aliases);
	for (i = 0; i < host->states_hashsize; i++)
		CK_SLIST_INIT(&host->states_hash[i]);

	/* Determine alias from flow hash. */
	hval = ALIASLINK_HVAL(cfg, &ji->f_id);
	link->alias = &ALIAS_BYHASH(cfg, hval);
	CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries);

	ALIAS_LOCK(link->alias);
	CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries);
	link->alias->hosts_count++;
	ALIAS_UNLOCK(link->alias);

	CFG_LOCK(cfg);
	CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries);
	cfg->hosts_count++;
	CFG_UNLOCK(cfg);

get_state:
	data[0] = ji->faddr;
	data[1] = (ji->f_id.dst_port << 16) | ji->port;
	ji->state_hval = hval = STATE_HVAL(cfg, data);
	state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval,
	    ji->faddr, ji->port, ji->proto);
	/*
	 * We failed to obtain a new state; the alias we used needs
	 * a new PG.
	 * XXX: or another alias should be used.
	 */
	if (state == NULL) {
		/* Try to allocate new PG */
		if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
			return (HOST_ERROR(4));
		/* We assume that nat64lsn_alloc_pg() got the state */
	} else
		ji->state = state;

	ji->done = 1;
	DPRINTF(DP_OBJ, "ALLOC HOST %s %p",
	    inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host);
	return (HOST_ERROR(0));
}

static int
nat64lsn_find_pg_place(uint32_t *data)
{
	int i;

	for (i = 0; i < 32; i++) {
		if (~data[i] == 0)
			continue;
		return (i * 32 + ffs(~data[i]) - 1);
	}
	return (-1);
}

static int
nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg,
    struct nat64lsn_alias *alias, uint32_t *chunkmask,
    uint32_t *pgmask, struct nat64lsn_pgchunk **chunks,
    struct nat64lsn_pg **pgptr, uint8_t proto)
{
	struct nat64lsn_pg *pg;
	int i, pg_idx, chunk_idx;

	/* Find a place in pgchunk where PG can be added */
	pg_idx = nat64lsn_find_pg_place(pgmask);
	if (pg_idx < 0)	/* no more PGs */
		return (PG_ERROR(1));
	/* Check that we have an allocated pgchunk for the given PG index */
	chunk_idx = pg_idx / 32;
	if (!ISSET32(*chunkmask, chunk_idx)) {
		chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone,
		    M_NOWAIT);
		if (chunks[chunk_idx] == NULL)
			return (PG_ERROR(2));
		ck_pr_bts_32(chunkmask, chunk_idx);
		ck_pr_fence_store();
	}
	/* Allocate PG and states chunks */
	pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
	if (pg == NULL)
		return (PG_ERROR(3));
	pg->chunks_count = cfg->states_chunks;
	if (pg->chunks_count > 1) {
		pg->freemask_chunk = malloc(pg->chunks_count *
		    sizeof(uint64_t), M_NAT64LSN, M_NOWAIT);
		if (pg->freemask_chunk == NULL) {
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(4));
		}
		pg->states_chunk = malloc(pg->chunks_count *
		    sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN,
		    M_NOWAIT | M_ZERO);
		if (pg->states_chunk == NULL) {
			free(pg->freemask_chunk, M_NAT64LSN);
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(5));
		}
		for (i = 0; i < pg->chunks_count; i++) {
			pg->states_chunk[i] = uma_zalloc(
			    nat64lsn_state_zone, M_NOWAIT);
			if (pg->states_chunk[i] == NULL)
				goto states_failed;
		}
		memset(pg->freemask_chunk, 0xff,
		    sizeof(uint64_t) * pg->chunks_count);
	} else {
		pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT);
		if (pg->states == NULL) {
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(6));
		}
		memset(&pg->freemask64, 0xff, sizeof(uint64_t));
	}

	/* Initialize PG and hook it to pgchunk */
	SET_AGE(pg->timestamp);
	pg->proto = proto;
	pg->base_port = NAT64_MIN_PORT + 64 * pg_idx;
	ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg);
	ck_pr_fence_store();
	ck_pr_bts_32(&pgmask[pg_idx / 32], pg_idx % 32);
	ck_pr_store_ptr(pgptr, pg);

	ALIAS_LOCK(alias);
	CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries);
	SET_AGE(alias->timestamp);
	alias->portgroups_count++;
	ALIAS_UNLOCK(alias);
	NAT64STAT_INC(&cfg->base.stats, spgcreated);
	return (PG_ERROR(0));

states_failed:
	for (i = 0; i < pg->chunks_count; i++)
		uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);
	free(pg->freemask_chunk, M_NAT64LSN);
	free(pg->states_chunk, M_NAT64LSN);
	uma_zfree(nat64lsn_pg_zone, pg);
	return (PG_ERROR(7));
}
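
/*
 * Allocate a new PG for the alias currently used by the host from @ji
 * and, on success, immediately grab a state from it.
 */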
static int
nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
{
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_alias *alias;
	int ret;

	link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id);
	if (link == NULL)
		return (PG_ERROR(1));

	/*
	 * TODO: check that we did not already allocate a PG in
	 * a previous call.
	 */

	ret = 0;
	alias = link->alias;
	/* Find a place in pgchunk where PG can be added */
	switch (ji->proto) {
	case IPPROTO_TCP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->tcp_chunkmask, alias->tcp_pgmask,
		    alias->tcp, &alias->tcp_pg, ji->proto);
		break;
	case IPPROTO_UDP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->udp_chunkmask, alias->udp_pgmask,
		    alias->udp, &alias->udp_pg, ji->proto);
		break;
	case IPPROTO_ICMP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->icmp_chunkmask, alias->icmp_pgmask,
		    alias->icmp, &alias->icmp_pg, ji->proto);
		break;
	default:
		panic("%s: wrong proto %d", __func__, ji->proto);
	}
	if (ret == PG_ERROR(1)) {
		/*
		 * PG_ERROR(1) means that the alias lacks free PGs.
		 * XXX: try next alias.
		 */
		printf("NAT64LSN: %s: failed to obtain PG\n",
		    __func__);
		return (ret);
	}
	if (ret == PG_ERROR(0)) {
		ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id,
		    ji->state_hval, ji->faddr, ji->port, ji->proto);
		if (ji->state == NULL)
			ret = PG_ERROR(8);
		else
			ji->done = 1;
	}
	return (ret);
}

static void
nat64lsn_do_request(void *data)
{
	struct epoch_tracker et;
	struct nat64lsn_job_head jhead;
	struct nat64lsn_job_item *ji, *ji2;
	struct nat64lsn_cfg *cfg;
	int jcount;
	uint8_t flags;

	cfg = (struct nat64lsn_cfg *)data;
	if (cfg->jlen == 0)
		return;

	CURVNET_SET(cfg->vp);
	STAILQ_INIT(&jhead);

	/* Grab queue */
	JQUEUE_LOCK();
	STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item);
	jcount = cfg->jlen;
	cfg->jlen = 0;
	JQUEUE_UNLOCK();

	/* TODO: check if we need to resize hash */

	NAT64STAT_INC(&cfg->base.stats, jcalls);
	DPRINTF(DP_JQUEUE, "count=%d", jcount);

	/*
	 * TODO:
	 * What we should do here is to build a hash
	 * to ensure we don't have lots of duplicate requests.
	 * Skip this for now.
	 *
	 * TODO: Limit per-call number of items
	 */

	NAT64LSN_EPOCH_ENTER(et);
	STAILQ_FOREACH(ji, &jhead, entries) {
		switch (ji->jtype) {
		case JTYPE_NEWHOST:
			if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0))
				NAT64STAT_INC(&cfg->base.stats, jhostfails);
			break;
		case JTYPE_NEWPORTGROUP:
			if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
				NAT64STAT_INC(&cfg->base.stats, jportfails);
			break;
		default:
			continue;
		}
		if (ji->done != 0) {
			flags = ji->proto != IPPROTO_TCP ? 0 :
			    convert_tcp_flags(ji->f_id._flags);
			nat64lsn_translate6_internal(cfg, &ji->m,
			    ji->state, flags);
			NAT64STAT_INC(&cfg->base.stats, jreinjected);
		}
	}
	NAT64LSN_EPOCH_EXIT(et);

	ji = STAILQ_FIRST(&jhead);
	while (ji != NULL) {
		ji2 = STAILQ_NEXT(ji, entries);
		/*
		 * In any case we must free the mbuf if the translator
		 * did not consume it.
		 */
		m_freem(ji->m);
		uma_zfree(nat64lsn_job_zone, ji);
		ji = ji2;
	}
	CURVNET_RESTORE();
}
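
/*
 * Job submission path: the data path allocates a job item, fills it in
 * and queues it with nat64lsn_enqueue_job(), which arms jcallout to
 * fire one tick later.  nat64lsn_do_request() then grabs the whole
 * queue under JQUEUE_LOCK() and services it in one epoch section.
 */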
static struct nat64lsn_job_item *
nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype)
{
	struct nat64lsn_job_item *ji;

	/*
	 * Do not try to lock a possibly contested mutex if we're near the
	 * limit. Drop the packet instead.
	 */
	ji = NULL;
	if (cfg->jlen >= cfg->jmaxlen)
		NAT64STAT_INC(&cfg->base.stats, jmaxlen);
	else {
		ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);
		if (ji == NULL)
			NAT64STAT_INC(&cfg->base.stats, jnomem);
	}
	if (ji == NULL) {
		NAT64STAT_INC(&cfg->base.stats, dropped);
		DPRINTF(DP_DROPS, "failed to create job");
	} else {
		ji->jtype = jtype;
		ji->done = 0;
	}
	return (ji);
}

static void
nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
{

	JQUEUE_LOCK();
	STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries);
	NAT64STAT_INC(&cfg->base.stats, jrequests);
	cfg->jlen++;

	if (callout_pending(&cfg->jcallout) == 0)
		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
	JQUEUE_UNLOCK();
}

static void
nat64lsn_job_destroy(epoch_context_t ctx)
{
	struct nat64lsn_job_item *ji;
	struct nat64lsn_host *host;
	struct nat64lsn_pg *pg;
	int i;

	ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx);
	MPASS(ji->jtype == JTYPE_DESTROY);
	while (!CK_SLIST_EMPTY(&ji->hosts)) {
		host = CK_SLIST_FIRST(&ji->hosts);
		CK_SLIST_REMOVE_HEAD(&ji->hosts, entries);
		if (host->states_count > 0) {
			/*
			 * XXX: states have been created
			 * during host deletion.
			 */
			printf("NAT64LSN: %s: destroying host with %d "
			    "states\n", __func__, host->states_count);
		}
		nat64lsn_destroy_host(host);
	}
	while (!CK_SLIST_EMPTY(&ji->portgroups)) {
		pg = CK_SLIST_FIRST(&ji->portgroups);
		CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries);
		for (i = 0; i < pg->chunks_count; i++) {
			if (FREEMASK_BITCOUNT(pg, i) != 64) {
				/*
				 * XXX: a state has been created during
				 * PG deletion.
				 */
1341 */ 1342 printf("NAT64LSN: %s: destroying PG %p " 1343 "with non-empty chunk %d\n", __func__, 1344 pg, i); 1345 } 1346 } 1347 nat64lsn_destroy_pg(pg); 1348 } 1349 uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk); 1350 uma_zfree(nat64lsn_job_zone, ji); 1351 } 1352 1353 static int 1354 nat64lsn_request_host(struct nat64lsn_cfg *cfg, 1355 const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, 1356 in_addr_t faddr, uint16_t port, uint8_t proto) 1357 { 1358 struct nat64lsn_job_item *ji; 1359 1360 ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST); 1361 if (ji != NULL) { 1362 ji->m = *mp; 1363 ji->f_id = *f_id; 1364 ji->faddr = faddr; 1365 ji->port = port; 1366 ji->proto = proto; 1367 ji->src6_hval = hval; 1368 1369 nat64lsn_enqueue_job(cfg, ji); 1370 NAT64STAT_INC(&cfg->base.stats, jhostsreq); 1371 *mp = NULL; 1372 } 1373 return (IP_FW_DENY); 1374 } 1375 1376 static int 1377 nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, 1378 const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, 1379 in_addr_t faddr, uint16_t port, uint8_t proto) 1380 { 1381 struct nat64lsn_job_item *ji; 1382 1383 ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP); 1384 if (ji != NULL) { 1385 ji->m = *mp; 1386 ji->f_id = *f_id; 1387 ji->faddr = faddr; 1388 ji->port = port; 1389 ji->proto = proto; 1390 ji->state_hval = hval; 1391 ji->host = host; 1392 1393 nat64lsn_enqueue_job(cfg, ji); 1394 NAT64STAT_INC(&cfg->base.stats, jportreq); 1395 *mp = NULL; 1396 } 1397 return (IP_FW_DENY); 1398 } 1399 1400 static int 1401 nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, 1402 struct nat64lsn_state *state, uint8_t flags) 1403 { 1404 struct pfloghdr loghdr, *logdata; 1405 int ret; 1406 uint16_t ts; 1407 1408 /* Update timestamp and flags if needed */ 1409 SET_AGE(ts); 1410 if (state->timestamp != ts) 1411 state->timestamp = ts; 1412 if ((state->flags & flags) != 0) 1413 state->flags |= flags; 1414 1415 if (cfg->base.flags & NAT64_LOG) { 1416 logdata = &loghdr; 1417 nat64lsn_log(logdata, *mp, AF_INET6, state); 1418 } else 1419 logdata = NULL; 1420 1421 ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src), 1422 htons(state->aport), &cfg->base, logdata); 1423 if (ret == NAT64SKIP) 1424 return (cfg->nomatch_verdict); 1425 if (ret == NAT64RETURN) 1426 *mp = NULL; 1427 return (IP_FW_DENY); 1428 } 1429 1430 static int 1431 nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, 1432 struct mbuf **mp) 1433 { 1434 struct nat64lsn_state *state; 1435 struct nat64lsn_host *host; 1436 struct icmp6_hdr *icmp6; 1437 uint32_t addr, hval, data[2]; 1438 int offset, proto; 1439 uint16_t port; 1440 uint8_t flags; 1441 1442 /* Check if protocol is supported */ 1443 port = f_id->src_port; 1444 proto = f_id->proto; 1445 switch (f_id->proto) { 1446 case IPPROTO_ICMPV6: 1447 /* 1448 * For ICMPv6 echo reply/request we use icmp6_id as 1449 * local port. 
static int
nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,
    struct mbuf **mp)
{
	struct nat64lsn_state *state;
	struct nat64lsn_host *host;
	struct icmp6_hdr *icmp6;
	uint32_t addr, hval, data[2];
	int offset, proto;
	uint16_t port;
	uint8_t flags;

	/* Check if protocol is supported */
	port = f_id->src_port;
	proto = f_id->proto;
	switch (f_id->proto) {
	case IPPROTO_ICMPV6:
		/*
		 * For ICMPv6 echo reply/request we use icmp6_id as
		 * local port.
		 */
		offset = 0;
		proto = nat64_getlasthdr(*mp, &offset);
		if (proto < 0) {
			NAT64STAT_INC(&cfg->base.stats, dropped);
			DPRINTF(DP_DROPS, "mbuf isn't contiguous");
			return (IP_FW_DENY);
		}
		if (proto == IPPROTO_ICMPV6) {
			icmp6 = mtodo(*mp, offset);
			if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||
			    icmp6->icmp6_type == ICMP6_ECHO_REPLY)
				port = ntohs(icmp6->icmp6_id);
		}
		proto = IPPROTO_ICMP;
		/* FALLTHROUGH */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		break;
	default:
		NAT64STAT_INC(&cfg->base.stats, noproto);
		return (cfg->nomatch_verdict);
	}

	/* Extract IPv4 from destination IPv6 address */
	addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen);
	if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) {
		char a[INET_ADDRSTRLEN];

		NAT64STAT_INC(&cfg->base.stats, dropped);
		DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s",
		    inet_ntop(AF_INET, &addr, a, sizeof(a)));
		return (IP_FW_DENY);	/* XXX: add extra stats? */
	}

	/* Try to find host */
	hval = HOST_HVAL(cfg, &f_id->src_ip6);
	CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) {
		if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr))
			break;
	}
	/* We use IPv4 address in host byte order */
	addr = ntohl(addr);
	if (host == NULL)
		return (nat64lsn_request_host(cfg, f_id, mp,
		    hval, addr, port, proto));

	flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags);

	data[0] = addr;
	data[1] = (f_id->dst_port << 16) | port;
	hval = STATE_HVAL(cfg, data);
	state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr,
	    port, proto);
	if (state == NULL)
		return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr,
		    port, proto));
	return (nat64lsn_translate6_internal(cfg, mp, state, flags));
}
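
/*
 * A sketch of a typical ruleset that lands packets here (see ipfw(8)
 * for the exact syntax and options):
 *
 *	ipfw nat64lsn NAT64 create prefix4 198.51.100.0/26
 *	ipfw add nat64lsn NAT64 ip from any to 64:ff9b::/96 in
 */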

/*
 * Main dataplane entry point.
 */
int
ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
    ipfw_insn *cmd, int *done)
{
	struct nat64lsn_cfg *cfg;
	ipfw_insn *icmd;
	int ret;

	IPFW_RLOCK_ASSERT(ch);

	*done = 0;	/* continue the search in case of failure */
	icmd = cmd + 1;
	if (cmd->opcode != O_EXTERNAL_ACTION ||
	    cmd->arg1 != V_nat64lsn_eid ||
	    icmd->opcode != O_EXTERNAL_INSTANCE ||
	    (cfg = NAT64_LOOKUP(ch, icmd)) == NULL)
		return (IP_FW_DENY);

	*done = 1;	/* terminate the search */

	switch (args->f_id.addr_type) {
	case 4:
		ret = nat64lsn_translate4(cfg, &args->f_id, &args->m);
		break;
	case 6:
		/*
		 * Check that destination IPv6 address matches our prefix6.
		 */
		if ((cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 &&
		    memcmp(&args->f_id.dst_ip6, &cfg->base.plat_prefix,
		    cfg->base.plat_plen / 8) != 0) {
			ret = cfg->nomatch_verdict;
			break;
		}
		ret = nat64lsn_translate6(cfg, &args->f_id, &args->m);
		break;
	default:
		ret = cfg->nomatch_verdict;
	}

	if (ret != IP_FW_PASS && args->m != NULL) {
		m_freem(args->m);
		args->m = NULL;
	}
	return (ret);
}

static int
nat64lsn_state_ctor(void *mem, int size, void *arg, int flags)
{
	struct nat64lsn_states_chunk *chunk;
	int i;

	chunk = (struct nat64lsn_states_chunk *)mem;
	for (i = 0; i < 64; i++)
		chunk->state[i].flags = 0;
	return (0);
}

void
nat64lsn_init_internal(void)
{

	nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts",
	    sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks",
	    sizeof(struct nat64lsn_pgchunk), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups",
	    sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links",
	    sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	nat64lsn_state_zone = uma_zcreate("NAT64LSN states",
	    sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs",
	    sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	JQUEUE_LOCK_INIT();
}

void
nat64lsn_uninit_internal(void)
{

	/* XXX: epoch_task drain */
	JQUEUE_LOCK_DESTROY();
	uma_zdestroy(nat64lsn_host_zone);
	uma_zdestroy(nat64lsn_pgchunk_zone);
	uma_zdestroy(nat64lsn_pg_zone);
	uma_zdestroy(nat64lsn_aliaslink_zone);
	uma_zdestroy(nat64lsn_state_zone);
	uma_zdestroy(nat64lsn_job_zone);
}

void
nat64lsn_start_instance(struct nat64lsn_cfg *cfg)
{

	CALLOUT_LOCK(cfg);
	callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,
	    nat64lsn_periodic, cfg);
	CALLOUT_UNLOCK(cfg);
}

struct nat64lsn_cfg *
nat64lsn_init_instance(struct ip_fw_chain *ch, in_addr_t prefix, int plen)
{
	struct nat64lsn_cfg *cfg;
	struct nat64lsn_alias *alias;
	int i, naddr;

	cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN,
	    M_WAITOK | M_ZERO);

	CFG_LOCK_INIT(cfg);
	CALLOUT_LOCK_INIT(cfg);
	STAILQ_INIT(&cfg->jhead);
	cfg->vp = curvnet;
	COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK);

	cfg->hash_seed = arc4random();
	cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE;
	cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) *
	    cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO);
	for (i = 0; i < cfg->hosts_hashsize; i++)
		CK_SLIST_INIT(&cfg->hosts_hash[i]);

	naddr = 1 << (32 - plen);
	cfg->prefix4 = prefix;
	cfg->pmask4 = prefix | (naddr - 1);
	cfg->plen4 = plen;
	cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr,
	    M_NAT64LSN, M_WAITOK | M_ZERO);
	for (i = 0; i < naddr; i++) {
		alias = &cfg->aliases[i];
		alias->addr = prefix + i; /* host byte order */
		CK_SLIST_INIT(&alias->hosts);
		ALIAS_LOCK_INIT(alias);
	}
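
	/*
	 * The periodic callout drives expiration and runs under its own
	 * mutex; the jobs callout is MPSAFE and is armed on demand by
	 * nat64lsn_enqueue_job().
	 */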
	callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0);
	callout_init(&cfg->jcallout, CALLOUT_MPSAFE);

	return (cfg);
}

static void
nat64lsn_destroy_pg(struct nat64lsn_pg *pg)
{
	int i;

	if (pg->chunks_count == 1) {
		uma_zfree(nat64lsn_state_zone, pg->states);
	} else {
		for (i = 0; i < pg->chunks_count; i++)
			uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);
		free(pg->states_chunk, M_NAT64LSN);
		free(pg->freemask_chunk, M_NAT64LSN);
	}
	uma_zfree(nat64lsn_pg_zone, pg);
}

static void
nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg,
    struct nat64lsn_alias *alias)
{
	struct nat64lsn_pg *pg;
	int i;

	while (!CK_SLIST_EMPTY(&alias->portgroups)) {
		pg = CK_SLIST_FIRST(&alias->portgroups);
		CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries);
		nat64lsn_destroy_pg(pg);
	}
	for (i = 0; i < 32; i++) {
		if (ISSET32(alias->tcp_chunkmask, i))
			uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]);
		if (ISSET32(alias->udp_chunkmask, i))
			uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]);
		if (ISSET32(alias->icmp_chunkmask, i))
			uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]);
	}
	ALIAS_LOCK_DESTROY(alias);
}

static void
nat64lsn_destroy_host(struct nat64lsn_host *host)
{
	struct nat64lsn_aliaslink *link;

	while (!CK_SLIST_EMPTY(&host->aliases)) {
		link = CK_SLIST_FIRST(&host->aliases);
		CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries);

		ALIAS_LOCK(link->alias);
		CK_SLIST_REMOVE(&link->alias->hosts, link,
		    nat64lsn_aliaslink, alias_entries);
		link->alias->hosts_count--;
		ALIAS_UNLOCK(link->alias);

		uma_zfree(nat64lsn_aliaslink_zone, link);
	}
	HOST_LOCK_DESTROY(host);
	free(host->states_hash, M_NAT64LSN);
	uma_zfree(nat64lsn_host_zone, host);
}

void
nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg)
{
	struct nat64lsn_host *host;
	int i;

	CALLOUT_LOCK(cfg);
	callout_drain(&cfg->periodic);
	CALLOUT_UNLOCK(cfg);
	callout_drain(&cfg->jcallout);

	for (i = 0; i < cfg->hosts_hashsize; i++) {
		while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) {
			host = CK_SLIST_FIRST(&cfg->hosts_hash[i]);
			CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries);
			nat64lsn_destroy_host(host);
		}
	}

	for (i = 0; i < (1 << (32 - cfg->plen4)); i++)
		nat64lsn_destroy_alias(cfg, &cfg->aliases[i]);

	CALLOUT_LOCK_DESTROY(cfg);
	CFG_LOCK_DESTROY(cfg);
	COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS);
	free(cfg->hosts_hash, M_NAT64LSN);
	free(cfg->aliases, M_NAT64LSN);
	free(cfg, M_NAT64LSN);
}