xref: /freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c (revision 4a77657cbc011ea657ccb079fff6b58b295eccb0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015-2020 Yandex LLC
5  * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  * Copyright (c) 2016-2020 Andrey V. Elsukov <ae@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/counter.h>
33 #include <sys/ck.h>
34 #include <sys/epoch.h>
35 #include <sys/errno.h>
36 #include <sys/hash.h>
37 #include <sys/kernel.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/module.h>
42 #include <sys/rmlock.h>
43 #include <sys/socket.h>
44 #include <sys/syslog.h>
45 #include <sys/sysctl.h>
46 
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/if_pflog.h>
50 #include <net/pfil.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/ip.h>
54 #include <netinet/ip_var.h>
55 #include <netinet/ip_fw.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <netinet/ip_icmp.h>
59 #include <netinet/tcp.h>
60 #include <netinet/udp.h>
61 #include <netinet6/in6_var.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet6/ip_fw_nat64.h>
64 
65 #include <netpfil/ipfw/ip_fw_private.h>
66 #include <netpfil/pf/pf.h>
67 
68 #include "nat64lsn.h"
69 
70 MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");
71 
72 #define	NAT64LSN_EPOCH_ENTER(et)  NET_EPOCH_ENTER(et)
73 #define	NAT64LSN_EPOCH_EXIT(et)   NET_EPOCH_EXIT(et)
74 #define	NAT64LSN_EPOCH_ASSERT()   NET_EPOCH_ASSERT()
75 #define	NAT64LSN_EPOCH_CALL(c, f) NET_EPOCH_CALL((f), (c))
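/*
 * Note: all datapath lookups below run under the network epoch, and
 * object teardown is deferred through NET_EPOCH_CALL() so that readers
 * traversing the lock-free CK_SLIST chains never see freed memory.
 */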
76 
77 static uma_zone_t nat64lsn_host_zone;
78 static uma_zone_t nat64lsn_pgchunk_zone;
79 static uma_zone_t nat64lsn_pg_zone;
80 static uma_zone_t nat64lsn_aliaslink_zone;
81 static uma_zone_t nat64lsn_state_zone;
82 static uma_zone_t nat64lsn_job_zone;
83 
84 static void nat64lsn_periodic(void *data);
85 #define	PERIODIC_DELAY		4
86 #define	NAT64_LOOKUP(chain, cmd)	\
87     (struct nat64lsn_instance *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx)
88 /*
89  * Delayed job queue, used to create new hosts
90  * and new portgroups
91  */
92 enum nat64lsn_jtype {
93 	JTYPE_NEWHOST = 1,
94 	JTYPE_NEWPORTGROUP,
95 	JTYPE_DESTROY,
96 };
97 
98 struct nat64lsn_job_item {
99 	STAILQ_ENTRY(nat64lsn_job_item)	entries;
100 	enum nat64lsn_jtype	jtype;
101 
102 	union {
103 		struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */
104 			struct mbuf		*m;
105 			struct nat64lsn_host	*host;
106 			struct nat64lsn_state	*state;
107 			uint32_t		src6_hval;
108 			uint32_t		state_hval;
109 			struct ipfw_flow_id	f_id;
110 			in_addr_t		faddr;
111 			uint16_t		port;
112 			uint8_t			proto;
113 			uint8_t			done;
114 		};
115 		struct { /* used by JTYPE_DESTROY */
116 			struct nat64lsn_hosts_slist	hosts;
117 			struct nat64lsn_pg_slist	portgroups;
118 			struct nat64lsn_pgchunk		*pgchunk;
119 			struct epoch_context		epoch_ctx;
120 		};
121 	};
122 };
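/*
 * Job flow sketch: when the IPv6 datapath lacks a host or a free
 * state, the mbuf is attached to a JTYPE_NEWHOST/JTYPE_NEWPORTGROUP
 * item and queued; nat64lsn_do_request() then allocates the missing
 * objects inside an epoch section and re-injects the packet on
 * success. JTYPE_DESTROY items carry expired hosts and portgroups
 * through deferred, epoch-based destruction.
 */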
123 
124 static struct mtx jmtx;
125 #define	JQUEUE_LOCK_INIT()	mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
126 #define	JQUEUE_LOCK_DESTROY()	mtx_destroy(&jmtx)
127 #define	JQUEUE_LOCK()		mtx_lock(&jmtx)
128 #define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)
129 
130 static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg,
131     struct nat64lsn_job_item *ji);
132 static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg,
133     struct nat64lsn_job_item *ji);
134 static struct nat64lsn_job_item *nat64lsn_create_job(
135     struct nat64lsn_cfg *cfg, int jtype);
136 static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
137     struct nat64lsn_job_item *ji);
138 static void nat64lsn_job_destroy(epoch_context_t ctx);
139 static void nat64lsn_destroy_host(struct nat64lsn_host *host);
140 static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg);
141 
142 static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
143     const struct ipfw_flow_id *f_id, struct mbuf **mp);
144 static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
145     struct ipfw_flow_id *f_id, struct mbuf **mp);
146 static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg,
147     struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags);
148 
149 #define	NAT64_BIT_TCP_FIN	0	/* FIN was seen */
150 #define	NAT64_BIT_TCP_SYN	1	/* First syn in->out */
151 #define	NAT64_BIT_TCP_ESTAB	2	/* Packet with Ack */
152 #define	NAT64_BIT_READY_IPV4	6	/* state is ready for translate4 */
153 #define	NAT64_BIT_STALE		7	/* state is going to be expired */
154 
155 #define	NAT64_FLAG_FIN		(1 << NAT64_BIT_TCP_FIN)
156 #define	NAT64_FLAG_SYN		(1 << NAT64_BIT_TCP_SYN)
157 #define	NAT64_FLAG_ESTAB	(1 << NAT64_BIT_TCP_ESTAB)
158 #define	NAT64_FLAGS_TCP	(NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)
159 
160 #define	NAT64_FLAG_READY	(1 << NAT64_BIT_READY_IPV4)
161 #define	NAT64_FLAG_STALE	(1 << NAT64_BIT_STALE)
162 
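/*
 * For instance, a SYN|ACK segment (TH_SYN|TH_ACK = 0x02|0x10) maps to
 * NAT64_FLAG_SYN|NAT64_FLAG_ESTAB below: TH_SYN already sits on bit 1
 * and TH_ACK >> 2 lands on the ESTAB bit.
 */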
163 static inline uint8_t
164 convert_tcp_flags(uint8_t flags)
165 {
166 	uint8_t result;
167 
168 	result = flags & (TH_FIN|TH_SYN);
169 	result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */
170 	result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */
171 
172 	return (result);
173 }
174 
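/*
 * The pflog header built below is repurposed to carry NAT64 state
 * details: rulenr holds the translated IPv4 source address, and
 * subrulenr packs the allocated port, the protocol and the low byte
 * of the destination address.
 */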
175 static void
176 nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
177     struct nat64lsn_state *state)
178 {
179 
180 	memset(plog, 0, sizeof(*plog));
181 	plog->length = PFLOG_REAL_HDRLEN;
182 	plog->af = family;
183 	plog->action = PF_NAT;
184 	plog->dir = PF_IN;
185 	plog->rulenr = htonl(state->ip_src);
186 	plog->subrulenr = htonl((uint32_t)(state->aport << 16) |
187 	    (state->proto << 8) | (state->ip_dst & 0xff));
188 	plog->ruleset[0] = '\0';
189 	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
190 	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
191 }
192 
193 #define	HVAL(p, n, s)	jenkins_hash32((const uint32_t *)(p), (n), (s))
194 #define	HOST_HVAL(c, a)	HVAL((a),\
195     sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed)
196 #define	HOSTS(c, v)	((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)])
197 
198 #define	ALIASLINK_HVAL(c, f)	HVAL(&(f)->dst_ip6,\
199     sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed)
200 #define	ALIAS_BYHASH(c, v)	\
201     ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)])
202 static struct nat64lsn_aliaslink*
203 nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused,
204     struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused)
205 {
206 
207 	/*
208 	 * Different algorithms can be implemented for selecting
209 	 * an alias address.
210 	 * XXX: for now we use the first available.
211 	 */
212 	return (CK_SLIST_FIRST(&host->aliases));
213 }
214 
215 static struct nat64lsn_alias*
216 nat64lsn_get_alias(struct nat64lsn_cfg *cfg,
217     const struct ipfw_flow_id *f_id __unused)
218 {
219 	static uint32_t idx = 0;
220 
221 	/*
222 	 * We could choose an alias by the number of allocated PGs,
223 	 * one not yet used by other hosts, or one statically
224 	 * configured by the user.
225 	 * XXX: for now we choose it using round robin.
226 	 */
227 	return (&ALIAS_BYHASH(cfg, idx++));
228 }
229 
230 #define	STATE_HVAL(c, d)	HVAL((d), 2, (c)->hash_seed)
231 #define	STATE_HASH(h, v)	\
232     ((h)->states_hash[(v) & ((h)->states_hashsize - 1)])
233 #define	STATES_CHUNK(p, v)	\
234     ((p)->chunks_count == 1 ? (p)->states : \
235 	((p)->states_chunk[CHUNK_BY_FADDR(p, v)]))
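/*
 * The state hash key is two 32-bit words: data[0] holds the remote
 * IPv4 address and data[1] packs (dst_port << 16) | local port; see
 * nat64lsn_translate6() and nat64lsn_alloc_host() for the callers
 * that build it.
 */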
236 
237 #ifdef __LP64__
238 #define	FREEMASK_FFSLL(pg, faddr)		\
239     ffsll(*FREEMASK_CHUNK((pg), (faddr)))
240 #define	FREEMASK_BTR(pg, faddr, bit)	\
241     ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
242 #define	FREEMASK_BTS(pg, faddr, bit)	\
243     ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
244 #define	FREEMASK_ISSET(pg, faddr, bit)	\
245     ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit))
246 #define	FREEMASK_COPY(pg, n, out)	\
247     (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n)))
248 #else
249 static inline int
250 freemask_ffsll(uint32_t *freemask)
251 {
252 	int i;
253 
254 	if ((i = ffsl(freemask[0])) != 0)
255 		return (i);
256 	if ((i = ffsl(freemask[1])) != 0)
257 		return (i + 32);
258 	return (0);
259 }
260 #define	FREEMASK_FFSLL(pg, faddr)		\
261     freemask_ffsll(FREEMASK_CHUNK((pg), (faddr)))
262 #define	FREEMASK_BTR(pg, faddr, bit)	\
263     ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
264 #define	FREEMASK_BTS(pg, faddr, bit)	\
265     ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
266 #define	FREEMASK_ISSET(pg, faddr, bit)	\
267     ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32)
268 #define	FREEMASK_COPY(pg, n, out)	\
269     (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \
270 	((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32)
271 #endif /* !__LP64__ */
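/*
 * Freemask semantics: a set bit marks a free state slot. The fast
 * path uses FREEMASK_FFSLL() to find a candidate and FREEMASK_BTR()
 * to claim it atomically; FREEMASK_BTS() returns the slot on expiry.
 */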
272 
273 
274 #define	NAT64LSN_TRY_PGCNT	36
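/*
 * Pick a PG that still has free states for the given remote address.
 * The scan starts at the last used PG index, examines up to
 * NAT64LSN_TRY_PGCNT PGs (wrapping to index 0 once) and updates the
 * cached index with a best-effort CAS on success.
 */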
275 static struct nat64lsn_pg*
276 nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask,
277     struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, in_addr_t faddr)
278 {
279 	struct nat64lsn_pg *pg;
280 	uint32_t idx, oldidx;
281 	int cnt;
282 
283 	/* First try last used PG. */
284 	idx = oldidx = ck_pr_load_32(pgidx);
285 	MPASS(idx < 1024);
286 	cnt = 0;
287 	do {
288 		ck_pr_fence_load();
289 		if (idx > 1023 || !ISSET32(*chunkmask, idx / 32)) {
290 			/* If it is first try, reset idx to first PG */
291 			idx = 0;
292 			/* Stop if idx is out of range */
293 			if (cnt > 0)
294 				break;
295 		}
296 		if (ISSET32(pgmask[idx / 32], idx % 32)) {
297 			pg = ck_pr_load_ptr(
298 			    &chunks[idx / 32]->pgptr[idx % 32]);
299 			ck_pr_fence_load();
300 			/*
301 			 * Make sure that pg did not become DEAD.
302 			 */
303 			if ((pg->flags & NAT64LSN_DEADPG) == 0 &&
304 			    FREEMASK_BITCOUNT(pg, faddr) > 0) {
305 				if (cnt > 0)
306 					ck_pr_cas_32(pgidx, oldidx, idx);
307 				return (pg);
308 			}
309 		}
310 		idx++;
311 	} while (++cnt < NAT64LSN_TRY_PGCNT);
312 	if (oldidx != idx)
313 		ck_pr_cas_32(pgidx, oldidx, idx);
314 	return (NULL);
315 }
316 
317 static struct nat64lsn_state*
318 nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,
319     const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr,
320     uint16_t port, uint8_t proto)
321 {
322 	struct nat64lsn_aliaslink *link;
323 	struct nat64lsn_state *state;
324 	struct nat64lsn_pg *pg;
325 	int i, offset;
326 
327 	NAT64LSN_EPOCH_ASSERT();
328 
329 	/* Check that we already have state for given arguments */
330 	CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) {
331 		if (state->proto == proto && state->ip_dst == faddr &&
332 		    state->sport == port && state->dport == f_id->dst_port)
333 			return (state);
334 	}
335 
336 	link = nat64lsn_get_aliaslink(cfg, host, f_id);
337 	if (link == NULL)
338 		return (NULL);
339 
340 	switch (proto) {
341 	case IPPROTO_TCP:
342 		pg = nat64lsn_get_pg(&link->alias->tcp_chunkmask,
343 		    link->alias->tcp_pgmask, link->alias->tcp,
344 		    &link->alias->tcp_pgidx, faddr);
345 		break;
346 	case IPPROTO_UDP:
347 		pg = nat64lsn_get_pg(&link->alias->udp_chunkmask,
348 		    link->alias->udp_pgmask, link->alias->udp,
349 		    &link->alias->udp_pgidx, faddr);
350 		break;
351 	case IPPROTO_ICMP:
352 		pg = nat64lsn_get_pg(&link->alias->icmp_chunkmask,
353 		    link->alias->icmp_pgmask, link->alias->icmp,
354 		    &link->alias->icmp_pgidx, faddr);
355 		break;
356 	default:
357 		panic("%s: wrong proto %d", __func__, proto);
358 	}
359 	if (pg == NULL || (pg->flags & NAT64LSN_DEADPG) != 0)
360 		return (NULL);
361 
362 	/* Check that PG has some free states */
363 	state = NULL;
364 	i = FREEMASK_BITCOUNT(pg, faddr);
365 	while (i-- > 0) {
366 		offset = FREEMASK_FFSLL(pg, faddr);
367 		if (offset == 0) {
368 			/*
369 			 * We lost the race.
370 			 * No more free states in this PG.
371 			 */
372 			break;
373 		}
374 
375 		/* Let's try to atomically grab the state */
376 		if (FREEMASK_BTR(pg, faddr, offset - 1)) {
377 			state = &STATES_CHUNK(pg, faddr)->state[offset - 1];
378 			/* Initialize */
379 			state->flags = proto != IPPROTO_TCP ? 0 :
380 			    convert_tcp_flags(f_id->_flags);
381 			state->proto = proto;
382 			state->aport = pg->base_port + offset - 1;
383 			state->dport = f_id->dst_port;
384 			state->sport = port;
385 			state->ip6_dst = f_id->dst_ip6;
386 			state->ip_dst = faddr;
387 			state->ip_src = link->alias->addr;
388 			state->hval = hval;
389 			state->host = host;
390 			SET_AGE(state->timestamp);
391 
392 			/* Insert new state into host's hash table */
393 			HOST_LOCK(host);
394 			SET_AGE(host->timestamp);
395 			CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval),
396 			    state, entries);
397 			host->states_count++;
398 			HOST_UNLOCK(host);
399 			NAT64STAT_INC(&cfg->base.stats, screated);
400 			/* Mark the state as ready for translate4 */
401 			ck_pr_fence_store();
402 			ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4);
403 			break;
404 		}
405 	}
406 	return (state);
407 }
408 
409 /*
410  * Inspects ICMP packets to see if the message contains an inner
411  * packet header, in which case we need to alter @addr and @port.
412  */
413 static int
414 inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr,
415     uint16_t *port)
416 {
417 	struct icmp *icmp;
418 	struct ip *ip;
419 	int off;
420 	uint8_t inner_proto;
421 
422 	ip = mtod(*mp, struct ip *); /* Outer IP header */
423 	off = (ip->ip_hl << 2) + ICMP_MINLEN;
424 	if ((*mp)->m_len < off)
425 		*mp = m_pullup(*mp, off);
426 	if (*mp == NULL)
427 		return (ENOMEM);
428 
429 	ip = mtod(*mp, struct ip *); /* Outer IP header */
430 	icmp = L3HDR(ip, struct icmp *);
431 	switch (icmp->icmp_type) {
432 	case ICMP_ECHO:
433 	case ICMP_ECHOREPLY:
434 		/* Use icmp ID as distinguisher */
435 		*port = ntohs(icmp->icmp_id);
436 		return (0);
437 	case ICMP_UNREACH:
438 	case ICMP_TIMXCEED:
439 		break;
440 	default:
441 		return (EOPNOTSUPP);
442 	}
443 	/*
444 	 * ICMP_UNREACH and ICMP_TIMXCEED contain the IP header + 64 bits
445 	 * of ULP header.
446 	 */
447 	if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
448 		return (EINVAL);
449 	if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
450 		*mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN);
451 	if (*mp == NULL)
452 		return (ENOMEM);
453 	ip = mtodo(*mp, off); /* Inner IP header */
454 	inner_proto = ip->ip_p;
455 	off += ip->ip_hl << 2; /* Skip inner IP header */
456 	*addr = ntohl(ip->ip_src.s_addr);
457 	if ((*mp)->m_len < off + ICMP_MINLEN)
458 		*mp = m_pullup(*mp, off + ICMP_MINLEN);
459 	if (*mp == NULL)
460 		return (ENOMEM);
461 	switch (inner_proto) {
462 	case IPPROTO_TCP:
463 	case IPPROTO_UDP:
464 		/* Copy source port from the header */
465 		*port = ntohs(*((uint16_t *)mtodo(*mp, off)));
466 		*proto = inner_proto;
467 		return (0);
468 	case IPPROTO_ICMP:
469 		/*
470 		 * We will translate only ICMP errors for our ICMP
471 		 * echo requests.
472 		 */
473 		icmp = mtodo(*mp, off);
474 		if (icmp->icmp_type != ICMP_ECHO)
475 			return (EOPNOTSUPP);
476 		*port = ntohs(icmp->icmp_id);
477 		return (0);
478 	}
479 	return (EOPNOTSUPP);
480 }
481 
482 static struct nat64lsn_state*
483 nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias,
484     in_addr_t faddr, uint16_t port, uint8_t proto)
485 {
486 	struct nat64lsn_state *state;
487 	struct nat64lsn_pg *pg;
488 	int chunk_idx, pg_idx, state_idx;
489 
490 	NAT64LSN_EPOCH_ASSERT();
491 
492 	if (port < NAT64_MIN_PORT)
493 		return (NULL);
494 	/*
495 	 * Alias keeps 32 pgchunks for each protocol.
496 	 * Each pgchunk has 32 pointers to portgroup.
497 	 * Each portgroup has 64 states for ports.
498 	 */
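	/*
	 * Worked example, assuming NAT64_MIN_PORT is 1024: for TCP
	 * port 10000, port - 1024 = 8976, so chunk_idx = 8976 / 2048
	 * = 4; the remaining offset 8976 - 4 * 2048 = 784 gives
	 * pg_idx = 784 / 64 = 12 and state_idx = 784 % 64 = 16.
	 */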
499 	port -= NAT64_MIN_PORT;
500 	chunk_idx = port / 2048;
501 
502 	port -= chunk_idx * 2048;
503 	pg_idx = port / 64;
504 	state_idx = port % 64;
505 
506 	/*
507 	 * First check in proto_chunkmask that we have an allocated PG chunk.
508 	 * Then check in proto_pgmask that we have a valid PG pointer.
509 	 */
510 	pg = NULL;
511 	switch (proto) {
512 	case IPPROTO_TCP:
513 		if (ISSET32(alias->tcp_chunkmask, chunk_idx) &&
514 		    ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) {
515 			pg = alias->tcp[chunk_idx]->pgptr[pg_idx];
516 			break;
517 		}
518 		return (NULL);
519 	case IPPROTO_UDP:
520 		if (ISSET32(alias->udp_chunkmask, chunk_idx) &&
521 		    ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) {
522 			pg = alias->udp[chunk_idx]->pgptr[pg_idx];
523 			break;
524 		}
525 		return (NULL);
526 	case IPPROTO_ICMP:
527 		if (ISSET32(alias->icmp_chunkmask, chunk_idx) &&
528 		    ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) {
529 			pg = alias->icmp[chunk_idx]->pgptr[pg_idx];
530 			break;
531 		}
532 		return (NULL);
533 	default:
534 		panic("%s: wrong proto %d", __func__, proto);
535 	}
536 	if (pg == NULL)
537 		return (NULL);
538 
539 	if (FREEMASK_ISSET(pg, faddr, state_idx))
540 		return (NULL);
541 
542 	state = &STATES_CHUNK(pg, faddr)->state[state_idx];
543 	ck_pr_fence_load();
544 	if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY)
545 		return (state);
546 	return (NULL);
547 }
548 
549 /*
550  * Reassemble IPv4 fragments, do a pullup if needed, and get some ULP
551  * fields that might be unknown until reassembly is completed.
552  */
553 static struct mbuf*
554 nat64lsn_reassemble4(struct nat64lsn_cfg *cfg, struct mbuf *m,
555     uint16_t *port)
556 {
557 	struct ip *ip;
558 	int len;
559 
560 	m = ip_reass(m);
561 	if (m == NULL)
562 		return (NULL);
563 	/* IP header must be contiguous after ip_reass() */
564 	ip = mtod(m, struct ip *);
565 	len = ip->ip_hl << 2;
566 	switch (ip->ip_p) {
567 	case IPPROTO_ICMP:
568 		len += ICMP_MINLEN;
569 		break;
570 	case IPPROTO_TCP:
571 		len += sizeof(struct tcphdr);
572 		break;
573 	case IPPROTO_UDP:
574 		len += sizeof(struct udphdr);
575 		break;
576 	default:
577 		m_freem(m);
578 		NAT64STAT_INC(&cfg->base.stats, noproto);
579 		return (NULL);
580 	}
581 	if (m->m_len < len) {
582 		m = m_pullup(m, len);
583 		if (m == NULL) {
584 			NAT64STAT_INC(&cfg->base.stats, nomem);
585 			return (NULL);
586 		}
587 		ip = mtod(m, struct ip *);
588 	}
589 	switch (ip->ip_p) {
590 	case IPPROTO_TCP:
591 		*port = ntohs(L3HDR(ip, struct tcphdr *)->th_dport);
592 		break;
593 	case IPPROTO_UDP:
594 		*port = ntohs(L3HDR(ip, struct udphdr *)->uh_dport);
595 		break;
596 	}
597 	return (m);
598 }
599 
600 static int
601 nat64lsn_translate4(struct nat64lsn_cfg *cfg,
602     const struct ipfw_flow_id *f_id, struct mbuf **mp)
603 {
604 	struct pfloghdr loghdr, *logdata;
605 	struct in6_addr src6;
606 	struct nat64lsn_state *state;
607 	struct nat64lsn_alias *alias;
608 	uint32_t addr, flags;
609 	uint16_t port, ts;
610 	int ret;
611 	uint8_t proto;
612 
613 	addr = f_id->dst_ip;
614 	port = f_id->dst_port;
615 	proto = f_id->proto;
616 	if (addr < cfg->prefix4 || addr > cfg->pmask4) {
617 		NAT64STAT_INC(&cfg->base.stats, nomatch4);
618 		return (cfg->nomatch_verdict);
619 	}
620 
621 	/* Reassemble fragments if needed */
622 	ret = ntohs(mtod(*mp, struct ip *)->ip_off);
623 	if ((ret & (IP_MF | IP_OFFMASK)) != 0) {
624 		*mp = nat64lsn_reassemble4(cfg, *mp, &port);
625 		if (*mp == NULL)
626 			return (IP_FW_DENY);
627 	}
628 
629 	/* Check if protocol is supported */
630 	switch (proto) {
631 	case IPPROTO_ICMP:
632 		ret = inspect_icmp_mbuf(mp, &proto, &addr, &port);
633 		if (ret != 0) {
634 			if (ret == ENOMEM) {
635 				NAT64STAT_INC(&cfg->base.stats, nomem);
636 				return (IP_FW_DENY);
637 			}
638 			NAT64STAT_INC(&cfg->base.stats, noproto);
639 			return (cfg->nomatch_verdict);
640 		}
641 		if (addr < cfg->prefix4 || addr > cfg->pmask4) {
642 			NAT64STAT_INC(&cfg->base.stats, nomatch4);
643 			return (cfg->nomatch_verdict);
644 		}
645 		/* FALLTHROUGH */
646 	case IPPROTO_TCP:
647 	case IPPROTO_UDP:
648 		break;
649 	default:
650 		NAT64STAT_INC(&cfg->base.stats, noproto);
651 		return (cfg->nomatch_verdict);
652 	}
653 
654 	alias = &ALIAS_BYHASH(cfg, addr);
655 	MPASS(addr == alias->addr);
656 
657 	/* Check that we have state for this port */
658 	state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip,
659 	    port, proto);
660 	if (state == NULL) {
661 		NAT64STAT_INC(&cfg->base.stats, nomatch4);
662 		return (cfg->nomatch_verdict);
663 	}
664 
665 	/* TODO: Check flags to see if we need to do some static mapping */
666 
667 	/* Update some state fields if needed */
668 	SET_AGE(ts);
669 	if (f_id->proto == IPPROTO_TCP)
670 		flags = convert_tcp_flags(f_id->_flags);
671 	else
672 		flags = 0;
673 	if (state->timestamp != ts)
674 		state->timestamp = ts;
675 	if ((state->flags & flags) != flags)
676 		state->flags |= flags;
677 
678 	port = htons(state->sport);
679 	src6 = state->ip6_dst;
680 
681 	if (cfg->base.flags & NAT64_LOG) {
682 		logdata = &loghdr;
683 		nat64lsn_log(logdata, *mp, AF_INET, state);
684 	} else
685 		logdata = NULL;
686 
687 	/*
688 	 * We already have src6 with an embedded address, but it is
689 	 * possible that src_ip differs from state->ip_dst; this is
690 	 * why we do the embedding again.
691 	 */
692 	nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip));
693 	ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port,
694 	    &cfg->base, logdata);
695 	if (ret == NAT64SKIP)
696 		return (cfg->nomatch_verdict);
697 	if (ret == NAT64RETURN)
698 		*mp = NULL;
699 	return (IP_FW_DENY);
700 }
701 
702 /*
703  * Check if particular state is stale and should be deleted.
704  * Return 1 if true, 0 otherwise.
705  */
706 static int
707 nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state)
708 {
709 	int age, ttl;
710 
711 	/* State was marked as stale in previous pass. */
712 	if (ISSET32(state->flags, NAT64_BIT_STALE))
713 		return (1);
714 
715 	/* State is not yet initialized; it is about to become READY */
716 	if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4))
717 		return (0);
718 
719 	age = GET_AGE(state->timestamp);
720 	switch (state->proto) {
721 	case IPPROTO_TCP:
722 		if (ISSET32(state->flags, NAT64_BIT_TCP_FIN))
723 			ttl = cfg->st_close_ttl;
724 		else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB))
725 			ttl = cfg->st_estab_ttl;
726 		else if (ISSET32(state->flags, NAT64_BIT_TCP_SYN))
727 			ttl = cfg->st_syn_ttl;
728 		else
729 			ttl = cfg->st_syn_ttl;
730 		if (age > ttl)
731 			return (1);
732 		break;
733 	case IPPROTO_UDP:
734 		if (age > cfg->st_udp_ttl)
735 			return (1);
736 		break;
737 	case IPPROTO_ICMP:
738 		if (age > cfg->st_icmp_ttl)
739 			return (1);
740 		break;
741 	}
742 	return (0);
743 }
744 
745 #define	PGCOUNT_ADD(alias, proto, value)			\
746     switch (proto) {						\
747     case IPPROTO_TCP: (alias)->tcp_pgcount += (value); break;	\
748     case IPPROTO_UDP: (alias)->udp_pgcount += (value); break;	\
749     case IPPROTO_ICMP: (alias)->icmp_pgcount += (value); break;	\
750     }
751 #define	PGCOUNT_INC(alias, proto)	PGCOUNT_ADD(alias, proto, 1)
752 #define	PGCOUNT_DEC(alias, proto)	PGCOUNT_ADD(alias, proto, -1)
753 
754 static inline void
755 nat64lsn_state_cleanup(struct nat64lsn_state *state)
756 {
757 
758 	/*
759 	 * Reset the READY flag and wait until it becomes
760 	 * safe for translate4.
761 	 */
762 	ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4);
763 	/*
764 	 * And set STALE flag for deferred deletion in the
765 	 * next pass of nat64lsn_maintain_pg().
766 	 */
767 	ck_pr_bts_32(&state->flags, NAT64_BIT_STALE);
768 	ck_pr_fence_store();
769 }
770 
771 static int
772 nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg)
773 {
774 	struct nat64lsn_state *state;
775 	struct nat64lsn_host *host;
776 	uint64_t freemask;
777 	int c, i, update_age;
778 
779 	update_age = 0;
780 	for (c = 0; c < pg->chunks_count; c++) {
781 		FREEMASK_COPY(pg, c, freemask);
782 		for (i = 0; i < 64; i++) {
783 			if (ISSET64(freemask, i))
784 				continue;
785 			state = &STATES_CHUNK(pg, c)->state[i];
786 			if (nat64lsn_check_state(cfg, state) == 0) {
787 				update_age = 1;
788 				continue;
789 			}
790 			/*
791 			 * Expire state:
792 			 * 1. Mark as STALE and unlink from host's hash.
793 			 * 2. Set bit in freemask.
794 			 */
795 			if (ISSET32(state->flags, NAT64_BIT_STALE)) {
796 				/*
797 				 * State was marked as STALE in previous
798 				 * pass. Now it is safe to release it.
799 				 */
800 				state->flags = 0;
801 				ck_pr_fence_store();
802 				FREEMASK_BTS(pg, c, i);
803 				NAT64STAT_INC(&cfg->base.stats, sdeleted);
804 				continue;
805 			}
806 			MPASS(state->flags & NAT64_FLAG_READY);
807 
808 			host = state->host;
809 			HOST_LOCK(host);
810 			CK_SLIST_REMOVE(&STATE_HASH(host, state->hval),
811 			    state, nat64lsn_state, entries);
812 			/*
813 			 * Now translate6 will not use this state.
814 			 */
815 			host->states_count--;
816 			HOST_UNLOCK(host);
817 			nat64lsn_state_cleanup(state);
818 		}
819 	}
820 
821 	/*
822 	 * We still have some live states, update the timestamp.
823 	 */
824 	if (update_age)
825 		SET_AGE(pg->timestamp);
826 
827 	if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
828 		return (0);
829 
830 	return (1);
831 }
832 
833 static void
834 nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg,
835     struct nat64lsn_pg_slist *portgroups)
836 {
837 	struct nat64lsn_alias *alias;
838 	struct nat64lsn_pg *pg, *tpg;
839 	uint32_t *pgmask, *pgidx;
840 	int i, idx;
841 
842 	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
843 		alias = &cfg->aliases[i];
844 		CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) {
845 			if (nat64lsn_maintain_pg(cfg, pg) == 0)
846 				continue;
847 			/* Always keep first PG */
848 			if (pg->base_port == NAT64_MIN_PORT)
849 				continue;
850 			/*
851 			 * PG expires in two passes:
852 			 * 1. Reset bit in pgmask, mark it as DEAD.
853 			 * 2. Unlink it and schedule for deferred destroying.
854 			 */
855 			idx = (pg->base_port - NAT64_MIN_PORT) / 64;
856 			switch (pg->proto) {
857 			case IPPROTO_TCP:
858 				pgmask = alias->tcp_pgmask;
859 				pgidx = &alias->tcp_pgidx;
860 				break;
861 			case IPPROTO_UDP:
862 				pgmask = alias->udp_pgmask;
863 				pgidx = &alias->udp_pgidx;
864 				break;
865 			case IPPROTO_ICMP:
866 				pgmask = alias->icmp_pgmask;
867 				pgidx = &alias->icmp_pgidx;
868 				break;
869 			}
870 			if (pg->flags & NAT64LSN_DEADPG) {
871 				/* Unlink PG from alias's chain */
872 				ALIAS_LOCK(alias);
873 				CK_SLIST_REMOVE(&alias->portgroups, pg,
874 				    nat64lsn_pg, entries);
875 				PGCOUNT_DEC(alias, pg->proto);
876 				ALIAS_UNLOCK(alias);
877 				/*
878 				 * Link it to job's chain for deferred
879 				 * destroying.
880 				 */
881 				NAT64STAT_INC(&cfg->base.stats, spgdeleted);
882 				CK_SLIST_INSERT_HEAD(portgroups, pg, entries);
883 				continue;
884 			}
885 
886 			/* Reset the corresponding bit in pgmask array. */
887 			ck_pr_btr_32(&pgmask[idx / 32], idx % 32);
888 			pg->flags |= NAT64LSN_DEADPG;
889 			ck_pr_fence_store();
890 			/* If last used PG points to this PG, reset it. */
891 			ck_pr_cas_32(pgidx, idx, 0);
892 		}
893 	}
894 }
895 
896 static void
897 nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg,
898     struct nat64lsn_hosts_slist *hosts)
899 {
900 	struct nat64lsn_host *host, *tmp;
901 	int i;
902 
903 	for (i = 0; i < cfg->hosts_hashsize; i++) {
904 		CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i],
905 		    entries, tmp) {
906 			/* Was the host marked in a previous call? */
907 			if (host->flags & NAT64LSN_DEADHOST) {
908 				if (host->states_count > 0 ||
909 				    GET_AGE(host->timestamp) <
910 				    cfg->host_delete_delay) {
911 					host->flags &= ~NAT64LSN_DEADHOST;
912 					continue;
913 				}
914 				/*
915 				 * Unlink host from hash table and schedule
916 				 * it for deferred destroying.
917 				 */
918 				CFG_LOCK(cfg);
919 				CK_SLIST_REMOVE(&cfg->hosts_hash[i], host,
920 				    nat64lsn_host, entries);
921 				cfg->hosts_count--;
922 				CFG_UNLOCK(cfg);
923 				CK_SLIST_INSERT_HEAD(hosts, host, entries);
924 				continue;
925 			}
926 			if (host->states_count > 0 ||
927 			    GET_AGE(host->timestamp) < cfg->host_delete_delay)
928 				continue;
929 			/* Mark host as going to be expired in next pass */
930 			host->flags |= NAT64LSN_DEADHOST;
931 			ck_pr_fence_store();
932 		}
933 	}
934 }
935 
936 static struct nat64lsn_pgchunk*
937 nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg)
938 {
939 #if 0
940 	struct nat64lsn_alias *alias;
941 	struct nat64lsn_pgchunk *chunk;
942 	uint32_t pgmask;
943 	int i, c;
944 
945 	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
946 		alias = &cfg->aliases[i];
947 		if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay)
948 			continue;
949 		/* Always keep single chunk allocated */
950 		for (c = 1; c < 32; c++) {
951 			if ((alias->tcp_chunkmask & (1 << c)) == 0)
952 				break;
953 			chunk = ck_pr_load_ptr(&alias->tcp[c]);
954 			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
955 				continue;
956 			ck_pr_btr_32(&alias->tcp_chunkmask, c);
957 			ck_pr_fence_load();
958 			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
959 				continue;
960 		}
961 	}
962 #endif
963 	return (NULL);
964 }
965 
966 #if 0
967 static void
968 nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg)
969 {
970 	struct nat64lsn_host *h;
971 	struct nat64lsn_states_slist *hash;
972 	int i, j, hsize;
973 
974 	for (i = 0; i < cfg->hosts_hashsize; i++) {
975 		CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) {
976 			 if (h->states_count / 2 < h->states_hashsize ||
977 			     h->states_hashsize >= NAT64LSN_MAX_HSIZE)
978 				 continue;
979 			 hsize = h->states_hashsize * 2;
980 			 hash = malloc(sizeof(*hash) * hsize, M_NAT64LSN, M_NOWAIT);
981 			 if (hash == NULL)
982 				 continue;
983 			 for (j = 0; j < hsize; j++)
984 				CK_SLIST_INIT(&hash[j]);
985 
986 			 ck_pr_bts_32(&h->flags, NAT64LSN_GROWHASH);
987 		}
988 	}
989 }
990 #endif
991 
992 /*
993  * This procedure is used to perform various maintenance
994  * on the dynamic hash list. Currently it is called every 4 seconds.
995  */
996 static void
997 nat64lsn_periodic(void *data)
998 {
999 	struct nat64lsn_job_item *ji;
1000 	struct nat64lsn_cfg *cfg;
1001 
1002 	cfg = (struct nat64lsn_cfg *) data;
1003 	CURVNET_SET(cfg->vp);
1004 	if (cfg->hosts_count > 0) {
1005 		ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);
1006 		if (ji != NULL) {
1007 			ji->jtype = JTYPE_DESTROY;
1008 			CK_SLIST_INIT(&ji->hosts);
1009 			CK_SLIST_INIT(&ji->portgroups);
1010 			nat64lsn_expire_hosts(cfg, &ji->hosts);
1011 			nat64lsn_expire_portgroups(cfg, &ji->portgroups);
1012 			ji->pgchunk = nat64lsn_expire_pgchunk(cfg);
1013 			NAT64LSN_EPOCH_CALL(&ji->epoch_ctx,
1014 			    nat64lsn_job_destroy);
1015 		} else
1016 			NAT64STAT_INC(&cfg->base.stats, jnomem);
1017 	}
1018 	callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
1019 	CURVNET_RESTORE();
1020 }
1021 
1022 #define	ALLOC_ERROR(stage, type)	((stage) ? 10 * (type) + (stage): 0)
1023 #define	HOST_ERROR(stage)		ALLOC_ERROR(stage, 1)
1024 #define	PG_ERROR(stage)			ALLOC_ERROR(stage, 2)
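/*
 * The encoding makes failure stages self-describing: HOST_ERROR(2)
 * yields 12 and PG_ERROR(3) yields 23, while stage 0 always encodes
 * to 0, so HOST_ERROR(0)/PG_ERROR(0) both mean success.
 */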
1025 static int
1026 nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1027 {
1028 	char a[INET6_ADDRSTRLEN];
1029 	struct nat64lsn_aliaslink *link;
1030 	struct nat64lsn_host *host;
1031 	struct nat64lsn_state *state;
1032 	uint32_t hval, data[2];
1033 	int i;
1034 
1035 	/* Check that host was not yet added. */
1036 	NAT64LSN_EPOCH_ASSERT();
1037 	CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) {
1038 		if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) {
1039 			/* The host was allocated in previous call. */
1040 			ji->host = host;
1041 			goto get_state;
1042 		}
1043 	}
1044 
1045 	host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
1046 	if (ji->host == NULL)
1047 		return (HOST_ERROR(1));
1048 
1049 	host->states_hashsize = NAT64LSN_HSIZE;
1050 	host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) *
1051 	    host->states_hashsize, M_NAT64LSN, M_NOWAIT);
1052 	if (host->states_hash == NULL) {
1053 		uma_zfree(nat64lsn_host_zone, host);
1054 		return (HOST_ERROR(2));
1055 	}
1056 
1057 	link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT);
1058 	if (link == NULL) {
1059 		free(host->states_hash, M_NAT64LSN);
1060 		uma_zfree(nat64lsn_host_zone, host);
1061 		return (HOST_ERROR(3));
1062 	}
1063 
1064 	/* Initialize */
1065 	HOST_LOCK_INIT(host);
1066 	SET_AGE(host->timestamp);
1067 	host->addr = ji->f_id.src_ip6;
1068 	host->hval = ji->src6_hval;
1069 	host->flags = 0;
1070 	host->states_count = 0;
1071 	CK_SLIST_INIT(&host->aliases);
1072 	for (i = 0; i < host->states_hashsize; i++)
1073 		CK_SLIST_INIT(&host->states_hash[i]);
1074 
1075 	link->alias = nat64lsn_get_alias(cfg, &ji->f_id);
1076 	CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries);
1077 
1078 	ALIAS_LOCK(link->alias);
1079 	CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries);
1080 	link->alias->hosts_count++;
1081 	ALIAS_UNLOCK(link->alias);
1082 
1083 	CFG_LOCK(cfg);
1084 	CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries);
1085 	cfg->hosts_count++;
1086 	CFG_UNLOCK(cfg);
1087 
1088 get_state:
1089 	data[0] = ji->faddr;
1090 	data[1] = (ji->f_id.dst_port << 16) | ji->port;
1091 	ji->state_hval = hval = STATE_HVAL(cfg, data);
1092 	state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval,
1093 	    ji->faddr, ji->port, ji->proto);
1094 	/*
1095 	 * We failed to obtain a new state; the alias in use needs a new PG.
1096 	 * XXX: or another alias should be used.
1097 	 */
1098 	if (state == NULL) {
1099 		/* Try to allocate new PG */
1100 		if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
1101 			return (HOST_ERROR(4));
1102 		/* We assume that nat64lsn_alloc_pg() got state */
1103 	} else
1104 		ji->state = state;
1105 
1106 	ji->done = 1;
1107 	DPRINTF(DP_OBJ, "ALLOC HOST %s %p",
1108 	    inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host);
1109 	return (HOST_ERROR(0));
1110 }
1111 
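/*
 * Find the first clear bit across the 32-word pgmask array (1024 PG
 * slots), or -1 if all are taken. E.g. with data[0] == 0xffffffff
 * and data[1] == 0x7, the next free slot is 32 + 3 = 35.
 */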
1112 static int
1113 nat64lsn_find_pg_place(uint32_t *data)
1114 {
1115 	int i;
1116 
1117 	for (i = 0; i < 32; i++) {
1118 		if (~data[i] == 0)
1119 			continue;
1120 		return (i * 32 + ffs(~data[i]) - 1);
1121 	}
1122 	return (-1);
1123 }
1124 
1125 static int
1126 nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg,
1127     struct nat64lsn_alias *alias, uint32_t *chunkmask, uint32_t *pgmask,
1128     struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, uint8_t proto)
1129 {
1130 	struct nat64lsn_pg *pg;
1131 	int i, pg_idx, chunk_idx;
1132 
1133 	/* Find place in pgchunk where PG can be added */
1134 	pg_idx = nat64lsn_find_pg_place(pgmask);
1135 	if (pg_idx < 0)	/* no more PGs */
1136 		return (PG_ERROR(1));
1137 	/* Check that we have allocated pgchunk for given PG index */
1138 	chunk_idx = pg_idx / 32;
1139 	if (!ISSET32(*chunkmask, chunk_idx)) {
1140 		chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone,
1141 		    M_NOWAIT);
1142 		if (chunks[chunk_idx] == NULL)
1143 			return (PG_ERROR(2));
1144 		ck_pr_bts_32(chunkmask, chunk_idx);
1145 		ck_pr_fence_store();
1146 	}
1147 	/* Allocate PG and states chunks */
1148 	pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
1149 	if (pg == NULL)
1150 		return (PG_ERROR(3));
1151 	pg->chunks_count = cfg->states_chunks;
1152 	if (pg->chunks_count > 1) {
1153 		pg->freemask_chunk = malloc(pg->chunks_count *
1154 		    sizeof(uint64_t), M_NAT64LSN, M_NOWAIT);
1155 		if (pg->freemask_chunk == NULL) {
1156 			uma_zfree(nat64lsn_pg_zone, pg);
1157 			return (PG_ERROR(4));
1158 		}
1159 		pg->states_chunk = malloc(pg->chunks_count *
1160 		    sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN,
1161 		    M_NOWAIT | M_ZERO);
1162 		if (pg->states_chunk == NULL) {
1163 			free(pg->freemask_chunk, M_NAT64LSN);
1164 			uma_zfree(nat64lsn_pg_zone, pg);
1165 			return (PG_ERROR(5));
1166 		}
1167 		for (i = 0; i < pg->chunks_count; i++) {
1168 			pg->states_chunk[i] = uma_zalloc(
1169 			    nat64lsn_state_zone, M_NOWAIT);
1170 			if (pg->states_chunk[i] == NULL)
1171 				goto states_failed;
1172 		}
1173 		memset(pg->freemask_chunk, 0xff,
1174 		    sizeof(uint64_t) * pg->chunks_count);
1175 	} else {
1176 		pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT);
1177 		if (pg->states == NULL) {
1178 			uma_zfree(nat64lsn_pg_zone, pg);
1179 			return (PG_ERROR(6));
1180 		}
1181 		memset(&pg->freemask64, 0xff, sizeof(uint64_t));
1182 	}
1183 
1184 	/* Initialize PG and hook it to pgchunk */
1185 	SET_AGE(pg->timestamp);
1186 	pg->flags = 0;
1187 	pg->proto = proto;
1188 	pg->base_port = NAT64_MIN_PORT + 64 * pg_idx;
1189 	ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg);
1190 	ck_pr_fence_store();
1191 
1192 	/* Set bit in pgmask and set index of last used PG */
1193 	ck_pr_bts_32(&pgmask[chunk_idx], pg_idx % 32);
1194 	ck_pr_store_32(pgidx, pg_idx);
1195 
1196 	ALIAS_LOCK(alias);
1197 	CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries);
1198 	SET_AGE(alias->timestamp);
1199 	PGCOUNT_INC(alias, proto);
1200 	ALIAS_UNLOCK(alias);
1201 	NAT64STAT_INC(&cfg->base.stats, spgcreated);
1202 	return (PG_ERROR(0));
1203 
1204 states_failed:
1205 	for (i = 0; i < pg->chunks_count; i++)
1206 		uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);
1207 	free(pg->freemask_chunk, M_NAT64LSN);
1208 	free(pg->states_chunk, M_NAT64LSN);
1209 	uma_zfree(nat64lsn_pg_zone, pg);
1210 	return (PG_ERROR(7));
1211 }
1212 
1213 static int
1214 nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1215 {
1216 	struct nat64lsn_aliaslink *link;
1217 	struct nat64lsn_alias *alias;
1218 	int ret;
1219 
1220 	link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id);
1221 	if (link == NULL)
1222 		return (PG_ERROR(1));
1223 
1224 	/*
1225 	 * TODO: check that we did not already allocate a PG in
1226 	 *	 a previous call.
1227 	 */
1228 
1229 	ret = 0;
1230 	alias = link->alias;
1231 	/* Find place in pgchunk where PG can be added */
1232 	switch (ji->proto) {
1233 	case IPPROTO_TCP:
1234 		ret = nat64lsn_alloc_proto_pg(cfg, alias,
1235 		    &alias->tcp_chunkmask, alias->tcp_pgmask,
1236 		    alias->tcp, &alias->tcp_pgidx, ji->proto);
1237 		break;
1238 	case IPPROTO_UDP:
1239 		ret = nat64lsn_alloc_proto_pg(cfg, alias,
1240 		    &alias->udp_chunkmask, alias->udp_pgmask,
1241 		    alias->udp, &alias->udp_pgidx, ji->proto);
1242 		break;
1243 	case IPPROTO_ICMP:
1244 		ret = nat64lsn_alloc_proto_pg(cfg, alias,
1245 		    &alias->icmp_chunkmask, alias->icmp_pgmask,
1246 		    alias->icmp, &alias->icmp_pgidx, ji->proto);
1247 		break;
1248 	default:
1249 		panic("%s: wrong proto %d", __func__, ji->proto);
1250 	}
1251 	if (ret == PG_ERROR(1)) {
1252 		/*
1253 		 * PG_ERROR(1) means that alias lacks free PGs
1254 		 * XXX: try next alias.
1255 		 */
1256 		printf("NAT64LSN: %s: failed to obtain PG\n",
1257 		    __func__);
1258 		return (ret);
1259 	}
1260 	if (ret == PG_ERROR(0)) {
1261 		ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id,
1262 		    ji->state_hval, ji->faddr, ji->port, ji->proto);
1263 		if (ji->state == NULL)
1264 			ret = PG_ERROR(8);
1265 		else
1266 			ji->done = 1;
1267 	}
1268 	return (ret);
1269 }
1270 
1271 static void
1272 nat64lsn_do_request(void *data)
1273 {
1274 	struct epoch_tracker et;
1275 	struct nat64lsn_job_head jhead;
1276 	struct nat64lsn_job_item *ji, *ji2;
1277 	struct nat64lsn_cfg *cfg;
1278 	int jcount;
1279 	uint8_t flags;
1280 
1281 	cfg = (struct nat64lsn_cfg *)data;
1282 	if (cfg->jlen == 0)
1283 		return;
1284 
1285 	CURVNET_SET(cfg->vp);
1286 	STAILQ_INIT(&jhead);
1287 
1288 	/* Grab queue */
1289 	JQUEUE_LOCK();
1290 	STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item);
1291 	jcount = cfg->jlen;
1292 	cfg->jlen = 0;
1293 	JQUEUE_UNLOCK();
1294 
1295 	/* TODO: check if we need to resize hash */
1296 
1297 	NAT64STAT_INC(&cfg->base.stats, jcalls);
1298 	DPRINTF(DP_JQUEUE, "count=%d", jcount);
1299 
1300 	/*
1301 	 * TODO:
1302 	 * What we should do here is to build a hash
1303 	 * to ensure we don't have lots of duplicate requests.
1304 	 * Skip this for now.
1305 	 *
1306 	 * TODO: Limit per-call number of items
1307 	 */
1308 
1309 	NAT64LSN_EPOCH_ENTER(et);
1310 	STAILQ_FOREACH(ji, &jhead, entries) {
1311 		switch (ji->jtype) {
1312 		case JTYPE_NEWHOST:
1313 			if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0))
1314 				NAT64STAT_INC(&cfg->base.stats, jhostfails);
1315 			break;
1316 		case JTYPE_NEWPORTGROUP:
1317 			if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
1318 				NAT64STAT_INC(&cfg->base.stats, jportfails);
1319 			break;
1320 		default:
1321 			continue;
1322 		}
1323 		if (ji->done != 0) {
1324 			flags = ji->proto != IPPROTO_TCP ? 0 :
1325 			    convert_tcp_flags(ji->f_id._flags);
1326 			nat64lsn_translate6_internal(cfg, &ji->m,
1327 			    ji->state, flags);
1328 			NAT64STAT_INC(&cfg->base.stats, jreinjected);
1329 		}
1330 	}
1331 	NAT64LSN_EPOCH_EXIT(et);
1332 
1333 	ji = STAILQ_FIRST(&jhead);
1334 	while (ji != NULL) {
1335 		ji2 = STAILQ_NEXT(ji, entries);
1336 		/*
1337 		 * In any case we must free mbuf if
1338 		 * translator did not consume it.
1339 		 */
1340 		m_freem(ji->m);
1341 		uma_zfree(nat64lsn_job_zone, ji);
1342 		ji = ji2;
1343 	}
1344 	CURVNET_RESTORE();
1345 }
1346 
1347 static struct nat64lsn_job_item *
1348 nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype)
1349 {
1350 	struct nat64lsn_job_item *ji;
1351 
1352 	/*
1353 	 * Do not try to lock a possibly contested mutex if we're near the
1354 	 * limit. Drop packet instead.
1355 	 */
1356 	ji = NULL;
1357 	if (cfg->jlen >= cfg->jmaxlen)
1358 		NAT64STAT_INC(&cfg->base.stats, jmaxlen);
1359 	else {
1360 		ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);
1361 		if (ji == NULL)
1362 			NAT64STAT_INC(&cfg->base.stats, jnomem);
1363 	}
1364 	if (ji == NULL) {
1365 		NAT64STAT_INC(&cfg->base.stats, dropped);
1366 		DPRINTF(DP_DROPS, "failed to create job");
1367 	} else {
1368 		ji->jtype = jtype;
1369 		ji->done = 0;
1370 	}
1371 	return (ji);
1372 }
1373 
1374 static void
1375 nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1376 {
1377 
1378 	JQUEUE_LOCK();
1379 	STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries);
1380 	NAT64STAT_INC(&cfg->base.stats, jrequests);
1381 	cfg->jlen++;
1382 
1383 	if (callout_pending(&cfg->jcallout) == 0)
1384 		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
1385 	JQUEUE_UNLOCK();
1386 }
1387 
1388 /*
1389  * This function is used to clean up after an unlikely race
1390  * condition, when the host object was deleted but some translation
1391  * state was created before it was destroyed.
1392  *
1393  * Since state expiration removes states from the host's hash table,
1394  * we need to be sure that no states remain linked with this
1395  * host entry.
1396  */
1397 static void
1398 nat64lsn_host_cleanup(struct nat64lsn_host *host)
1399 {
1400 	struct nat64lsn_state *state, *ts;
1401 	int i;
1402 
1403 	printf("NAT64LSN: %s: race condition has been detected for host %p\n",
1404 	    __func__, host);
1405 	for (i = 0; i < host->states_hashsize; i++) {
1406 		CK_SLIST_FOREACH_SAFE(state, &host->states_hash[i],
1407 		    entries, ts) {
1408 			/*
1409 			 * We can remove the state without a lock,
1410 			 * because this host entry is unlinked and will
1411 			 * be destroyed.
1412 			 */
1413 			CK_SLIST_REMOVE(&host->states_hash[i], state,
1414 			    nat64lsn_state, entries);
1415 			host->states_count--;
1416 			nat64lsn_state_cleanup(state);
1417 		}
1418 	}
1419 	MPASS(host->states_count == 0);
1420 }
1421 
1422 /*
1423  * This function is used to clean up after an unlikely race
1424  * condition, when a portgroup was deleted but some translation state
1425  * was created before it was destroyed.
1426  *
1427  * Since state entries are accessible via the host's hash table, we
1428  * need to be sure that no states from this PG remain linked with
1429  * any host entries.
1430  */
1431 static void
1432 nat64lsn_pg_cleanup(struct nat64lsn_pg *pg)
1433 {
1434 	struct nat64lsn_state *state;
1435 	uint64_t usedmask;
1436 	int c, i;
1437 
1438 	printf("NAT64LSN: %s: race condition has been detected for pg %p\n",
1439 	    __func__, pg);
1440 	for (c = 0; c < pg->chunks_count; c++) {
1441 		/*
1442 		 * Use the inverted freemask to find which states were created.
1443 		 */
1444 		usedmask = ~(*FREEMASK_CHUNK(pg, c));
1445 		if (usedmask == 0)
1446 			continue;
1447 		for (i = 0; i < 64; i++) {
1448 			if (!ISSET64(usedmask, i))
1449 				continue;
1450 			state = &STATES_CHUNK(pg, c)->state[i];
1451 			/*
1452 			 * If the STALE bit is set, the state is already
1453 			 * unlinked from the host's hash table.
1454 			 * Thus we can just reset the bit in the mask and
1455 			 * schedule destroying in the next epoch call.
1456 			 */
1457 			if (ISSET32(state->flags, NAT64_BIT_STALE)) {
1458 				FREEMASK_BTS(pg, c, i);
1459 				continue;
1460 			}
1461 			/*
1462 			 * There is a small window when the bit has been
1463 			 * grabbed from the freemask, but the state is not
1464 			 * yet linked into the host's hash table.
1465 			 * Check the READY flag; it is set just after
1466 			 * linking. If it is not set, defer cleanup
1467 			 * to the next call.
1468 			 */
1469 			if (ISSET32(state->flags, NAT64_BIT_READY_IPV4)) {
1470 				struct nat64lsn_host *host;
1471 
1472 				host = state->host;
1473 				HOST_LOCK(host);
1474 				CK_SLIST_REMOVE(&STATE_HASH(host,
1475 				    state->hval), state, nat64lsn_state,
1476 				    entries);
1477 				host->states_count--;
1478 				HOST_UNLOCK(host);
1479 				nat64lsn_state_cleanup(state);
1480 			}
1481 		}
1482 	}
1483 }
1484 
1485 static void
1486 nat64lsn_job_destroy(epoch_context_t ctx)
1487 {
1488 	struct nat64lsn_hosts_slist hosts;
1489 	struct nat64lsn_pg_slist portgroups;
1490 	struct nat64lsn_job_item *ji;
1491 	struct nat64lsn_host *host;
1492 	struct nat64lsn_pg *pg;
1493 	int i;
1494 
1495 	CK_SLIST_INIT(&hosts);
1496 	CK_SLIST_INIT(&portgroups);
1497 	ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx);
1498 	MPASS(ji->jtype == JTYPE_DESTROY);
1499 	while (!CK_SLIST_EMPTY(&ji->hosts)) {
1500 		host = CK_SLIST_FIRST(&ji->hosts);
1501 		CK_SLIST_REMOVE_HEAD(&ji->hosts, entries);
1502 		if (host->states_count > 0) {
1503 			/*
1504 			 * The state has been created during host deletion.
1505 			 */
1506 			printf("NAT64LSN: %s: destroying host with %d "
1507 			    "states\n", __func__, host->states_count);
1508 			/*
1509 			 * We need to clean up these states to avoid
1510 			 * possible access to already deleted host in
1511 			 * the state expiration code.
1512 			 */
1513 			nat64lsn_host_cleanup(host);
1514 			CK_SLIST_INSERT_HEAD(&hosts, host, entries);
1515 			/*
1516 			 * Keep host entry for next deferred destroying.
1517 			 * In the next epoch its states will not be
1518 			 * accessible.
1519 			 */
1520 			continue;
1521 		}
1522 		nat64lsn_destroy_host(host);
1523 	}
1524 	while (!CK_SLIST_EMPTY(&ji->portgroups)) {
1525 		pg = CK_SLIST_FIRST(&ji->portgroups);
1526 		CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries);
1527 		for (i = 0; i < pg->chunks_count; i++) {
1528 			if (FREEMASK_BITCOUNT(pg, i) != 64) {
1529 				/*
1530 				 * A state has been created during
1531 				 * PG deletion.
1532 				 */
1533 				printf("NAT64LSN: %s: destroying PG %p "
1534 				    "with non-empty chunk %d\n", __func__,
1535 				    pg, i);
1536 				nat64lsn_pg_cleanup(pg);
1537 				CK_SLIST_INSERT_HEAD(&portgroups,
1538 				    pg, entries);
1539 				i = -1;
1540 				break;
1541 			}
1542 		}
1543 		if (i != -1)
1544 			nat64lsn_destroy_pg(pg);
1545 	}
1546 	if (CK_SLIST_EMPTY(&hosts) &&
1547 	    CK_SLIST_EMPTY(&portgroups)) {
1548 		uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk);
1549 		uma_zfree(nat64lsn_job_zone, ji);
1550 		return;
1551 	}
1552 
1553 	/* Schedule job item again */
1554 	CK_SLIST_MOVE(&ji->hosts, &hosts, entries);
1555 	CK_SLIST_MOVE(&ji->portgroups, &portgroups, entries);
1556 	NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, nat64lsn_job_destroy);
1557 }
1558 
1559 static int
1560 nat64lsn_request_host(struct nat64lsn_cfg *cfg,
1561     const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval,
1562     in_addr_t faddr, uint16_t port, uint8_t proto)
1563 {
1564 	struct nat64lsn_job_item *ji;
1565 
1566 	ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST);
1567 	if (ji != NULL) {
1568 		ji->m = *mp;
1569 		ji->f_id = *f_id;
1570 		ji->faddr = faddr;
1571 		ji->port = port;
1572 		ji->proto = proto;
1573 		ji->src6_hval = hval;
1574 
1575 		nat64lsn_enqueue_job(cfg, ji);
1576 		NAT64STAT_INC(&cfg->base.stats, jhostsreq);
1577 		*mp = NULL;
1578 	}
1579 	return (IP_FW_DENY);
1580 }
1581 
1582 static int
1583 nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,
1584     const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval,
1585     in_addr_t faddr, uint16_t port, uint8_t proto)
1586 {
1587 	struct nat64lsn_job_item *ji;
1588 
1589 	ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP);
1590 	if (ji != NULL) {
1591 		ji->m = *mp;
1592 		ji->f_id = *f_id;
1593 		ji->faddr = faddr;
1594 		ji->port = port;
1595 		ji->proto = proto;
1596 		ji->state_hval = hval;
1597 		ji->host = host;
1598 
1599 		nat64lsn_enqueue_job(cfg, ji);
1600 		NAT64STAT_INC(&cfg->base.stats, jportreq);
1601 		*mp = NULL;
1602 	}
1603 	return (IP_FW_DENY);
1604 }
1605 
1606 static int
1607 nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp,
1608     struct nat64lsn_state *state, uint8_t flags)
1609 {
1610 	struct pfloghdr loghdr, *logdata;
1611 	int ret;
1612 	uint16_t ts;
1613 
1614 	/* Update timestamp and flags if needed */
1615 	SET_AGE(ts);
1616 	if (state->timestamp != ts)
1617 		state->timestamp = ts;
1618 	if ((state->flags & flags) != flags)
1619 		state->flags |= flags;
1620 
1621 	if (cfg->base.flags & NAT64_LOG) {
1622 		logdata = &loghdr;
1623 		nat64lsn_log(logdata, *mp, AF_INET6, state);
1624 	} else
1625 		logdata = NULL;
1626 
1627 	ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src),
1628 	    htons(state->aport), &cfg->base, logdata);
1629 	if (ret == NAT64SKIP)
1630 		return (cfg->nomatch_verdict);
1631 	if (ret == NAT64RETURN)
1632 		*mp = NULL;
1633 	return (IP_FW_DENY);
1634 }
1635 
1636 static int
1637 nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,
1638     struct mbuf **mp)
1639 {
1640 	struct nat64lsn_state *state;
1641 	struct nat64lsn_host *host;
1642 	struct icmp6_hdr *icmp6;
1643 	uint32_t addr, hval, data[2];
1644 	int offset, proto;
1645 	uint16_t port;
1646 	uint8_t flags;
1647 
1648 	/* Check if protocol is supported */
1649 	port = f_id->src_port;
1650 	proto = f_id->proto;
1651 	switch (f_id->proto) {
1652 	case IPPROTO_ICMPV6:
1653 		/*
1654 		 * For ICMPv6 echo reply/request we use icmp6_id as
1655 		 * local port.
1656 		 */
1657 		offset = 0;
1658 		proto = nat64_getlasthdr(*mp, &offset);
1659 		if (proto < 0) {
1660 			NAT64STAT_INC(&cfg->base.stats, dropped);
1661 			DPRINTF(DP_DROPS, "mbuf isn't contiguous");
1662 			return (IP_FW_DENY);
1663 		}
1664 		if (proto == IPPROTO_ICMPV6) {
1665 			icmp6 = mtodo(*mp, offset);
1666 			if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||
1667 			    icmp6->icmp6_type == ICMP6_ECHO_REPLY)
1668 				port = ntohs(icmp6->icmp6_id);
1669 		}
1670 		proto = IPPROTO_ICMP;
1671 		/* FALLTHROUGH */
1672 	case IPPROTO_TCP:
1673 	case IPPROTO_UDP:
1674 		break;
1675 	default:
1676 		NAT64STAT_INC(&cfg->base.stats, noproto);
1677 		return (cfg->nomatch_verdict);
1678 	}
1679 
1680 	/* Extract IPv4 from destination IPv6 address */
1681 	addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen);
1682 	if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) {
1683 		char a[INET_ADDRSTRLEN];
1684 
1685 		NAT64STAT_INC(&cfg->base.stats, dropped);
1686 		DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s",
1687 		    inet_ntop(AF_INET, &addr, a, sizeof(a)));
1688 		return (IP_FW_DENY); /* XXX: add extra stats? */
1689 	}
1690 
1691 	/* Try to find host */
1692 	hval = HOST_HVAL(cfg, &f_id->src_ip6);
1693 	CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) {
1694 		if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr))
1695 			break;
1696 	}
1697 	/* We use IPv4 address in host byte order */
1698 	addr = ntohl(addr);
1699 	if (host == NULL)
1700 		return (nat64lsn_request_host(cfg, f_id, mp,
1701 		    hval, addr, port, proto));
1702 
1703 	flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags);
1704 
1705 	data[0] = addr;
1706 	data[1] = (f_id->dst_port << 16) | port;
1707 	hval = STATE_HVAL(cfg, data);
1708 	state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr,
1709 	    port, proto);
1710 	if (state == NULL)
1711 		return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr,
1712 		    port, proto));
1713 	return (nat64lsn_translate6_internal(cfg, mp, state, flags));
1714 }
1715 
1716 /*
1717  * Main dataplane entry point.
1718  */
1719 int
1720 ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
1721     ipfw_insn *cmd, int *done)
1722 {
1723 	struct nat64lsn_instance *i;
1724 	ipfw_insn *icmd;
1725 	int ret;
1726 
1727 	IPFW_RLOCK_ASSERT(ch);
1728 
1729 	*done = 0;	/* continue the search in case of failure */
1730 	icmd = cmd + F_LEN(cmd);
1731 	if (cmd->opcode != O_EXTERNAL_ACTION ||
1732 	    insntod(cmd, kidx)->kidx != V_nat64lsn_eid ||
1733 	    icmd->opcode != O_EXTERNAL_INSTANCE ||
1734 	    (i = NAT64_LOOKUP(ch, icmd)) == NULL)
1735 		return (IP_FW_DENY);
1736 
1737 	*done = 1;	/* terminate the search */
1738 
1739 	switch (args->f_id.addr_type) {
1740 	case 4:
1741 		ret = nat64lsn_translate4(i->cfg, &args->f_id, &args->m);
1742 		break;
1743 	case 6:
1744 		/*
1745 		 * Check that destination IPv6 address matches our prefix6.
1746 		 */
1747 		if ((i->cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 &&
1748 		    memcmp(&args->f_id.dst_ip6, &i->cfg->base.plat_prefix,
1749 		    i->cfg->base.plat_plen / 8) != 0) {
1750 			ret = i->cfg->nomatch_verdict;
1751 			break;
1752 		}
1753 		ret = nat64lsn_translate6(i->cfg, &args->f_id, &args->m);
1754 		break;
1755 	default:
1756 		ret = i->cfg->nomatch_verdict;
1757 	}
1758 
1759 	if (ret != IP_FW_PASS && args->m != NULL) {
1760 		m_freem(args->m);
1761 		args->m = NULL;
1762 	}
1763 	return (ret);
1764 }
1765 
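/*
 * UMA constructor for a states chunk: only the flags of the 64 states
 * need to be cleared here; the remaining fields are filled in when a
 * state is taken into use.
 */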
1766 static int
1767 nat64lsn_state_ctor(void *mem, int size, void *arg, int flags)
1768 {
1769 	struct nat64lsn_states_chunk *chunk;
1770 	int i;
1771 
1772 	chunk = (struct nat64lsn_states_chunk *)mem;
1773 	for (i = 0; i < 64; i++)
1774 		chunk->state[i].flags = 0;
1775 	return (0);
1776 }
1777 
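/*
 * Module initialization: create the UMA zones shared by all NAT64LSN
 * instances (hosts, portgroup chunks, portgroups, alias links, state
 * chunks and job items) and set up the job queue lock.
 */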
1778 void
1779 nat64lsn_init_internal(void)
1780 {
1781 
1782 	nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts",
1783 	    sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL,
1784 	    UMA_ALIGN_PTR, 0);
1785 	nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks",
1786 	    sizeof(struct nat64lsn_pgchunk), NULL, NULL, NULL, NULL,
1787 	    UMA_ALIGN_PTR, 0);
1788 	nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups",
1789 	    sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL,
1790 	    UMA_ALIGN_PTR, 0);
1791 	nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links",
1792 	    sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL,
1793 	    UMA_ALIGN_PTR, 0);
1794 	nat64lsn_state_zone = uma_zcreate("NAT64LSN states",
1795 	    sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor,
1796 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1797 	nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs",
1798 	    sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL,
1799 	    UMA_ALIGN_PTR, 0);
1800 	JQUEUE_LOCK_INIT();
1801 }
1802 
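/*
 * Module teardown: destroy the job queue lock and all UMA zones
 * created in nat64lsn_init_internal().
 */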
1803 void
1804 nat64lsn_uninit_internal(void)
1805 {
1806 
1807 	/* XXX: epoch_task drain */
1808 	JQUEUE_LOCK_DESTROY();
1809 	uma_zdestroy(nat64lsn_host_zone);
1810 	uma_zdestroy(nat64lsn_pgchunk_zone);
1811 	uma_zdestroy(nat64lsn_pg_zone);
1812 	uma_zdestroy(nat64lsn_aliaslink_zone);
1813 	uma_zdestroy(nat64lsn_state_zone);
1814 	uma_zdestroy(nat64lsn_job_zone);
1815 }
1816 
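/*
 * Arm the per-instance periodic callout; nat64lsn_periodic() will
 * fire for the first time PERIODIC_DELAY seconds from now.
 */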
1817 void
1818 nat64lsn_start_instance(struct nat64lsn_cfg *cfg)
1819 {
1820 
1821 	CALLOUT_LOCK(cfg);
1822 	callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,
1823 	    nat64lsn_periodic, cfg);
1824 	CALLOUT_UNLOCK(cfg);
1825 }
1826 
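/*
 * Allocate and initialize a new instance configuration: statistics
 * counters, the hosts hash and one alias entry per IPv4 address of
 * the pool prefix (given in host byte order).  For example, a /28
 * pool yields naddr = 1 << (32 - 28) = 16 aliases.  An instance is
 * normally created from userland with something like (see ipfw(8)
 * for the full syntax):
 *
 *	ipfw nat64lsn NAME create prefix4 198.51.100.0/28
 */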
1827 struct nat64lsn_cfg *
1828 nat64lsn_init_config(struct ip_fw_chain *ch, in_addr_t prefix, int plen)
1829 {
1830 	struct nat64lsn_cfg *cfg;
1831 	struct nat64lsn_alias *alias;
1832 	int i, naddr;
1833 
1834 	cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN,
1835 	    M_WAITOK | M_ZERO);
1836 
1837 	CFG_LOCK_INIT(cfg);
1838 	CALLOUT_LOCK_INIT(cfg);
1839 	STAILQ_INIT(&cfg->jhead);
1840 	cfg->vp = curvnet;
1841 	COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK);
1842 
1843 	cfg->hash_seed = arc4random();
1844 	cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE;
1845 	cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) *
1846 	    cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO);
1847 	for (i = 0; i < cfg->hosts_hashsize; i++)
1848 		CK_SLIST_INIT(&cfg->hosts_hash[i]);
1849 
1850 	naddr = 1 << (32 - plen);
1851 	cfg->prefix4 = prefix;
1852 	cfg->pmask4 = prefix | (naddr - 1);
1853 	cfg->plen4 = plen;
1854 	cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr,
1855 	    M_NAT64LSN, M_WAITOK | M_ZERO);
1856 	for (i = 0; i < naddr; i++) {
1857 		alias = &cfg->aliases[i];
1858 		alias->addr = prefix + i; /* host byte order */
1859 		CK_SLIST_INIT(&alias->hosts);
1860 		ALIAS_LOCK_INIT(alias);
1861 	}
1862 
1863 	callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0);
1864 	callout_init(&cfg->jcallout, CALLOUT_MPSAFE);
1865 
1866 	return (cfg);
1867 }
1868 
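/*
 * Free a portgroup.  A single-chunk PG keeps its states and freemask
 * inline; a multi-chunk PG also owns the states_chunk/freemask_chunk
 * arrays, which are released here as well.
 */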
1869 static void
1870 nat64lsn_destroy_pg(struct nat64lsn_pg *pg)
1871 {
1872 	int i;
1873 
1874 	if (pg->chunks_count == 1) {
1875 		uma_zfree(nat64lsn_state_zone, pg->states);
1876 	} else {
1877 		for (i = 0; i < pg->chunks_count; i++)
1878 			uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);
1879 		free(pg->states_chunk, M_NAT64LSN);
1880 		free(pg->freemask_chunk, M_NAT64LSN);
1881 	}
1882 	uma_zfree(nat64lsn_pg_zone, pg);
1883 }
1884 
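/*
 * Release everything attached to an alias address: all portgroups and
 * any TCP/UDP/ICMP pgchunk slots marked allocated in the per-protocol
 * chunkmask.
 */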
1885 static void
1886 nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg,
1887     struct nat64lsn_alias *alias)
1888 {
1889 	struct nat64lsn_pg *pg;
1890 	int i;
1891 
1892 	while (!CK_SLIST_EMPTY(&alias->portgroups)) {
1893 		pg = CK_SLIST_FIRST(&alias->portgroups);
1894 		CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries);
1895 		nat64lsn_destroy_pg(pg);
1896 	}
1897 	for (i = 0; i < 32; i++) {
1898 		if (ISSET32(alias->tcp_chunkmask, i))
1899 			uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]);
1900 		if (ISSET32(alias->udp_chunkmask, i))
1901 			uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]);
1902 		if (ISSET32(alias->icmp_chunkmask, i))
1903 			uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]);
1904 	}
1905 	ALIAS_LOCK_DESTROY(alias);
1906 }
1907 
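/*
 * Unlink the host from every alias it is attached to (updating the
 * alias' hosts_count under the alias lock), then free the alias
 * links, the states hash and the host itself.
 */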
1908 static void
1909 nat64lsn_destroy_host(struct nat64lsn_host *host)
1910 {
1911 	struct nat64lsn_aliaslink *link;
1912 
1913 	while (!CK_SLIST_EMPTY(&host->aliases)) {
1914 		link = CK_SLIST_FIRST(&host->aliases);
1915 		CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries);
1916 
1917 		ALIAS_LOCK(link->alias);
1918 		CK_SLIST_REMOVE(&link->alias->hosts, link,
1919 		    nat64lsn_aliaslink, alias_entries);
1920 		link->alias->hosts_count--;
1921 		ALIAS_UNLOCK(link->alias);
1922 
1923 		uma_zfree(nat64lsn_aliaslink_zone, link);
1924 	}
1925 	HOST_LOCK_DESTROY(host);
1926 	free(host->states_hash, M_NAT64LSN);
1927 	uma_zfree(nat64lsn_host_zone, host);
1928 }
1929 
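/*
 * Full instance teardown: drain both callouts, destroy every host
 * still present in the hash and every alias, then free the locks,
 * counters and memory owned by the configuration.
 */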
1930 void
1931 nat64lsn_destroy_config(struct nat64lsn_cfg *cfg)
1932 {
1933 	struct nat64lsn_host *host;
1934 	int i;
1935 
1936 	CALLOUT_LOCK(cfg);
1937 	callout_drain(&cfg->periodic);
1938 	CALLOUT_UNLOCK(cfg);
1939 	callout_drain(&cfg->jcallout);
1940 
1941 	for (i = 0; i < cfg->hosts_hashsize; i++) {
1942 		while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) {
1943 			host = CK_SLIST_FIRST(&cfg->hosts_hash[i]);
1944 			CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries);
1945 			nat64lsn_destroy_host(host);
1946 		}
1947 	}
1948 
1949 	for (i = 0; i < (1 << (32 - cfg->plen4)); i++)
1950 		nat64lsn_destroy_alias(cfg, &cfg->aliases[i]);
1951 
1952 	CALLOUT_LOCK_DESTROY(cfg);
1953 	CFG_LOCK_DESTROY(cfg);
1954 	COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS);
1955 	free(cfg->hosts_hash, M_NAT64LSN);
1956 	free(cfg->aliases, M_NAT64LSN);
1957 	free(cfg, M_NAT64LSN);
1958 }
1959 
1960