xref: /freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c (revision 7815283df299be63807225a9fe9b6e54406eae28)
1 /*-
2  * Copyright (c) 2015-2016 Yandex LLC
3  * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
4  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/counter.h>
35 #include <sys/errno.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/module.h>
41 #include <sys/rmlock.h>
42 #include <sys/rwlock.h>
43 #include <sys/socket.h>
44 #include <sys/queue.h>
45 #include <sys/syslog.h>
46 #include <sys/sysctl.h>
47 
48 #include <net/if.h>
49 #include <net/if_var.h>
50 #include <net/if_pflog.h>
51 #include <net/pfil.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip_var.h>
56 #include <netinet/ip_fw.h>
57 #include <netinet/ip6.h>
58 #include <netinet/icmp6.h>
59 #include <netinet/ip_icmp.h>
60 #include <netinet/tcp.h>
61 #include <netinet/udp.h>
62 #include <netinet6/in6_var.h>
63 #include <netinet6/ip6_var.h>
64 #include <netinet6/ip_fw_nat64.h>
65 
66 #include <netpfil/ipfw/ip_fw_private.h>
67 #include <netpfil/pf/pf.h>
68 
69 #include "nat64lsn.h"
70 
71 MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");
72 
73 static void nat64lsn_periodic(void *data);
74 #define	PERIODIC_DELAY	4
75 static uint8_t nat64lsn_proto_map[256];
76 uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
77 
78 #define	NAT64_FLAG_FIN		0x01	/* FIN was seen */
79 #define	NAT64_FLAG_SYN		0x02	/* First syn in->out */
80 #define	NAT64_FLAG_ESTAB	0x04	/* Packet with Ack */
81 #define	NAT64_FLAGS_TCP	(NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)
82 
83 #define	NAT64_FLAG_RDR		0x80	/* Port redirect */
84 #define	NAT64_LOOKUP(chain, cmd)	\
85 	(struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
/*
 * Kinds of deferred requests carried by the job queue.  The queue is
 * used to create new hosts and new portgroups, and to delete portgroups
 * that the periodic handler found to be stale.
 */
enum nat64lsn_jtype {
	JTYPE_NEWHOST = 1,		/* create a new host */
	JTYPE_NEWPORTGROUP = 2,		/* create a new portgroup */
	JTYPE_DELPORTGROUP = 3,		/* delete stale portgroups */
};
95 
96 struct nat64lsn_job_item {
97 	TAILQ_ENTRY(nat64lsn_job_item)	next;
98 	enum nat64lsn_jtype	jtype;
99 	struct nat64lsn_host	*nh;
100 	struct nat64lsn_portgroup	*pg;
101 	void			*spare_idx;
102 	struct in6_addr		haddr;
103 	uint8_t			nat_proto;
104 	uint8_t			done;
105 	int			needs_idx;
106 	int			delcount;
107 	unsigned int		fhash;	/* Flow hash */
108 	uint32_t		aaddr;	/* Last used address (net) */
109 	struct mbuf		*m;
110 	struct ipfw_flow_id	f_id;
111 	uint64_t		delmask[NAT64LSN_PGPTRNMASK];
112 };
113 
114 static struct mtx jmtx;
115 #define	JQUEUE_LOCK_INIT()	mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
116 #define	JQUEUE_LOCK_DESTROY()	mtx_destroy(&jmtx)
117 #define	JQUEUE_LOCK()		mtx_lock(&jmtx)
118 #define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)
119 
120 static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
121     struct nat64lsn_job_item *ji);
122 static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
123     struct nat64lsn_job_head *jhead, int jlen);
124 
125 static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg,
126     const struct ipfw_flow_id *f_id, int jtype);
127 static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
128     const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
129     int needs_idx);
130 static int nat64lsn_request_host(struct nat64lsn_cfg *cfg,
131     const struct ipfw_flow_id *f_id, struct mbuf **pm);
132 static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
133     const struct ipfw_flow_id *f_id, struct mbuf **pm);
134 static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
135     struct ipfw_flow_id *f_id, struct mbuf **pm);
136 
137 static int alloc_portgroup(struct nat64lsn_job_item *ji);
138 static void destroy_portgroup(struct nat64lsn_portgroup *pg);
139 static void destroy_host6(struct nat64lsn_host *nh);
140 static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
141 
142 static int attach_portgroup(struct nat64lsn_cfg *cfg,
143     struct nat64lsn_job_item *ji);
144 static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
145 
146 
147 /* XXX tmp */
148 static uma_zone_t nat64lsn_host_zone;
149 static uma_zone_t nat64lsn_pg_zone;
150 static uma_zone_t nat64lsn_pgidx_zone;
151 
152 static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg,
153     struct nat64lsn_host *nh);
154 
/*
 * Accessors handed (by the I6_ prefix) to the generic CHT_* hash table
 * macros for the table of IPv6 hosts.  The key is the 16-byte address;
 * the lock/unlock hooks are intentionally empty.
 */
#define	I6_hash(_k)		(djb_hash((const unsigned char *)(_k), 16))
#define	I6_first(_tbl, _h)	(_tbl)[_h]
#define	I6_next(_e)		(_e)->next
#define	I6_val(_e)		(&(_e)->addr)
#define	I6_cmp(_l, _r)		IN6_ARE_ADDR_EQUAL(_l, _r)
#define	I6_lock(_l, _r)
#define	I6_unlock(_l, _r)

/* Host hash operations on instance @_cfg. */
#define	I6HASH_FIND(_cfg, _res, _a) \
	CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a)
#define	I6HASH_INSERT(_cfg, _i)	\
	CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i)
#define	I6HASH_REMOVE(_cfg, _res, _tmp, _a)	\
	CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a)

#define	I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg)	\
	CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg)

/* Hash of the first 8 bytes at @x. */
#define	HASH_IN4(x)	djb_hash((const unsigned char *)(x), 8)
174 
/*
 * Hash @len bytes starting at @h: for every byte the accumulator is
 * multiplied by 33 and then XORed with the byte.  A non-positive @len
 * yields 0.
 */
static unsigned
djb_hash(const unsigned char *h, const int len)
{
	unsigned int acc;
	int left;

	acc = 0;
	left = len;
	while (left-- > 0)
		acc = (acc * 33) ^ *h++;

	return (acc);
}
186 
187 /*
188 static size_t
189 bitmask_size(size_t num, int *level)
190 {
191 	size_t x;
192 	int c;
193 
194 	for (c = 0, x = num; num > 1; num /= 64, c++)
195 		;
196 
197 	return (x);
198 }
199 
200 static void
201 bitmask_prepare(uint64_t *pmask, size_t bufsize, int level)
202 {
203 	size_t x, z;
204 
205 	memset(pmask, 0xFF, bufsize);
206 	for (x = 0, z = 1; level > 1; x += z, z *= 64, level--)
207 		;
208 	pmask[x] ~= 0x01;
209 }
210 */
211 
212 static void
213 nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
214     uint32_t n, uint32_t sn)
215 {
216 
217 	memset(plog, 0, sizeof(*plog));
218 	plog->length = PFLOG_REAL_HDRLEN;
219 	plog->af = family;
220 	plog->action = PF_NAT;
221 	plog->dir = PF_IN;
222 	plog->rulenr = htonl(n);
223 	plog->subrulenr = htonl(sn);
224 	plog->ruleset[0] = '\0';
225 	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
226 	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
227 }
228 /*
229  * Inspects icmp packets to see if the message contains different
230  * packet header so we need to alter @addr and @port.
231  */
232 static int
233 inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr,
234     uint16_t *port)
235 {
236 	struct ip *ip;
237 	struct tcphdr *tcp;
238 	struct udphdr *udp;
239 	struct icmphdr *icmp;
240 	int off;
241 	uint8_t proto;
242 
243 	ip = mtod(*m, struct ip *); /* Outer IP header */
244 	off = (ip->ip_hl << 2) + ICMP_MINLEN;
245 	if ((*m)->m_len < off)
246 		*m = m_pullup(*m, off);
247 	if (*m == NULL)
248 		return (ENOMEM);
249 
250 	ip = mtod(*m, struct ip *); /* Outer IP header */
251 	icmp = L3HDR(ip, struct icmphdr *);
252 	switch (icmp->icmp_type) {
253 	case ICMP_ECHO:
254 	case ICMP_ECHOREPLY:
255 		/* Use icmp ID as distinguisher */
256 		*port = ntohs(*((uint16_t *)(icmp + 1)));
257 		return (0);
258 	case ICMP_UNREACH:
259 	case ICMP_TIMXCEED:
260 		break;
261 	default:
262 		return (EOPNOTSUPP);
263 	}
264 	/*
265 	 * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits
266 	 * of ULP header.
267 	 */
268 	if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
269 		return (EINVAL);
270 	if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
271 		*m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN);
272 	if (*m == NULL)
273 		return (ENOMEM);
274 	ip = mtodo(*m, off); /* Inner IP header */
275 	proto = ip->ip_p;
276 	off += ip->ip_hl << 2; /* Skip inner IP header */
277 	*addr = ntohl(ip->ip_src.s_addr);
278 	if ((*m)->m_len < off + ICMP_MINLEN)
279 		*m = m_pullup(*m, off + ICMP_MINLEN);
280 	if (*m == NULL)
281 		return (ENOMEM);
282 	switch (proto) {
283 	case IPPROTO_TCP:
284 		tcp = mtodo(*m, off);
285 		*nat_proto = NAT_PROTO_TCP;
286 		*port = ntohs(tcp->th_sport);
287 		return (0);
288 	case IPPROTO_UDP:
289 		udp = mtodo(*m, off);
290 		*nat_proto = NAT_PROTO_UDP;
291 		*port = ntohs(udp->uh_sport);
292 		return (0);
293 	case IPPROTO_ICMP:
294 		/*
295 		 * We will translate only ICMP errors for our ICMP
296 		 * echo requests.
297 		 */
298 		icmp = mtodo(*m, off);
299 		if (icmp->icmp_type != ICMP_ECHO)
300 			return (EOPNOTSUPP);
301 		*port = ntohs(*((uint16_t *)(icmp + 1)));
302 		return (0);
303 	};
304 	return (EOPNOTSUPP);
305 }
306 
307 static inline uint8_t
308 convert_tcp_flags(uint8_t flags)
309 {
310 	uint8_t result;
311 
312 	result = flags & (TH_FIN|TH_SYN);
313 	result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */
314 	result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */
315 
316 	return (result);
317 }
318 
319 static NAT64NOINLINE int
320 nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
321     struct mbuf **pm)
322 {
323 	struct pfloghdr loghdr, *logdata;
324 	struct in6_addr src6;
325 	struct nat64lsn_portgroup *pg;
326 	struct nat64lsn_host *nh;
327 	struct nat64lsn_state *st;
328 	struct ip *ip;
329 	uint32_t addr;
330 	uint16_t state_flags, state_ts;
331 	uint16_t port, lport;
332 	uint8_t nat_proto;
333 	int ret;
334 
335 	addr = f_id->dst_ip;
336 	port = f_id->dst_port;
337 	if (addr < cfg->prefix4 || addr > cfg->pmask4) {
338 		NAT64STAT_INC(&cfg->base.stats, nomatch4);
339 		return (cfg->nomatch_verdict);
340 	}
341 
342 	/* Check if protocol is supported and get its short id */
343 	nat_proto = nat64lsn_proto_map[f_id->proto];
344 	if (nat_proto == 0) {
345 		NAT64STAT_INC(&cfg->base.stats, noproto);
346 		return (cfg->nomatch_verdict);
347 	}
348 
349 	/* We might need to handle icmp differently */
350 	if (nat_proto == NAT_PROTO_ICMP) {
351 		ret = inspect_icmp_mbuf(pm, &nat_proto, &addr, &port);
352 		if (ret != 0) {
353 			if (ret == ENOMEM) {
354 				NAT64STAT_INC(&cfg->base.stats, nomem);
355 				return (IP_FW_DENY);
356 			}
357 			NAT64STAT_INC(&cfg->base.stats, noproto);
358 			return (cfg->nomatch_verdict);
359 		}
360 		/* XXX: Check addr for validity */
361 		if (addr < cfg->prefix4 || addr > cfg->pmask4) {
362 			NAT64STAT_INC(&cfg->base.stats, nomatch4);
363 			return (cfg->nomatch_verdict);
364 		}
365 	}
366 
367 	/* Calc portgroup offset w.r.t protocol */
368 	pg = GET_PORTGROUP(cfg, addr, nat_proto, port);
369 
370 	/* Check if this port is occupied by any portgroup */
371 	if (pg == NULL) {
372 		NAT64STAT_INC(&cfg->base.stats, nomatch4);
373 #if 0
374 		DPRINTF(DP_STATE, "NOMATCH %u %d %d (%d)", addr, nat_proto, port,
375 		    _GET_PORTGROUP_IDX(cfg, addr, nat_proto, port));
376 #endif
377 		return (cfg->nomatch_verdict);
378 	}
379 
380 	/* TODO: Check flags to see if we need to do some static mapping */
381 	nh = pg->host;
382 
383 	/* Prepare some fields we might need to update */
384 	SET_AGE(state_ts);
385 	ip = mtod(*pm, struct ip *);
386 	if (ip->ip_p == IPPROTO_TCP)
387 		state_flags = convert_tcp_flags(
388 		    L3HDR(ip, struct tcphdr *)->th_flags);
389 	else
390 		state_flags = 0;
391 
392 	/* Lock host and get port mapping */
393 	NAT64_LOCK(nh);
394 
395 	st = &pg->states[port & (NAT64_CHUNK_SIZE - 1)];
396 	if (st->timestamp != state_ts)
397 		st->timestamp = state_ts;
398 	if ((st->flags & state_flags) != state_flags)
399 		st->flags |= state_flags;
400 	lport = htons(st->u.s.lport);
401 
402 	NAT64_UNLOCK(nh);
403 
404 	if (cfg->base.flags & NAT64_LOG) {
405 		logdata = &loghdr;
406 		nat64lsn_log(logdata, *pm, AF_INET, pg->idx, st->cur.off);
407 	} else
408 		logdata = NULL;
409 
410 	nat64_embed_ip4(&cfg->base, htonl(f_id->src_ip), &src6);
411 	ret = nat64_do_handle_ip4(*pm, &src6, &nh->addr, lport,
412 	    &cfg->base, logdata);
413 
414 	if (ret == NAT64SKIP)
415 		return (cfg->nomatch_verdict);
416 	if (ret == NAT64MFREE)
417 		m_freem(*pm);
418 	*pm = NULL;
419 
420 	return (IP_FW_DENY);
421 }
422 
423 void
424 nat64lsn_dump_state(const struct nat64lsn_cfg *cfg,
425    const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st,
426    const char *px, int off)
427 {
428 	char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN];
429 
430 	if ((V_nat64_debug & DP_STATE) == 0)
431 		return;
432 	inet_ntop(AF_INET6, &pg->host->addr, s, sizeof(s));
433 	inet_ntop(AF_INET, &pg->aaddr, a, sizeof(a));
434 	inet_ntop(AF_INET, &st->u.s.faddr, d, sizeof(d));
435 
436 	DPRINTF(DP_STATE, "%s: PG %d ST [%p|%d]: %s:%d/%d <%s:%d> "
437 	    "%s:%d AGE %d", px, pg->idx, st, off,
438 	    s, st->u.s.lport, pg->nat_proto, a, pg->aport + off,
439 	    d, st->u.s.fport, GET_AGE(st->timestamp));
440 }
441 
442 /*
443  * Check if particular TCP state is stale and should be deleted.
444  * Return 1 if true, 0 otherwise.
445  */
446 static int
447 nat64lsn_periodic_check_tcp(const struct nat64lsn_cfg *cfg,
448     const struct nat64lsn_state *st, int age)
449 {
450 	int ttl;
451 
452 	if (st->flags & NAT64_FLAG_FIN)
453 		ttl = cfg->st_close_ttl;
454 	else if (st->flags & NAT64_FLAG_ESTAB)
455 		ttl = cfg->st_estab_ttl;
456 	else if (st->flags & NAT64_FLAG_SYN)
457 		ttl = cfg->st_syn_ttl;
458 	else
459 		ttl = cfg->st_syn_ttl;
460 
461 	if (age > ttl)
462 		return (1);
463 	return (0);
464 }
465 
466 /*
467  * Check if nat state @st is stale and should be deleted.
468  * Return 1 if true, 0 otherwise.
469  */
470 static NAT64NOINLINE int
471 nat64lsn_periodic_chkstate(const struct nat64lsn_cfg *cfg,
472     const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st)
473 {
474 	int age, delete;
475 
476 	age = GET_AGE(st->timestamp);
477 	delete = 0;
478 
479 	/* Skip immutable records */
480 	if (st->flags & NAT64_FLAG_RDR)
481 		return (0);
482 
483 	switch (pg->nat_proto) {
484 		case NAT_PROTO_TCP:
485 			delete = nat64lsn_periodic_check_tcp(cfg, st, age);
486 			break;
487 		case NAT_PROTO_UDP:
488 			if (age > cfg->st_udp_ttl)
489 				delete = 1;
490 			break;
491 		case NAT_PROTO_ICMP:
492 			if (age > cfg->st_icmp_ttl)
493 				delete = 1;
494 			break;
495 	}
496 
497 	return (delete);
498 }
499 
500 
501 /*
502  * The following structures and functions
503  * are used to perform SLIST_FOREACH_SAFE()
504  * analog for states identified by struct st_ptr.
505  */
506 
507 struct st_idx {
508 	struct nat64lsn_portgroup *pg;
509 	struct nat64lsn_state *st;
510 	struct st_ptr sidx_next;
511 };
512 
513 static struct st_idx *
514 st_first(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
515     struct st_ptr *sidx, struct st_idx *si)
516 {
517 	struct nat64lsn_portgroup *pg;
518 	struct nat64lsn_state *st;
519 
520 	if (sidx->idx == 0) {
521 		memset(si, 0, sizeof(*si));
522 		return (si);
523 	}
524 
525 	pg = PORTGROUP_BYSIDX(cfg, nh, sidx->idx);
526 	st = &pg->states[sidx->off];
527 
528 	si->pg = pg;
529 	si->st = st;
530 	si->sidx_next = st->next;
531 
532 	return (si);
533 }
534 
535 static struct st_idx *
536 st_next(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
537     struct st_idx *si)
538 {
539 	struct st_ptr sidx;
540 	struct nat64lsn_portgroup *pg;
541 	struct nat64lsn_state *st;
542 
543 	sidx = si->sidx_next;
544 	if (sidx.idx == 0) {
545 		memset(si, 0, sizeof(*si));
546 		si->st = NULL;
547 		si->pg = NULL;
548 		return (si);
549 	}
550 
551 	pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
552 	st = &pg->states[sidx.off];
553 
554 	si->pg = pg;
555 	si->st = st;
556 	si->sidx_next = st->next;
557 
558 	return (si);
559 }
560 
561 static struct st_idx *
562 st_save_cond(struct st_idx *si_dst, struct st_idx *si)
563 {
564 	if (si->st != NULL)
565 		*si_dst = *si;
566 
567 	return (si_dst);
568 }
569 
570 unsigned int
571 nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh)
572 {
573 	struct st_idx si, si_prev;
574 	int i;
575 	unsigned int delcount;
576 
577 	delcount = 0;
578 	for (i = 0; i < nh->hsize; i++) {
579 		memset(&si_prev, 0, sizeof(si_prev));
580 		for (st_first(cfg, nh, &nh->phash[i], &si);
581 		    si.st != NULL;
582 		    st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) {
583 			if (nat64lsn_periodic_chkstate(cfg, si.pg, si.st) == 0)
584 				continue;
585 			nat64lsn_dump_state(cfg, si.pg, si.st, "DELETE STATE",
586 			    si.st->cur.off);
587 			/* Unlink from hash */
588 			if (si_prev.st != NULL)
589 				si_prev.st->next = si.st->next;
590 			else
591 				nh->phash[i] = si.st->next;
592 			/* Delete state and free its data */
593 			PG_MARK_FREE_IDX(si.pg, si.st->cur.off);
594 			memset(si.st, 0, sizeof(struct nat64lsn_state));
595 			si.st = NULL;
596 			delcount++;
597 
598 			/* Update portgroup timestamp */
599 			SET_AGE(si.pg->timestamp);
600 		}
601 	}
602 	NAT64STAT_ADD(&cfg->base.stats, sdeleted, delcount);
603 	return (delcount);
604 }
605 
606 /*
607  * Checks if portgroup is not used and can be deleted,
608  * Returns 1 if stale, 0 otherwise
609  */
610 static int
611 stale_pg(const struct nat64lsn_cfg *cfg, const struct nat64lsn_portgroup *pg)
612 {
613 
614 	if (!PG_IS_EMPTY(pg))
615 		return (0);
616 	if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
617 		return (0);
618 	return (1);
619 }
620 
621 /*
622  * Checks if host record is not used and can be deleted,
623  * Returns 1 if stale, 0 otherwise
624  */
625 static int
626 stale_nh(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh)
627 {
628 
629 	if (nh->pg_used != 0)
630 		return (0);
631 	if (GET_AGE(nh->timestamp) < cfg->nh_delete_delay)
632 		return (0);
633 	return (1);
634 }
635 
636 struct nat64lsn_periodic_data {
637 	struct nat64lsn_cfg *cfg;
638 	struct nat64lsn_job_head jhead;
639 	int jlen;
640 };
641 
642 static NAT64NOINLINE int
643 nat64lsn_periodic_chkhost(struct nat64lsn_host *nh,
644     struct nat64lsn_periodic_data *d)
645 {
646 	struct nat64lsn_portgroup *pg;
647 	struct nat64lsn_job_item *ji;
648 	uint64_t delmask[NAT64LSN_PGPTRNMASK];
649 	int delcount, i;
650 
651 	delcount = 0;
652 	memset(delmask, 0, sizeof(delmask));
653 
654 	if (V_nat64_debug & DP_JQUEUE) {
655 		char a[INET6_ADDRSTRLEN];
656 
657 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
658 		DPRINTF(DP_JQUEUE, "Checking %s host %s on cpu %d",
659 		    stale_nh(d->cfg, nh) ? "stale" : "non-stale", a, curcpu);
660 	}
661 	if (!stale_nh(d->cfg, nh)) {
662 		/* Non-stale host. Inspect internals */
663 		NAT64_LOCK(nh);
664 
665 		/* Stage 1: Check&expire states */
666 		if (nat64lsn_periodic_chkstates(d->cfg, nh) != 0)
667 			SET_AGE(nh->timestamp);
668 
669 		/* Stage 2: Check if we need to expire */
670 		for (i = 0; i < nh->pg_used; i++) {
671 			pg = PORTGROUP_BYSIDX(d->cfg, nh, i + 1);
672 			if (pg == NULL)
673 				continue;
674 
675 			/* Check if we can delete portgroup */
676 			if (stale_pg(d->cfg, pg) == 0)
677 				continue;
678 
679 			DPRINTF(DP_JQUEUE, "Check PG %d", i);
680 			delmask[i / 64] |= ((uint64_t)1 << (i % 64));
681 			delcount++;
682 		}
683 
684 		NAT64_UNLOCK(nh);
685 		if (delcount == 0)
686 			return (0);
687 	}
688 
689 	DPRINTF(DP_JQUEUE, "Queueing %d portgroups for deleting", delcount);
690 	/* We have something to delete - add it to queue */
691 	ji = nat64lsn_create_job(d->cfg, NULL, JTYPE_DELPORTGROUP);
692 	if (ji == NULL)
693 		return (0);
694 
695 	ji->haddr = nh->addr;
696 	ji->delcount = delcount;
697 	memcpy(ji->delmask, delmask, sizeof(ji->delmask));
698 
699 	TAILQ_INSERT_TAIL(&d->jhead, ji, next);
700 	d->jlen++;
701 	return (0);
702 }
703 
704 /*
705  * This procedure is used to perform various maintance
706  * on dynamic hash list. Currently it is called every second.
707  */
708 static void
709 nat64lsn_periodic(void *data)
710 {
711 	struct ip_fw_chain *ch;
712 	IPFW_RLOCK_TRACKER;
713 	struct nat64lsn_cfg *cfg;
714 	struct nat64lsn_periodic_data d;
715 	struct nat64lsn_host *nh, *tmp;
716 
717 	cfg = (struct nat64lsn_cfg *) data;
718 	ch = cfg->ch;
719 	CURVNET_SET(cfg->vp);
720 
721 	memset(&d, 0, sizeof(d));
722 	d.cfg = cfg;
723 	TAILQ_INIT(&d.jhead);
724 
725 	IPFW_RLOCK(ch);
726 
727 	/* Stage 1: foreach host, check all its portgroups */
728 	I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_periodic_chkhost, &d);
729 
730 	/* Enqueue everything we have requested */
731 	nat64lsn_enqueue_jobs(cfg, &d.jhead, d.jlen);
732 
733 	callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
734 
735 	IPFW_RUNLOCK(ch);
736 
737 	CURVNET_RESTORE();
738 }
739 
740 static NAT64NOINLINE void
741 reinject_mbuf(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
742 {
743 
744 	if (ji->m == NULL)
745 		return;
746 
747 	/* Request has failed or packet type is wrong */
748 	if (ji->f_id.addr_type != 6 || ji->done == 0) {
749 		m_freem(ji->m);
750 		ji->m = NULL;
751 		NAT64STAT_INC(&cfg->base.stats, dropped);
752 		DPRINTF(DP_DROPS, "mbuf dropped: type %d, done %d",
753 		    ji->jtype, ji->done);
754 		return;
755 	}
756 
757 	/*
758 	 * XXX: Limit recursion level
759 	 */
760 
761 	NAT64STAT_INC(&cfg->base.stats, jreinjected);
762 	DPRINTF(DP_JQUEUE, "Reinject mbuf");
763 	nat64lsn_translate6(cfg, &ji->f_id, &ji->m);
764 }
765 
766 static void
767 destroy_portgroup(struct nat64lsn_portgroup *pg)
768 {
769 
770 	DPRINTF(DP_OBJ, "DESTROY PORTGROUP %d %p", pg->idx, pg);
771 	uma_zfree(nat64lsn_pg_zone, pg);
772 }
773 
774 static NAT64NOINLINE int
775 alloc_portgroup(struct nat64lsn_job_item *ji)
776 {
777 	struct nat64lsn_portgroup *pg;
778 
779 	pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
780 	if (pg == NULL)
781 		return (1);
782 
783 	if (ji->needs_idx != 0) {
784 		ji->spare_idx = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
785 		/* Failed alloc isn't always fatal, so don't check */
786 	}
787 	memset(&pg->freemask, 0xFF, sizeof(pg->freemask));
788 	pg->nat_proto = ji->nat_proto;
789 	ji->pg = pg;
790 	return (0);
791 
792 }
793 
794 static void
795 destroy_host6(struct nat64lsn_host *nh)
796 {
797 	char a[INET6_ADDRSTRLEN];
798 	int i;
799 
800 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
801 	DPRINTF(DP_OBJ, "DESTROY HOST %s %p (pg used %d)", a, nh,
802 	    nh->pg_used);
803 	NAT64_LOCK_DESTROY(nh);
804 	for (i = 0; i < nh->pg_allocated / NAT64LSN_PGIDX_CHUNK; i++)
805 		uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, i));
806 	uma_zfree(nat64lsn_host_zone, nh);
807 }
808 
809 static NAT64NOINLINE int
810 alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
811 {
812 	struct nat64lsn_host *nh;
813 	char a[INET6_ADDRSTRLEN];
814 
815 	nh = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
816 	if (nh == NULL)
817 		return (1);
818 	PORTGROUP_CHUNK(nh, 0) = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
819 	if (PORTGROUP_CHUNK(nh, 0) == NULL) {
820 		uma_zfree(nat64lsn_host_zone, nh);
821 		return (2);
822 	}
823 	if (alloc_portgroup(ji) != 0) {
824 		NAT64STAT_INC(&cfg->base.stats, jportfails);
825 		uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, 0));
826 		uma_zfree(nat64lsn_host_zone, nh);
827 		return (3);
828 	}
829 
830 	NAT64_LOCK_INIT(nh);
831 	nh->addr = ji->haddr;
832 	nh->hsize = NAT64LSN_HSIZE; /* XXX: hardcoded size */
833 	nh->pg_allocated = NAT64LSN_PGIDX_CHUNK;
834 	nh->pg_used = 0;
835 	ji->nh = nh;
836 
837 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
838 	DPRINTF(DP_OBJ, "ALLOC HOST %s %p", a, ji->nh);
839 	return (0);
840 }
841 
842 /*
843  * Finds free @pg index inside @nh
844  */
845 static NAT64NOINLINE int
846 find_nh_pg_idx(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, int *idx)
847 {
848 	int i;
849 
850 	for (i = 0; i < nh->pg_allocated; i++) {
851 		if (PORTGROUP_BYSIDX(cfg, nh, i + 1) == NULL) {
852 			*idx = i;
853 			return (0);
854 		}
855 	}
856 	return (1);
857 }
858 
859 static NAT64NOINLINE int
860 attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
861 {
862 	char a[INET6_ADDRSTRLEN];
863 	struct nat64lsn_host *nh;
864 
865 	I6HASH_FIND(cfg, nh, &ji->haddr);
866 	if (nh == NULL) {
867 		/* Add new host to list */
868 		nh = ji->nh;
869 		I6HASH_INSERT(cfg, nh);
870 		cfg->ihcount++;
871 		ji->nh = NULL;
872 
873 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
874 		DPRINTF(DP_OBJ, "ATTACH HOST %s %p", a, nh);
875 		/*
876 		 * Try to add portgroup.
877 		 * Note it will automatically set
878 		 * 'done' on ji if successful.
879 		 */
880 		if (attach_portgroup(cfg, ji) != 0) {
881 			DPRINTF(DP_DROPS, "%s %p failed to attach PG",
882 			    a, nh);
883 			NAT64STAT_INC(&cfg->base.stats, jportfails);
884 			return (1);
885 		}
886 		return (0);
887 	}
888 
889 	/*
890 	 * nh isn't NULL. This probably means we had several simultaneous
891 	 * host requests. The previous one request has already attached
892 	 * this host. Requeue attached mbuf and mark job as done, but
893 	 * leave nh and pg pointers not changed, so nat64lsn_do_request()
894 	 * will release all allocated resources.
895 	 */
896 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
897 	DPRINTF(DP_OBJ, "%s %p is already attached as %p",
898 	    a, ji->nh, nh);
899 	ji->done = 1;
900 	return (0);
901 }
902 
903 static NAT64NOINLINE int
904 find_pg_place_addr(const struct nat64lsn_cfg *cfg, int addr_off,
905     int nat_proto, uint16_t *aport, int *ppg_idx)
906 {
907 	int j, pg_idx;
908 
909 	pg_idx = addr_off * _ADDR_PG_COUNT +
910 	    (nat_proto - 1) * _ADDR_PG_PROTO_COUNT;
911 
912 	for (j = NAT64_MIN_CHUNK; j < _ADDR_PG_PROTO_COUNT; j++) {
913 		if (cfg->pg[pg_idx + j] != NULL)
914 			continue;
915 
916 		*aport = j * NAT64_CHUNK_SIZE;
917 		*ppg_idx = pg_idx + j;
918 		return (1);
919 	}
920 
921 	return (0);
922 }
923 
924 /*
925  * XXX: This function needs to be rewritten to
926  * use free bitmask for faster pg finding,
927  * additionally, it should take into consideration
928  * a) randomization and
929  * b) previous addresses allocated to given nat instance
930  *
931  */
932 static NAT64NOINLINE int
933 find_portgroup_place(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji,
934     uint32_t *aaddr, uint16_t *aport, int *ppg_idx)
935 {
936 	int i, nat_proto;
937 
938 	/*
939 	 * XXX: Use bitmask index to be able to find/check if IP address
940 	 * has some spare pg's
941 	 */
942 	nat_proto = ji->nat_proto;
943 
944 	/* First, try to use same address */
945 	if (ji->aaddr != 0) {
946 		i = ntohl(ji->aaddr) - cfg->prefix4;
947 		if (find_pg_place_addr(cfg, i, nat_proto, aport,
948 		    ppg_idx) != 0){
949 			/* Found! */
950 			*aaddr = htonl(cfg->prefix4 + i);
951 			return (0);
952 		}
953 	}
954 
955 	/* Next, try to use random address based on flow hash */
956 	i = ji->fhash % (1 << (32 - cfg->plen4));
957 	if (find_pg_place_addr(cfg, i, nat_proto, aport, ppg_idx) != 0) {
958 		/* Found! */
959 		*aaddr = htonl(cfg->prefix4 + i);
960 		return (0);
961 	}
962 
963 
964 	/* Last one: simply find ANY available */
965 	for (i = 0; i < (1 << (32 - cfg->plen4)); i++) {
966 		if (find_pg_place_addr(cfg, i, nat_proto, aport,
967 		    ppg_idx) != 0){
968 			/* Found! */
969 			*aaddr = htonl(cfg->prefix4 + i);
970 			return (0);
971 		}
972 	}
973 
974 	return (1);
975 }
976 
977 static NAT64NOINLINE int
978 attach_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
979 {
980 	char a[INET6_ADDRSTRLEN];
981 	struct nat64lsn_portgroup *pg;
982 	struct nat64lsn_host *nh;
983 	uint32_t aaddr;
984 	uint16_t aport;
985 	int nh_pg_idx, pg_idx;
986 
987 	pg = ji->pg;
988 
989 	/*
990 	 * Find source host and bind: we can't rely on
991 	 * pg->host
992 	 */
993 	I6HASH_FIND(cfg, nh, &ji->haddr);
994 	if (nh == NULL)
995 		return (1);
996 
997 	/* Find spare port chunk */
998 	if (find_portgroup_place(cfg, ji, &aaddr, &aport, &pg_idx) != 0) {
999 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1000 		DPRINTF(DP_OBJ | DP_DROPS, "empty PG not found for %s", a);
1001 		return (2);
1002 	}
1003 
1004 	/* Expand PG indexes if needed */
1005 	if (nh->pg_allocated < cfg->max_chunks && ji->spare_idx != NULL) {
1006 		PORTGROUP_CHUNK(nh, nh->pg_allocated / NAT64LSN_PGIDX_CHUNK) =
1007 		    ji->spare_idx;
1008 		nh->pg_allocated += NAT64LSN_PGIDX_CHUNK;
1009 		ji->spare_idx = NULL;
1010 	}
1011 
1012 	/* Find empty index to store PG in the @nh */
1013 	if (find_nh_pg_idx(cfg, nh, &nh_pg_idx) != 0) {
1014 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1015 		DPRINTF(DP_OBJ | DP_DROPS, "free PG index not found for %s",
1016 		    a);
1017 		return (3);
1018 	}
1019 
1020 	cfg->pg[pg_idx] = pg;
1021 	cfg->protochunks[pg->nat_proto]++;
1022 	NAT64STAT_INC(&cfg->base.stats, spgcreated);
1023 
1024 	pg->aaddr = aaddr;
1025 	pg->aport = aport;
1026 	pg->host = nh;
1027 	pg->idx = pg_idx;
1028 	SET_AGE(pg->timestamp);
1029 
1030 	PORTGROUP_BYSIDX(cfg, nh, nh_pg_idx + 1) = pg;
1031 	if (nh->pg_used == nh_pg_idx)
1032 		nh->pg_used++;
1033 	SET_AGE(nh->timestamp);
1034 
1035 	ji->pg = NULL;
1036 	ji->done = 1;
1037 
1038 	return (0);
1039 }
1040 
1041 static NAT64NOINLINE void
1042 consider_del_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1043 {
1044 	struct nat64lsn_host *nh, *nh_tmp;
1045 	struct nat64lsn_portgroup *pg, *pg_list[256];
1046 	int i, pg_lidx, idx;
1047 
1048 	/* Find source host */
1049 	I6HASH_FIND(cfg, nh, &ji->haddr);
1050 	if (nh == NULL || nh->pg_used == 0)
1051 		return;
1052 
1053 	memset(pg_list, 0, sizeof(pg_list));
1054 	pg_lidx = 0;
1055 
1056 	NAT64_LOCK(nh);
1057 
1058 	for (i = nh->pg_used - 1; i >= 0; i--) {
1059 		if ((ji->delmask[i / 64] & ((uint64_t)1 << (i % 64))) == 0)
1060 			continue;
1061 		pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
1062 
1063 		/* Check that PG isn't busy. */
1064 		if (stale_pg(cfg, pg) == 0)
1065 			continue;
1066 
1067 		/* DO delete */
1068 		pg_list[pg_lidx++] = pg;
1069 		PORTGROUP_BYSIDX(cfg, nh, i + 1) = NULL;
1070 
1071 		idx = _GET_PORTGROUP_IDX(cfg, ntohl(pg->aaddr), pg->nat_proto,
1072 		    pg->aport);
1073 		KASSERT(cfg->pg[idx] == pg, ("Non matched pg"));
1074 		cfg->pg[idx] = NULL;
1075 		cfg->protochunks[pg->nat_proto]--;
1076 		NAT64STAT_INC(&cfg->base.stats, spgdeleted);
1077 
1078 		/* Decrease pg_used */
1079 		while (nh->pg_used > 0 &&
1080 		    PORTGROUP_BYSIDX(cfg, nh, nh->pg_used) == NULL)
1081 			nh->pg_used--;
1082 
1083 		/* Check if on-stack buffer has ended */
1084 		if (pg_lidx == nitems(pg_list))
1085 			break;
1086 	}
1087 
1088 	NAT64_UNLOCK(nh);
1089 
1090 	if (stale_nh(cfg, nh)) {
1091 		I6HASH_REMOVE(cfg, nh, nh_tmp, &ji->haddr);
1092 		KASSERT(nh != NULL, ("Unable to find address"));
1093 		cfg->ihcount--;
1094 		ji->nh = nh;
1095 		I6HASH_FIND(cfg, nh, &ji->haddr);
1096 		KASSERT(nh == NULL, ("Failed to delete address"));
1097 	}
1098 
1099 	/* TODO: Delay freeing portgroups */
1100 	while (pg_lidx > 0) {
1101 		pg_lidx--;
1102 		NAT64STAT_INC(&cfg->base.stats, spgdeleted);
1103 		destroy_portgroup(pg_list[pg_lidx]);
1104 	}
1105 }
1106 
1107 /*
1108  * Main request handler.
1109  * Responsible for handling jqueue, e.g.
1110  * creating new hosts, addind/deleting portgroups.
1111  */
1112 static NAT64NOINLINE void
1113 nat64lsn_do_request(void *data)
1114 {
1115 	IPFW_RLOCK_TRACKER;
1116 	struct nat64lsn_job_head jhead;
1117 	struct nat64lsn_job_item *ji;
1118 	int jcount, nhsize;
1119 	struct nat64lsn_cfg *cfg = (struct nat64lsn_cfg *) data;
1120 	struct ip_fw_chain *ch;
1121 	int delcount;
1122 
1123 	CURVNET_SET(cfg->vp);
1124 
1125 	TAILQ_INIT(&jhead);
1126 
1127 	/* XXX: We're running unlocked here */
1128 
1129 	ch = cfg->ch;
1130 	delcount = 0;
1131 	IPFW_RLOCK(ch);
1132 
1133 	/* Grab queue */
1134 	JQUEUE_LOCK();
1135 	TAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item, next);
1136 	jcount = cfg->jlen;
1137 	cfg->jlen = 0;
1138 	JQUEUE_UNLOCK();
1139 
1140 	/* check if we need to resize hash */
1141 	nhsize = 0;
1142 	if (cfg->ihcount > cfg->ihsize && cfg->ihsize < 65536) {
1143 		nhsize = cfg->ihsize;
1144 		for ( ; cfg->ihcount > nhsize && nhsize < 65536; nhsize *= 2)
1145 			;
1146 	} else if (cfg->ihcount < cfg->ihsize * 4) {
1147 		nhsize = cfg->ihsize;
1148 		for ( ; cfg->ihcount < nhsize * 4 && nhsize > 32; nhsize /= 2)
1149 			;
1150 	}
1151 
1152 	IPFW_RUNLOCK(ch);
1153 
1154 	if (TAILQ_EMPTY(&jhead)) {
1155 		CURVNET_RESTORE();
1156 		return;
1157 	}
1158 
1159 	NAT64STAT_INC(&cfg->base.stats, jcalls);
1160 	DPRINTF(DP_JQUEUE, "count=%d", jcount);
1161 
1162 	/*
1163 	 * TODO:
1164 	 * What we should do here is to build a hash
1165 	 * to ensure we don't have lots of duplicate requests.
1166 	 * Skip this for now.
1167 	 *
1168 	 * TODO: Limit per-call number of items
1169 	 */
1170 
1171 	/* Pre-allocate everything for entire chain */
1172 	TAILQ_FOREACH(ji, &jhead,  next) {
1173 		switch (ji->jtype) {
1174 			case JTYPE_NEWHOST:
1175 				if (alloc_host6(cfg, ji) != 0)
1176 					NAT64STAT_INC(&cfg->base.stats,
1177 					    jhostfails);
1178 				break;
1179 			case JTYPE_NEWPORTGROUP:
1180 				if (alloc_portgroup(ji) != 0)
1181 					NAT64STAT_INC(&cfg->base.stats,
1182 					    jportfails);
1183 				break;
1184 			case JTYPE_DELPORTGROUP:
1185 				delcount += ji->delcount;
1186 				break;
1187 			default:
1188 				break;
1189 		}
1190 	}
1191 
1192 	/*
1193 	 * TODO: Alloc hew hash
1194 	 */
1195 	nhsize = 0;
1196 	if (nhsize > 0) {
1197 		/* XXX: */
1198 	}
1199 
1200 	/* Apply all changes in batch */
1201 	IPFW_UH_WLOCK(ch);
1202 	IPFW_WLOCK(ch);
1203 
1204 	TAILQ_FOREACH(ji, &jhead,  next) {
1205 		switch (ji->jtype) {
1206 			case JTYPE_NEWHOST:
1207 				if (ji->nh != NULL)
1208 					attach_host6(cfg, ji);
1209 				break;
1210 			case JTYPE_NEWPORTGROUP:
1211 				if (ji->pg != NULL &&
1212 				    attach_portgroup(cfg, ji) != 0)
1213 					NAT64STAT_INC(&cfg->base.stats,
1214 					    jportfails);
1215 				break;
1216 			case JTYPE_DELPORTGROUP:
1217 				consider_del_portgroup(cfg, ji);
1218 				break;
1219 		}
1220 	}
1221 
1222 	if (nhsize > 0) {
1223 		/* XXX: Move everything to new hash */
1224 	}
1225 
1226 	IPFW_WUNLOCK(ch);
1227 	IPFW_UH_WUNLOCK(ch);
1228 
1229 	/* Flush unused entries */
1230 	while (!TAILQ_EMPTY(&jhead)) {
1231 		ji = TAILQ_FIRST(&jhead);
1232 		TAILQ_REMOVE(&jhead, ji, next);
1233 		if (ji->nh != NULL)
1234 			destroy_host6(ji->nh);
1235 		if (ji->pg != NULL)
1236 			destroy_portgroup(ji->pg);
1237 		if (ji->m != NULL)
1238 			reinject_mbuf(cfg, ji);
1239 		if (ji->spare_idx != NULL)
1240 			uma_zfree(nat64lsn_pgidx_zone, ji->spare_idx);
1241 		free(ji, M_IPFW);
1242 	}
1243 	CURVNET_RESTORE();
1244 }
1245 
1246 static NAT64NOINLINE struct nat64lsn_job_item *
1247 nat64lsn_create_job(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
1248     int jtype)
1249 {
1250 	struct nat64lsn_job_item *ji;
1251 	struct in6_addr haddr;
1252 	uint8_t nat_proto;
1253 
1254 	/*
1255 	 * Do not try to lock possibly contested mutex if we're near the limit.
1256 	 * Drop packet instead.
1257 	 */
1258 	if (cfg->jlen >= cfg->jmaxlen) {
1259 		NAT64STAT_INC(&cfg->base.stats, jmaxlen);
1260 		return (NULL);
1261 	}
1262 
1263 	memset(&haddr, 0, sizeof(haddr));
1264 	nat_proto = 0;
1265 	if (f_id != NULL) {
1266 		haddr = f_id->src_ip6;
1267 		nat_proto = nat64lsn_proto_map[f_id->proto];
1268 
1269 		DPRINTF(DP_JQUEUE, "REQUEST pg nat_proto %d on proto %d",
1270 		    nat_proto, f_id->proto);
1271 
1272 		if (nat_proto == 0)
1273 			return (NULL);
1274 	}
1275 
1276 	ji = malloc(sizeof(struct nat64lsn_job_item), M_IPFW,
1277 	    M_NOWAIT | M_ZERO);
1278 
1279 	if (ji == NULL) {
1280 		NAT64STAT_INC(&cfg->base.stats, jnomem);
1281 		return (NULL);
1282 	}
1283 
1284 	ji->jtype = jtype;
1285 
1286 	if (f_id != NULL) {
1287 		ji->f_id = *f_id;
1288 		ji->haddr = haddr;
1289 		ji->nat_proto = nat_proto;
1290 	}
1291 
1292 	return (ji);
1293 }
1294 
1295 static NAT64NOINLINE void
1296 nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1297 {
1298 
1299 	if (ji == NULL)
1300 		return;
1301 
1302 	JQUEUE_LOCK();
1303 	TAILQ_INSERT_TAIL(&cfg->jhead, ji, next);
1304 	cfg->jlen++;
1305 	NAT64STAT_INC(&cfg->base.stats, jrequests);
1306 
1307 	if (callout_pending(&cfg->jcallout) == 0)
1308 		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
1309 	JQUEUE_UNLOCK();
1310 }
1311 
1312 static NAT64NOINLINE void
1313 nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
1314     struct nat64lsn_job_head *jhead, int jlen)
1315 {
1316 
1317 	if (TAILQ_EMPTY(jhead))
1318 		return;
1319 
1320 	/* Attach current queue to execution one */
1321 	JQUEUE_LOCK();
1322 	TAILQ_CONCAT(&cfg->jhead, jhead, next);
1323 	cfg->jlen += jlen;
1324 	NAT64STAT_ADD(&cfg->base.stats, jrequests, jlen);
1325 
1326 	if (callout_pending(&cfg->jcallout) == 0)
1327 		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
1328 	JQUEUE_UNLOCK();
1329 }
1330 
1331 static unsigned int
1332 flow6_hash(const struct ipfw_flow_id *f_id)
1333 {
1334 	unsigned char hbuf[36];
1335 
1336 	memcpy(hbuf, &f_id->dst_ip6, 16);
1337 	memcpy(&hbuf[16], &f_id->src_ip6, 16);
1338 	memcpy(&hbuf[32], &f_id->dst_port, 2);
1339 	memcpy(&hbuf[32], &f_id->src_port, 2);
1340 
1341 	return (djb_hash(hbuf, sizeof(hbuf)));
1342 }
1343 
1344 static NAT64NOINLINE int
1345 nat64lsn_request_host(struct nat64lsn_cfg *cfg,
1346     const struct ipfw_flow_id *f_id, struct mbuf **pm)
1347 {
1348 	struct nat64lsn_job_item *ji;
1349 	struct mbuf *m;
1350 
1351 	m = *pm;
1352 	*pm = NULL;
1353 
1354 	ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWHOST);
1355 	if (ji == NULL) {
1356 		m_freem(m);
1357 		NAT64STAT_INC(&cfg->base.stats, dropped);
1358 		DPRINTF(DP_DROPS, "failed to create job");
1359 	} else {
1360 		ji->m = m;
1361 		/* Provide pseudo-random value based on flow */
1362 		ji->fhash = flow6_hash(f_id);
1363 		nat64lsn_enqueue_job(cfg, ji);
1364 		NAT64STAT_INC(&cfg->base.stats, jhostsreq);
1365 	}
1366 
1367 	return (IP_FW_DENY);
1368 }
1369 
1370 static NAT64NOINLINE int
1371 nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
1372     const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
1373     int needs_idx)
1374 {
1375 	struct nat64lsn_job_item *ji;
1376 	struct mbuf *m;
1377 
1378 	m = *pm;
1379 	*pm = NULL;
1380 
1381 	ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWPORTGROUP);
1382 	if (ji == NULL) {
1383 		m_freem(m);
1384 		NAT64STAT_INC(&cfg->base.stats, dropped);
1385 		DPRINTF(DP_DROPS, "failed to create job");
1386 	} else {
1387 		ji->m = m;
1388 		/* Provide pseudo-random value based on flow */
1389 		ji->fhash = flow6_hash(f_id);
1390 		ji->aaddr = aaddr;
1391 		ji->needs_idx = needs_idx;
1392 		nat64lsn_enqueue_job(cfg, ji);
1393 		NAT64STAT_INC(&cfg->base.stats, jportreq);
1394 	}
1395 
1396 	return (IP_FW_DENY);
1397 }
1398 
1399 static NAT64NOINLINE struct nat64lsn_state *
1400 nat64lsn_create_state(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh,
1401     int nat_proto, struct nat64lsn_state *kst, uint32_t *aaddr)
1402 {
1403 	struct nat64lsn_portgroup *pg;
1404 	struct nat64lsn_state *st;
1405 	int i, hval, off;
1406 
1407 	/* XXX: create additional bitmask for selecting proper portgroup */
1408 	for (i = 0; i < nh->pg_used; i++) {
1409 		pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
1410 		if (pg == NULL)
1411 			continue;
1412 		if (*aaddr == 0)
1413 			*aaddr = pg->aaddr;
1414 		if (pg->nat_proto != nat_proto)
1415 			continue;
1416 
1417 		off = PG_GET_FREE_IDX(pg);
1418 		if (off != 0) {
1419 			/* We have found spare state. Use it */
1420 			off--;
1421 			PG_MARK_BUSY_IDX(pg, off);
1422 			st = &pg->states[off];
1423 
1424 			/*
1425 			 * Fill in new info. Assume state was zeroed.
1426 			 * Timestamp and flags will be filled by caller.
1427 			 */
1428 			st->u.s = kst->u.s;
1429 			st->cur.idx = i + 1;
1430 			st->cur.off = off;
1431 
1432 			/* Insert into host hash table */
1433 			hval = HASH_IN4(&st->u.hkey) & (nh->hsize - 1);
1434 			st->next = nh->phash[hval];
1435 			nh->phash[hval] = st->cur;
1436 
1437 			nat64lsn_dump_state(cfg, pg, st, "ALLOC STATE", off);
1438 
1439 			NAT64STAT_INC(&cfg->base.stats, screated);
1440 
1441 			return (st);
1442 		}
1443 		/* Saev last used alias affress */
1444 		*aaddr = pg->aaddr;
1445 	}
1446 
1447 	return (NULL);
1448 }
1449 
1450 static NAT64NOINLINE int
1451 nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,
1452     struct mbuf **pm)
1453 {
1454 	struct pfloghdr loghdr, *logdata;
1455 	char a[INET6_ADDRSTRLEN];
1456 	struct nat64lsn_host *nh;
1457 	struct st_ptr sidx;
1458 	struct nat64lsn_state *st, kst;
1459 	struct nat64lsn_portgroup *pg;
1460 	struct icmp6_hdr *icmp6;
1461 	uint32_t aaddr;
1462 	int action, hval, nat_proto, proto;
1463 	uint16_t aport, state_ts, state_flags;
1464 
1465 	/* Check if af/protocol is supported and get it short id */
1466 	nat_proto = nat64lsn_proto_map[f_id->proto];
1467 	if (nat_proto == 0) {
1468 		/*
1469 		 * Since we can be called from jobs handler, we need
1470 		 * to free mbuf by self, do not leave this task to
1471 		 * ipfw_check_packet().
1472 		 */
1473 		NAT64STAT_INC(&cfg->base.stats, noproto);
1474 		goto drop;
1475 	}
1476 
1477 	/* Try to find host first */
1478 	I6HASH_FIND(cfg, nh, &f_id->src_ip6);
1479 
1480 	if (nh == NULL)
1481 		return (nat64lsn_request_host(cfg, f_id, pm));
1482 
1483 	/* Fill-in on-stack state structure */
1484 	kst.u.s.faddr = nat64_extract_ip4(&cfg->base, &f_id->dst_ip6);
1485 	if (kst.u.s.faddr == 0) {
1486 		NAT64STAT_INC(&cfg->base.stats, dropped);
1487 		goto drop;
1488 	}
1489 	kst.u.s.fport = f_id->dst_port;
1490 	kst.u.s.lport = f_id->src_port;
1491 
1492 	/* Prepare some fields we might need to update */
1493 	hval = 0;
1494 	proto = nat64_getlasthdr(*pm, &hval);
1495 	if (proto < 0) {
1496 		NAT64STAT_INC(&cfg->base.stats, dropped);
1497 		DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious");
1498 		goto drop;
1499 	}
1500 
1501 	SET_AGE(state_ts);
1502 	if (proto == IPPROTO_TCP)
1503 		state_flags = convert_tcp_flags(
1504 		    TCP(mtodo(*pm, hval))->th_flags);
1505 	else
1506 		state_flags = 0;
1507 	if (proto == IPPROTO_ICMPV6) {
1508 		/* Alter local port data */
1509 		icmp6 = mtodo(*pm, hval);
1510 		if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||
1511 		    icmp6->icmp6_type == ICMP6_ECHO_REPLY)
1512 			kst.u.s.lport = ntohs(icmp6->icmp6_id);
1513 	}
1514 
1515 	hval = HASH_IN4(&kst.u.hkey) & (nh->hsize - 1);
1516 	pg = NULL;
1517 	st = NULL;
1518 
1519 	/* OK, let's find state in host hash */
1520 	NAT64_LOCK(nh);
1521 	sidx = nh->phash[hval];
1522 	int k = 0;
1523 	while (sidx.idx != 0) {
1524 		pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
1525 		st = &pg->states[sidx.off];
1526 		//DPRINTF("SISX: %d/%d next: %d/%d", sidx.idx, sidx.off,
1527 		//st->next.idx, st->next.off);
1528 		if (st->u.hkey == kst.u.hkey && pg->nat_proto == nat_proto)
1529 			break;
1530 		if (k++ > 1000) {
1531 			DPRINTF(DP_ALL, "XXX: too long %d/%d %d/%d\n",
1532 			    sidx.idx, sidx.off, st->next.idx, st->next.off);
1533 			DPRINTF(DP_GENERIC, "TR host %s %p on cpu %d",
1534 			    inet_ntop(AF_INET6, &nh->addr, a, sizeof(a)),
1535 			    nh, curcpu);
1536 			k = 0;
1537 		}
1538 		sidx = st->next;
1539 	}
1540 
1541 	if (sidx.idx == 0) {
1542 		aaddr = 0;
1543 		st = nat64lsn_create_state(cfg, nh, nat_proto, &kst, &aaddr);
1544 		if (st == NULL) {
1545 			/* No free states. Request more if we can */
1546 			if (nh->pg_used >= cfg->max_chunks) {
1547 				/* Limit reached */
1548 				DPRINTF(DP_DROPS, "PG limit reached "
1549 				    " for host %s (used %u, allocated %u, "
1550 				    "limit %u)", inet_ntop(AF_INET6,
1551 				    &nh->addr, a, sizeof(a)),
1552 				    nh->pg_used * NAT64_CHUNK_SIZE,
1553 				    nh->pg_allocated * NAT64_CHUNK_SIZE,
1554 				    cfg->max_chunks * NAT64_CHUNK_SIZE);
1555 				NAT64_UNLOCK(nh);
1556 				NAT64STAT_INC(&cfg->base.stats, dropped);
1557 				goto drop;
1558 			}
1559 			if ((nh->pg_allocated <=
1560 			    nh->pg_used + NAT64LSN_REMAININGPG) &&
1561 			    nh->pg_allocated < cfg->max_chunks)
1562 				action = 1; /* Request new indexes */
1563 			else
1564 				action = 0;
1565 			NAT64_UNLOCK(nh);
1566 			//DPRINTF("No state, unlock for %p", nh);
1567 			return (nat64lsn_request_portgroup(cfg, f_id,
1568 			    pm, aaddr, action));
1569 		}
1570 
1571 		/* We've got new state. */
1572 		sidx = st->cur;
1573 		pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
1574 	}
1575 
1576 	/* Okay, state found */
1577 
1578 	/* Update necessary fileds */
1579 	if (st->timestamp != state_ts)
1580 		st->timestamp = state_ts;
1581 	if ((st->flags & state_flags) != 0)
1582 		st->flags |= state_flags;
1583 
1584 	/* Copy needed state data */
1585 	aaddr = pg->aaddr;
1586 	aport = htons(pg->aport + sidx.off);
1587 
1588 	NAT64_UNLOCK(nh);
1589 
1590 	if (cfg->base.flags & NAT64_LOG) {
1591 		logdata = &loghdr;
1592 		nat64lsn_log(logdata, *pm, AF_INET6, pg->idx, st->cur.off);
1593 	} else
1594 		logdata = NULL;
1595 
1596 	action = nat64_do_handle_ip6(*pm, aaddr, aport, &cfg->base, logdata);
1597 	if (action == NAT64SKIP)
1598 		return (cfg->nomatch_verdict);
1599 	if (action == NAT64MFREE) {
1600 drop:
1601 		m_freem(*pm);
1602 	}
1603 	*pm = NULL;	/* mark mbuf as consumed */
1604 	return (IP_FW_DENY);
1605 }
1606 
1607 /*
1608  * Main dataplane entry point.
1609  */
1610 int
1611 ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
1612     ipfw_insn *cmd, int *done)
1613 {
1614 	ipfw_insn *icmd;
1615 	struct nat64lsn_cfg *cfg;
1616 	int ret;
1617 
1618 	IPFW_RLOCK_ASSERT(ch);
1619 
1620 	*done = 1; /* terminate the search */
1621 	icmd = cmd + 1;
1622 	if (cmd->opcode != O_EXTERNAL_ACTION ||
1623 	    cmd->arg1 != V_nat64lsn_eid ||
1624 	    icmd->opcode != O_EXTERNAL_INSTANCE ||
1625 	    (cfg = NAT64_LOOKUP(ch, icmd)) == NULL)
1626 		return (0);
1627 
1628 	switch (args->f_id.addr_type) {
1629 	case 4:
1630 		ret = nat64lsn_translate4(cfg, &args->f_id, &args->m);
1631 		break;
1632 	case 6:
1633 		ret = nat64lsn_translate6(cfg, &args->f_id, &args->m);
1634 		break;
1635 	default:
1636 		return (cfg->nomatch_verdict);
1637 	}
1638 	return (ret);
1639 }
1640 
1641 static int
1642 nat64lsn_ctor_host(void *mem, int size, void *arg, int flags)
1643 {
1644 	struct nat64lsn_host *nh;
1645 
1646 	nh = (struct nat64lsn_host *)mem;
1647 	memset(nh->pg_ptr, 0, sizeof(nh->pg_ptr));
1648 	memset(nh->phash, 0, sizeof(nh->phash));
1649 	return (0);
1650 }
1651 
/*
 * Item constructor of the portgroup indexes zone: zeroes the first
 * `size' bytes of the item.  Always returns 0.
 */
static int
nat64lsn_ctor_pgidx(void *mem, int size, void *arg, int flags)
{
	size_t len;

	len = size;
	memset(mem, 0, len);
	return (0);
}
1659 
1660 void
1661 nat64lsn_init_internal(void)
1662 {
1663 
1664 	memset(nat64lsn_proto_map, 0, sizeof(nat64lsn_proto_map));
1665 	/* Set up supported protocol map */
1666 	nat64lsn_proto_map[IPPROTO_TCP] = NAT_PROTO_TCP;
1667 	nat64lsn_proto_map[IPPROTO_UDP] = NAT_PROTO_UDP;
1668 	nat64lsn_proto_map[IPPROTO_ICMP] = NAT_PROTO_ICMP;
1669 	nat64lsn_proto_map[IPPROTO_ICMPV6] = NAT_PROTO_ICMP;
1670 	/* Fill in reverse proto map */
1671 	memset(nat64lsn_rproto_map, 0, sizeof(nat64lsn_rproto_map));
1672 	nat64lsn_rproto_map[NAT_PROTO_TCP] = IPPROTO_TCP;
1673 	nat64lsn_rproto_map[NAT_PROTO_UDP] = IPPROTO_UDP;
1674 	nat64lsn_rproto_map[NAT_PROTO_ICMP] = IPPROTO_ICMPV6;
1675 
1676 	JQUEUE_LOCK_INIT();
1677 	nat64lsn_host_zone = uma_zcreate("NAT64 hosts zone",
1678 	    sizeof(struct nat64lsn_host), nat64lsn_ctor_host, NULL,
1679 	    NULL, NULL, UMA_ALIGN_PTR, 0);
1680 	nat64lsn_pg_zone = uma_zcreate("NAT64 portgroups zone",
1681 	    sizeof(struct nat64lsn_portgroup), NULL, NULL, NULL, NULL,
1682 	    UMA_ALIGN_PTR, 0);
1683 	nat64lsn_pgidx_zone = uma_zcreate("NAT64 portgroup indexes zone",
1684 	    sizeof(struct nat64lsn_portgroup *) * NAT64LSN_PGIDX_CHUNK,
1685 	    nat64lsn_ctor_pgidx, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1686 }
1687 
1688 void
1689 nat64lsn_uninit_internal(void)
1690 {
1691 
1692 	JQUEUE_LOCK_DESTROY();
1693 	uma_zdestroy(nat64lsn_host_zone);
1694 	uma_zdestroy(nat64lsn_pg_zone);
1695 	uma_zdestroy(nat64lsn_pgidx_zone);
1696 }
1697 
1698 void
1699 nat64lsn_start_instance(struct nat64lsn_cfg *cfg)
1700 {
1701 
1702 	callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,
1703 	    nat64lsn_periodic, cfg);
1704 }
1705 
1706 struct nat64lsn_cfg *
1707 nat64lsn_init_instance(struct ip_fw_chain *ch, size_t numaddr)
1708 {
1709 	struct nat64lsn_cfg *cfg;
1710 
1711 	cfg = malloc(sizeof(struct nat64lsn_cfg), M_IPFW, M_WAITOK | M_ZERO);
1712 	TAILQ_INIT(&cfg->jhead);
1713 	cfg->vp = curvnet;
1714 	cfg->ch = ch;
1715 	COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK);
1716 
1717 	cfg->ihsize = NAT64LSN_HSIZE;
1718 	cfg->ih = malloc(sizeof(void *) * cfg->ihsize, M_IPFW,
1719 	    M_WAITOK | M_ZERO);
1720 
1721 	cfg->pg = malloc(sizeof(void *) * numaddr * _ADDR_PG_COUNT, M_IPFW,
1722 	    M_WAITOK | M_ZERO);
1723 
1724         callout_init(&cfg->periodic, CALLOUT_MPSAFE);
1725         callout_init(&cfg->jcallout, CALLOUT_MPSAFE);
1726 
1727 	return (cfg);
1728 }
1729 
1730 /*
1731  * Destroy all hosts callback.
1732  * Called on module unload when all activity already finished, so
1733  * can work without any locks.
1734  */
1735 static NAT64NOINLINE int
1736 nat64lsn_destroy_host(struct nat64lsn_host *nh, struct nat64lsn_cfg *cfg)
1737 {
1738 	struct nat64lsn_portgroup *pg;
1739 	int i;
1740 
1741 	for (i = nh->pg_used; i > 0; i--) {
1742 		pg = PORTGROUP_BYSIDX(cfg, nh, i);
1743 		if (pg == NULL)
1744 			continue;
1745 		cfg->pg[pg->idx] = NULL;
1746 		destroy_portgroup(pg);
1747 		nh->pg_used--;
1748 	}
1749 	destroy_host6(nh);
1750 	cfg->ihcount--;
1751 	return (0);
1752 }
1753 
1754 void
1755 nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg)
1756 {
1757 	struct nat64lsn_host *nh, *tmp;
1758 
1759 	callout_drain(&cfg->jcallout);
1760 	callout_drain(&cfg->periodic);
1761 	I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_destroy_host, cfg);
1762 	DPRINTF(DP_OBJ, "instance %s: hosts %d", cfg->name, cfg->ihcount);
1763 
1764 	COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS);
1765 	free(cfg->ih, M_IPFW);
1766 	free(cfg->pg, M_IPFW);
1767 	free(cfg, M_IPFW);
1768 }
1769 
1770