xref: /freebsd/sys/netpfil/ipfw/nat64/nat64lsn.c (revision 4657548d18877f64bd02be888406aa5b02bf9b06)
1 /*-
2  * Copyright (c) 2015-2016 Yandex LLC
3  * Copyright (c) 2015 Alexander V. Chernikov <melifaro@FreeBSD.org>
4  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/counter.h>
35 #include <sys/errno.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/module.h>
41 #include <sys/rmlock.h>
42 #include <sys/rwlock.h>
43 #include <sys/socket.h>
44 #include <sys/queue.h>
45 #include <sys/syslog.h>
46 #include <sys/sysctl.h>
47 
48 #include <net/if.h>
49 #include <net/if_var.h>
50 #include <net/if_pflog.h>
51 #include <net/pfil.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip_var.h>
56 #include <netinet/ip_fw.h>
57 #include <netinet/ip6.h>
58 #include <netinet/icmp6.h>
59 #include <netinet/ip_icmp.h>
60 #include <netinet/tcp.h>
61 #include <netinet/udp.h>
62 #include <netinet6/in6_var.h>
63 #include <netinet6/ip6_var.h>
64 #include <netinet6/ip_fw_nat64.h>
65 
66 #include <netpfil/ipfw/ip_fw_private.h>
67 #include <netpfil/ipfw/nat64/ip_fw_nat64.h>
68 #include <netpfil/ipfw/nat64/nat64lsn.h>
69 #include <netpfil/ipfw/nat64/nat64_translate.h>
70 #include <netpfil/pf/pf.h>
71 
/* malloc(9) type used for NAT64LSN allocations. */
72 MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");
73 
74 static void nat64lsn_periodic(void *data);
/* Multiplied by hz when nat64lsn_periodic() re-arms its callout. */
75 #define	PERIODIC_DELAY	4
/*
 * Indexed by IP protocol number (see nat64lsn_translate4()); yields the
 * short NAT_PROTO_* id, 0 meaning the protocol is not supported.
 */
76 static uint8_t nat64lsn_proto_map[256];
/*
 * NOTE(review): presumably the reverse map (NAT_PROTO_* id -> IP protocol
 * number); it is not referenced in this part of the file - confirm.
 */
77 uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
78 
/*
 * Per-state flag bits.  convert_tcp_flags() produces the three TCP bits
 * from a TCP header; nat64lsn_periodic_check_tcp() uses them to pick the
 * state ttl.
 */
79 #define	NAT64_FLAG_FIN		0x01	/* FIN was seen */
80 #define	NAT64_FLAG_SYN		0x02	/* First syn in->out */
81 #define	NAT64_FLAG_ESTAB	0x04	/* Packet with Ack */
82 #define	NAT64_FLAGS_TCP	(NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)
83 
/* States carrying NAT64_FLAG_RDR are never expired by the periodic check. */
84 #define	NAT64_FLAG_RDR		0x80	/* Port redirect */
/* Resolve the nat64lsn instance referenced by an ipfw opcode's arg1. */
85 #define	NAT64_LOOKUP(chain, cmd)	\
86 	(struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
87 /*
88  * Delayed job queue, used to create new hosts
89  * and new portgroups.  JTYPE_DELPORTGROUP jobs are queued by
 * nat64lsn_periodic_chkhost() to remove unused portgroups.
90  */
91 enum nat64lsn_jtype {
92 	JTYPE_NEWHOST = 1,
93 	JTYPE_NEWPORTGROUP,
94 	JTYPE_DELPORTGROUP,
95 };
96 
/*
 * One queued request.  The item carries the request parameters as well as
 * the objects allocated for (or removed by) the request:
 *
 *  next       - job queue linkage.
 *  nh         - host allocated by alloc_host6(); attach_host6() clears it
 *               once the host is inserted into the hash.
 *               consider_del_portgroup() stores a removed host here.
 *  pg         - portgroup allocated by alloc_portgroup();
 *               attach_portgroup() clears it once the portgroup is linked.
 *  spare_idx  - optional extra portgroup index chunk, allocated by
 *               alloc_portgroup() when needs_idx is set and consumed by
 *               attach_portgroup().
 *  haddr      - IPv6 address of the host the job refers to.
 *  nat_proto  - NAT_PROTO_* id for a newly allocated portgroup.
 *  done       - set to 1 when the request has been satisfied;
 *               reinject_mbuf() drops the packet when it is 0.
 *  delcount, delmask - number and bitmask of host portgroup slots to
 *               delete; bit i refers to PORTGROUP_BYSIDX(cfg, nh, i + 1).
 *  m, f_id    - packet waiting for the request and its flow id.
 */
97 struct nat64lsn_job_item {
98 	TAILQ_ENTRY(nat64lsn_job_item)	next;
99 	enum nat64lsn_jtype	jtype;
100 	struct nat64lsn_host	*nh;
101 	struct nat64lsn_portgroup	*pg;
102 	void			*spare_idx;
103 	struct in6_addr		haddr;
104 	uint8_t			nat_proto;
105 	uint8_t			done;
106 	int			needs_idx;
107 	int			delcount;
108 	unsigned int		fhash;	/* Flow hash */
109 	uint32_t		aaddr;	/* Last used address (net) */
110 	struct mbuf		*m;
111 	struct ipfw_flow_id	f_id;
112 	uint64_t		delmask[NAT64LSN_PGPTRNMASK];
113 };
114 
/*
 * File-scope mutex protecting the job queue; nat64lsn_do_request() holds
 * it while it takes over cfg->jhead and resets cfg->jlen.
 */
114 static struct mtx jmtx;
115 #define	JQUEUE_LOCK_INIT()	mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
116 #define	JQUEUE_LOCK_DESTROY()	mtx_destroy(&jmtx)
117 #define	JQUEUE_LOCK()		mtx_lock(&jmtx)
118 #define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)
120 
/* Job queue submission. */
121 static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
122     struct nat64lsn_job_item *ji);
123 static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
124     struct nat64lsn_job_head *jhead, int jlen);
125 
/* Job creation, host/portgroup requests and the two translation paths. */
126 static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg,
127     const struct ipfw_flow_id *f_id, int jtype);
128 static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
129     const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
130     int needs_idx);
131 static int nat64lsn_request_host(struct nat64lsn_cfg *cfg,
132     const struct ipfw_flow_id *f_id, struct mbuf **pm);
133 static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
134     const struct ipfw_flow_id *f_id, struct mbuf **pm);
135 static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
136     struct ipfw_flow_id *f_id, struct mbuf **pm);
137 
/* Allocation and release of host and portgroup objects. */
138 static int alloc_portgroup(struct nat64lsn_job_item *ji);
139 static void destroy_portgroup(struct nat64lsn_portgroup *pg);
140 static void destroy_host6(struct nat64lsn_host *nh);
141 static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
142 
/* Linking of allocated objects into the instance. */
143 static int attach_portgroup(struct nat64lsn_cfg *cfg,
144     struct nat64lsn_job_item *ji);
145 static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
146 
147 
/*
 * UMA zones backing hosts, portgroups and portgroup index chunks
 * (see alloc_host6(), alloc_portgroup() and their destroy counterparts).
 */
148 /* XXX tmp */
149 static uma_zone_t nat64lsn_host_zone;
150 static uma_zone_t nat64lsn_pg_zone;
151 static uma_zone_t nat64lsn_pgidx_zone;
152 
153 static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg,
154     struct nat64lsn_host *nh);
155 
/*
 * Glue for the generic CHT_* hash table macros (defined elsewhere), used
 * for the table of hosts: entries are hashed with djb_hash() over the
 * 16 bytes of the IPv6 address, chained through ->next and compared with
 * IN6_ARE_ADDR_EQUAL().  I6_lock()/I6_unlock() expand to nothing.
 */
156 #define	I6_hash(x)		(djb_hash((const unsigned char *)(x), 16))
157 #define	I6_first(_ph, h)	(_ph)[h]
158 #define	I6_next(x)		(x)->next
159 #define	I6_val(x)		(&(x)->addr)
160 #define	I6_cmp(a, b)		IN6_ARE_ADDR_EQUAL(a, b)
161 #define	I6_lock(a, b)
162 #define	I6_unlock(a, b)
163 
/* Host table operations on cfg->ih (cfg->ihsize buckets). */
164 #define	I6HASH_FIND(_cfg, _res, _a) \
165 	CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a)
166 #define	I6HASH_INSERT(_cfg, _i)	\
167 	CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i)
168 #define	I6HASH_REMOVE(_cfg, _res, _tmp, _a)	\
169 	CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a)
170 
171 #define	I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg)	\
172 	CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg)
173 
/* djb_hash() over 8 bytes starting at x. */
174 #define	HASH_IN4(x)	djb_hash((const unsigned char *)(x), 8)
175 
/*
 * Hash @len bytes starting at @h.  For every byte the accumulator is
 * multiplied by 33 and then XORed with the byte.  The accumulator starts
 * at zero, so an empty (or non-positive length) input hashes to 0.
 */
static unsigned
djb_hash(const unsigned char *h, const int len)
{
	unsigned int acc;
	int left;

	acc = 0;
	for (left = len; left > 0; left--)
		acc = (33 * acc) ^ *h++;

	return (acc);
}
187 
188 /*
189 static size_t
190 bitmask_size(size_t num, int *level)
191 {
192 	size_t x;
193 	int c;
194 
195 	for (c = 0, x = num; num > 1; num /= 64, c++)
196 		;
197 
198 	return (x);
199 }
200 
201 static void
202 bitmask_prepare(uint64_t *pmask, size_t bufsize, int level)
203 {
204 	size_t x, z;
205 
206 	memset(pmask, 0xFF, bufsize);
207 	for (x = 0, z = 1; level > 1; x += z, z *= 64, level--)
208 		;
209 	pmask[x] ~= 0x01;
210 }
211 */
212 
213 static void
214 nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
215     uint32_t n, uint32_t sn)
216 {
217 
218 	memset(plog, 0, sizeof(*plog));
219 	plog->length = PFLOG_REAL_HDRLEN;
220 	plog->af = family;
221 	plog->action = PF_NAT;
222 	plog->dir = PF_IN;
223 	plog->rulenr = htonl(n);
224 	plog->subrulenr = htonl(sn);
225 	plog->ruleset[0] = '\0';
226 	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
227 	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
228 }
229 /*
230  * Inspects icmp packets to see if the message contains different
231  * packet header so we need to alter @addr and @port.
232  */
233 static int
234 inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr,
235     uint16_t *port)
236 {
237 	struct ip *ip;
238 	struct tcphdr *tcp;
239 	struct udphdr *udp;
240 	struct icmphdr *icmp;
241 	int off;
242 	uint8_t proto;
243 
244 	ip = mtod(*m, struct ip *); /* Outer IP header */
245 	off = (ip->ip_hl << 2) + ICMP_MINLEN;
246 	if ((*m)->m_len < off)
247 		*m = m_pullup(*m, off);
248 	if (*m == NULL)
249 		return (ENOMEM);
250 
251 	ip = mtod(*m, struct ip *); /* Outer IP header */
252 	icmp = L3HDR(ip, struct icmphdr *);
253 	switch (icmp->icmp_type) {
254 	case ICMP_ECHO:
255 	case ICMP_ECHOREPLY:
256 		/* Use icmp ID as distinguisher */
257 		*port = ntohs(*((uint16_t *)(icmp + 1)));
258 		return (0);
259 	case ICMP_UNREACH:
260 	case ICMP_TIMXCEED:
261 		break;
262 	default:
263 		return (EOPNOTSUPP);
264 	}
265 	/*
266 	 * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits
267 	 * of ULP header.
268 	 */
269 	if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
270 		return (EINVAL);
271 	if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
272 		*m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN);
273 	if (*m == NULL)
274 		return (ENOMEM);
275 	ip = mtodo(*m, off); /* Inner IP header */
276 	proto = ip->ip_p;
277 	off += ip->ip_hl << 2; /* Skip inner IP header */
278 	*addr = ntohl(ip->ip_src.s_addr);
279 	if ((*m)->m_len < off + ICMP_MINLEN)
280 		*m = m_pullup(*m, off + ICMP_MINLEN);
281 	if (*m == NULL)
282 		return (ENOMEM);
283 	switch (proto) {
284 	case IPPROTO_TCP:
285 		tcp = mtodo(*m, off);
286 		*nat_proto = NAT_PROTO_TCP;
287 		*port = ntohs(tcp->th_sport);
288 		return (0);
289 	case IPPROTO_UDP:
290 		udp = mtodo(*m, off);
291 		*nat_proto = NAT_PROTO_UDP;
292 		*port = ntohs(udp->uh_sport);
293 		return (0);
294 	case IPPROTO_ICMP:
295 		/*
296 		 * We will translate only ICMP errors for our ICMP
297 		 * echo requests.
298 		 */
299 		icmp = mtodo(*m, off);
300 		if (icmp->icmp_type != ICMP_ECHO)
301 			return (EOPNOTSUPP);
302 		*port = ntohs(*((uint16_t *)(icmp + 1)));
303 		return (0);
304 	};
305 	return (EOPNOTSUPP);
306 }
307 
/*
 * Convert TCP header flags into state flag bits.
 * The FIN and SYN bits are passed through unchanged; the RST and ACK bits
 * are shifted right by two so that RST is treated as FIN and ACK as
 * "established".  All other header flags are ignored.
 */
static inline uint8_t
convert_tcp_flags(uint8_t flags)
{
	uint8_t direct, shifted;

	direct = flags & (TH_FIN | TH_SYN);
	/* Treat RST as FIN and ACK as estab */
	shifted = (flags & (TH_RST | TH_ACK)) >> 2;

	return (direct | shifted);
}
319 
320 static NAT64NOINLINE int
321 nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
322     struct mbuf **pm)
323 {
324 	struct pfloghdr loghdr, *logdata;
325 	struct in6_addr src6;
326 	struct nat64lsn_portgroup *pg;
327 	struct nat64lsn_host *nh;
328 	struct nat64lsn_state *st;
329 	struct ip *ip;
330 	uint32_t addr;
331 	uint16_t state_flags, state_ts;
332 	uint16_t port, lport;
333 	uint8_t nat_proto;
334 	int ret;
335 
336 	addr = f_id->dst_ip;
337 	port = f_id->dst_port;
338 	if (addr < cfg->prefix4 || addr > cfg->pmask4) {
339 		NAT64STAT_INC(&cfg->stats, nomatch4);
340 		return (cfg->nomatch_verdict);
341 	}
342 
343 	/* Check if protocol is supported and get its short id */
344 	nat_proto = nat64lsn_proto_map[f_id->proto];
345 	if (nat_proto == 0) {
346 		NAT64STAT_INC(&cfg->stats, noproto);
347 		return (cfg->nomatch_verdict);
348 	}
349 
350 	/* We might need to handle icmp differently */
351 	if (nat_proto == NAT_PROTO_ICMP) {
352 		ret = inspect_icmp_mbuf(pm, &nat_proto, &addr, &port);
353 		if (ret != 0) {
354 			if (ret == ENOMEM)
355 				NAT64STAT_INC(&cfg->stats, nomem);
356 			else
357 				NAT64STAT_INC(&cfg->stats, noproto);
358 			return (cfg->nomatch_verdict);
359 		}
360 		/* XXX: Check addr for validity */
361 		if (addr < cfg->prefix4 || addr > cfg->pmask4) {
362 			NAT64STAT_INC(&cfg->stats, nomatch4);
363 			return (cfg->nomatch_verdict);
364 		}
365 	}
366 
367 	/* Calc portgroup offset w.r.t protocol */
368 	pg = GET_PORTGROUP(cfg, addr, nat_proto, port);
369 
370 	/* Check if this port is occupied by any portgroup */
371 	if (pg == NULL) {
372 		NAT64STAT_INC(&cfg->stats, nomatch4);
373 #if 0
374 		DPRINTF(DP_STATE, "NOMATCH %u %d %d (%d)", addr, nat_proto, port,
375 		    _GET_PORTGROUP_IDX(cfg, addr, nat_proto, port));
376 #endif
377 		return (cfg->nomatch_verdict);
378 	}
379 
380 	/* TODO: Check flags to see if we need to do some static mapping */
381 	nh = pg->host;
382 
383 	/* Prepare some fields we might need to update */
384 	SET_AGE(state_ts);
385 	ip = mtod(*pm, struct ip *);
386 	if (ip->ip_p == IPPROTO_TCP)
387 		state_flags = convert_tcp_flags(
388 		    L3HDR(ip, struct tcphdr *)->th_flags);
389 	else
390 		state_flags = 0;
391 
392 	/* Lock host and get port mapping */
393 	NAT64_LOCK(nh);
394 
395 	st = &pg->states[port & (NAT64_CHUNK_SIZE - 1)];
396 	if (st->timestamp != state_ts)
397 		st->timestamp = state_ts;
398 	if ((st->flags & state_flags) != state_flags)
399 		st->flags |= state_flags;
400 	lport = htons(st->u.s.lport);
401 
402 	NAT64_UNLOCK(nh);
403 
404 	if (cfg->flags & NAT64_LOG) {
405 		logdata = &loghdr;
406 		nat64lsn_log(logdata, *pm, AF_INET, pg->idx, st->cur.off);
407 	} else
408 		logdata = NULL;
409 
410 	src6.s6_addr32[0] = cfg->prefix6.s6_addr32[0];
411 	src6.s6_addr32[1] = cfg->prefix6.s6_addr32[1];
412 	src6.s6_addr32[2] = cfg->prefix6.s6_addr32[2];
413 	src6.s6_addr32[3] = htonl(f_id->src_ip);
414 
415 	ret = nat64_do_handle_ip4(*pm, &src6, &nh->addr, lport,
416 	    &cfg->stats, logdata);
417 
418 	if (ret == NAT64SKIP)
419 		return (IP_FW_PASS);
420 	if (ret == NAT64MFREE)
421 		m_freem(*pm);
422 	*pm = NULL;
423 
424 	return (IP_FW_DENY);
425 }
426 
427 void
428 nat64lsn_dump_state(const struct nat64lsn_cfg *cfg,
429    const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st,
430    const char *px, int off)
431 {
432 	char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], d[INET_ADDRSTRLEN];
433 
434 	if ((nat64_debug & DP_STATE) == 0)
435 		return;
436 	inet_ntop(AF_INET6, &pg->host->addr, s, sizeof(s));
437 	inet_ntop(AF_INET, &pg->aaddr, a, sizeof(a));
438 	inet_ntop(AF_INET, &st->u.s.faddr, d, sizeof(d));
439 
440 	DPRINTF(DP_STATE, "%s: PG %d ST [%p|%d]: %s:%d/%d <%s:%d> "
441 	    "%s:%d AGE %d", px, pg->idx, st, off,
442 	    s, st->u.s.lport, pg->nat_proto, a, pg->aport + off,
443 	    d, st->u.s.fport, GET_AGE(st->timestamp));
444 }
445 
446 /*
447  * Check if particular TCP state is stale and should be deleted.
448  * Return 1 if true, 0 otherwise.
449  */
450 static int
451 nat64lsn_periodic_check_tcp(const struct nat64lsn_cfg *cfg,
452     const struct nat64lsn_state *st, int age)
453 {
454 	int ttl;
455 
456 	if (st->flags & NAT64_FLAG_FIN)
457 		ttl = cfg->st_close_ttl;
458 	else if (st->flags & NAT64_FLAG_ESTAB)
459 		ttl = cfg->st_estab_ttl;
460 	else if (st->flags & NAT64_FLAG_SYN)
461 		ttl = cfg->st_syn_ttl;
462 	else
463 		ttl = cfg->st_syn_ttl;
464 
465 	if (age > ttl)
466 		return (1);
467 	return (0);
468 }
469 
470 /*
471  * Check if nat state @st is stale and should be deleted.
472  * Return 1 if true, 0 otherwise.
473  */
474 static NAT64NOINLINE int
475 nat64lsn_periodic_chkstate(const struct nat64lsn_cfg *cfg,
476     const struct nat64lsn_portgroup *pg, const struct nat64lsn_state *st)
477 {
478 	int age, delete;
479 
480 	age = GET_AGE(st->timestamp);
481 	delete = 0;
482 
483 	/* Skip immutable records */
484 	if (st->flags & NAT64_FLAG_RDR)
485 		return (0);
486 
487 	switch (pg->nat_proto) {
488 		case NAT_PROTO_TCP:
489 			delete = nat64lsn_periodic_check_tcp(cfg, st, age);
490 			break;
491 		case NAT_PROTO_UDP:
492 			if (age > cfg->st_udp_ttl)
493 				delete = 1;
494 			break;
495 		case NAT_PROTO_ICMP:
496 			if (age > cfg->st_icmp_ttl)
497 				delete = 1;
498 			break;
499 	}
500 
501 	return (delete);
502 }
503 
504 
505 /*
506  * The following structures and functions
507  * are used to perform SLIST_FOREACH_SAFE()
508  * analog for states identified by struct st_ptr.
509  */
510 
/*
 * Iteration cursor.  @pg and @st identify the current state (@st is NULL
 * at the end of the chain).  @sidx_next is a copy of st->next taken when
 * the cursor was positioned, so st_next() can advance even after the
 * current state has been wiped.
 */
511 struct st_idx {
512 	struct nat64lsn_portgroup *pg;
513 	struct nat64lsn_state *st;
514 	struct st_ptr sidx_next;
515 };
516 
517 static struct st_idx *
518 st_first(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
519     struct st_ptr *sidx, struct st_idx *si)
520 {
521 	struct nat64lsn_portgroup *pg;
522 	struct nat64lsn_state *st;
523 
524 	if (sidx->idx == 0) {
525 		memset(si, 0, sizeof(*si));
526 		return (si);
527 	}
528 
529 	pg = PORTGROUP_BYSIDX(cfg, nh, sidx->idx);
530 	st = &pg->states[sidx->off];
531 
532 	si->pg = pg;
533 	si->st = st;
534 	si->sidx_next = st->next;
535 
536 	return (si);
537 }
538 
539 static struct st_idx *
540 st_next(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh,
541     struct st_idx *si)
542 {
543 	struct st_ptr sidx;
544 	struct nat64lsn_portgroup *pg;
545 	struct nat64lsn_state *st;
546 
547 	sidx = si->sidx_next;
548 	if (sidx.idx == 0) {
549 		memset(si, 0, sizeof(*si));
550 		si->st = NULL;
551 		si->pg = NULL;
552 		return (si);
553 	}
554 
555 	pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
556 	st = &pg->states[sidx.off];
557 
558 	si->pg = pg;
559 	si->st = st;
560 	si->sidx_next = st->next;
561 
562 	return (si);
563 }
564 
565 static struct st_idx *
566 st_save_cond(struct st_idx *si_dst, struct st_idx *si)
567 {
568 	if (si->st != NULL)
569 		*si_dst = *si;
570 
571 	return (si_dst);
572 }
573 
574 unsigned int
575 nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh)
576 {
577 	struct st_idx si, si_prev;
578 	int i;
579 	unsigned int delcount;
580 
581 	delcount = 0;
582 	for (i = 0; i < nh->hsize; i++) {
583 		memset(&si_prev, 0, sizeof(si_prev));
584 		for (st_first(cfg, nh, &nh->phash[i], &si);
585 		    si.st != NULL;
586 		    st_save_cond(&si_prev, &si), st_next(cfg, nh, &si)) {
587 			if (nat64lsn_periodic_chkstate(cfg, si.pg, si.st) == 0)
588 				continue;
589 			nat64lsn_dump_state(cfg, si.pg, si.st, "DELETE STATE",
590 			    si.st->cur.off);
591 			/* Unlink from hash */
592 			if (si_prev.st != NULL)
593 				si_prev.st->next = si.st->next;
594 			else
595 				nh->phash[i] = si.st->next;
596 			/* Delete state and free its data */
597 			PG_MARK_FREE_IDX(si.pg, si.st->cur.off);
598 			memset(si.st, 0, sizeof(struct nat64lsn_state));
599 			si.st = NULL;
600 			delcount++;
601 
602 			/* Update portgroup timestamp */
603 			SET_AGE(si.pg->timestamp);
604 		}
605 	}
606 	NAT64STAT_ADD(&cfg->stats, sdeleted, delcount);
607 	return (delcount);
608 }
609 
610 /*
611  * Checks if portgroup is not used and can be deleted,
612  * Returns 1 if stale, 0 otherwise
613  */
614 static int
615 stale_pg(const struct nat64lsn_cfg *cfg, const struct nat64lsn_portgroup *pg)
616 {
617 
618 	if (!PG_IS_EMPTY(pg))
619 		return (0);
620 	if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
621 		return (0);
622 	return (1);
623 }
624 
625 /*
626  * Checks if host record is not used and can be deleted,
627  * Returns 1 if stale, 0 otherwise
628  */
629 static int
630 stale_nh(const struct nat64lsn_cfg *cfg, const struct nat64lsn_host *nh)
631 {
632 
633 	if (nh->pg_used != 0)
634 		return (0);
635 	if (GET_AGE(nh->timestamp) < cfg->nh_delete_delay)
636 		return (0);
637 	return (1);
638 }
639 
/*
 * Argument passed to nat64lsn_periodic_chkhost() for every host:
 * the instance being checked plus the list (and count) of jobs collected
 * so far, which nat64lsn_periodic() enqueues in one go.
 */
640 struct nat64lsn_periodic_data {
641 	struct nat64lsn_cfg *cfg;
642 	struct nat64lsn_job_head jhead;
643 	int jlen;
644 };
645 
646 static NAT64NOINLINE int
647 nat64lsn_periodic_chkhost(struct nat64lsn_host *nh,
648     struct nat64lsn_periodic_data *d)
649 {
650 	char a[INET6_ADDRSTRLEN];
651 	struct nat64lsn_portgroup *pg;
652 	struct nat64lsn_job_item *ji;
653 	uint64_t delmask[NAT64LSN_PGPTRNMASK];
654 	int delcount, i;
655 
656 	delcount = 0;
657 	memset(delmask, 0, sizeof(delmask));
658 
659 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
660 	DPRINTF(DP_JQUEUE, "Checking %s host %s on cpu %d",
661 	    stale_nh(d->cfg, nh) ? "stale" : "non-stale", a, curcpu);
662 	if (!stale_nh(d->cfg, nh)) {
663 		/* Non-stale host. Inspect internals */
664 		NAT64_LOCK(nh);
665 
666 		/* Stage 1: Check&expire states */
667 		if (nat64lsn_periodic_chkstates(d->cfg, nh) != 0)
668 			SET_AGE(nh->timestamp);
669 
670 		/* Stage 2: Check if we need to expire */
671 		for (i = 0; i < nh->pg_used; i++) {
672 			pg = PORTGROUP_BYSIDX(d->cfg, nh, i + 1);
673 			if (pg == NULL)
674 				continue;
675 
676 			/* Check if we can delete portgroup */
677 			if (stale_pg(d->cfg, pg) == 0)
678 				continue;
679 
680 			DPRINTF(DP_JQUEUE, "Check PG %d", i);
681 			delmask[i / 64] |= ((uint64_t)1 << (i % 64));
682 			delcount++;
683 		}
684 
685 		NAT64_UNLOCK(nh);
686 		if (delcount == 0)
687 			return (0);
688 	}
689 
690 	DPRINTF(DP_JQUEUE, "Queueing %d portgroups for deleting", delcount);
691 	/* We have something to delete - add it to queue */
692 	ji = nat64lsn_create_job(d->cfg, NULL, JTYPE_DELPORTGROUP);
693 	if (ji == NULL)
694 		return (0);
695 
696 	ji->haddr = nh->addr;
697 	ji->delcount = delcount;
698 	memcpy(ji->delmask, delmask, sizeof(ji->delmask));
699 
700 	TAILQ_INSERT_TAIL(&d->jhead, ji, next);
701 	d->jlen++;
702 	return (0);
703 }
704 
705 /*
706  * This procedure is used to perform various maintance
707  * on dynamic hash list. Currently it is called every second.
708  */
709 static void
710 nat64lsn_periodic(void *data)
711 {
712 	struct ip_fw_chain *ch;
713 	IPFW_RLOCK_TRACKER;
714 	struct nat64lsn_cfg *cfg;
715 	struct nat64lsn_periodic_data d;
716 	struct nat64lsn_host *nh, *tmp;
717 
718 	cfg = (struct nat64lsn_cfg *) data;
719 	ch = cfg->ch;
720 	CURVNET_SET(cfg->vp);
721 
722 	memset(&d, 0, sizeof(d));
723 	d.cfg = cfg;
724 	TAILQ_INIT(&d.jhead);
725 
726 	IPFW_RLOCK(ch);
727 
728 	/* Stage 1: foreach host, check all its portgroups */
729 	I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_periodic_chkhost, &d);
730 
731 	/* Enqueue everything we have requested */
732 	nat64lsn_enqueue_jobs(cfg, &d.jhead, d.jlen);
733 
734 	callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
735 
736 	IPFW_RUNLOCK(ch);
737 
738 	CURVNET_RESTORE();
739 }
740 
741 static NAT64NOINLINE void
742 reinject_mbuf(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
743 {
744 
745 	if (ji->m == NULL)
746 		return;
747 
748 	/* Request has failed or packet type is wrong */
749 	if (ji->f_id.addr_type != 6 || ji->done == 0) {
750 		m_freem(ji->m);
751 		ji->m = NULL;
752 		NAT64STAT_INC(&cfg->stats, dropped);
753 		DPRINTF(DP_DROPS, "mbuf dropped: type %d, done %d",
754 		    ji->jtype, ji->done);
755 		return;
756 	}
757 
758 	/*
759 	 * XXX: Limit recursion level
760 	 */
761 
762 	NAT64STAT_INC(&cfg->stats, jreinjected);
763 	DPRINTF(DP_JQUEUE, "Reinject mbuf");
764 	nat64lsn_translate6(cfg, &ji->f_id, &ji->m);
765 }
766 
767 static void
768 destroy_portgroup(struct nat64lsn_portgroup *pg)
769 {
770 
771 	DPRINTF(DP_OBJ, "DESTROY PORTGROUP %d %p", pg->idx, pg);
772 	uma_zfree(nat64lsn_pg_zone, pg);
773 }
774 
775 static NAT64NOINLINE int
776 alloc_portgroup(struct nat64lsn_job_item *ji)
777 {
778 	struct nat64lsn_portgroup *pg;
779 
780 	pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
781 	if (pg == NULL)
782 		return (1);
783 
784 	if (ji->needs_idx != 0) {
785 		ji->spare_idx = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
786 		/* Failed alloc isn't always fatal, so don't check */
787 	}
788 	memset(&pg->freemask, 0xFF, sizeof(pg->freemask));
789 	pg->nat_proto = ji->nat_proto;
790 	ji->pg = pg;
791 	return (0);
792 
793 }
794 
795 static void
796 destroy_host6(struct nat64lsn_host *nh)
797 {
798 	char a[INET6_ADDRSTRLEN];
799 	int i;
800 
801 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
802 	DPRINTF(DP_OBJ, "DESTROY HOST %s %p (pg used %d)", a, nh,
803 	    nh->pg_used);
804 	NAT64_LOCK_DESTROY(nh);
805 	for (i = 0; i < nh->pg_allocated / NAT64LSN_PGIDX_CHUNK; i++)
806 		uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, i));
807 	uma_zfree(nat64lsn_host_zone, nh);
808 }
809 
810 static NAT64NOINLINE int
811 alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
812 {
813 	struct nat64lsn_host *nh;
814 	char a[INET6_ADDRSTRLEN];
815 
816 	nh = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
817 	if (nh == NULL)
818 		return (1);
819 	PORTGROUP_CHUNK(nh, 0) = uma_zalloc(nat64lsn_pgidx_zone, M_NOWAIT);
820 	if (PORTGROUP_CHUNK(nh, 0) == NULL) {
821 		uma_zfree(nat64lsn_host_zone, nh);
822 		return (2);
823 	}
824 	if (alloc_portgroup(ji) != 0) {
825 		NAT64STAT_INC(&cfg->stats, jportfails);
826 		uma_zfree(nat64lsn_pgidx_zone, PORTGROUP_CHUNK(nh, 0));
827 		uma_zfree(nat64lsn_host_zone, nh);
828 		return (3);
829 	}
830 
831 	NAT64_LOCK_INIT(nh);
832 	nh->addr = ji->haddr;
833 	nh->hsize = NAT64LSN_HSIZE; /* XXX: hardcoded size */
834 	nh->pg_allocated = NAT64LSN_PGIDX_CHUNK;
835 	nh->pg_used = 0;
836 	ji->nh = nh;
837 
838 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
839 	DPRINTF(DP_OBJ, "ALLOC HOST %s %p", a, ji->nh);
840 	return (0);
841 }
842 
843 /*
844  * Finds free @pg index inside @nh
845  */
846 static NAT64NOINLINE int
847 find_nh_pg_idx(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh, int *idx)
848 {
849 	int i;
850 
851 	for (i = 0; i < nh->pg_allocated; i++) {
852 		if (PORTGROUP_BYSIDX(cfg, nh, i + 1) == NULL) {
853 			*idx = i;
854 			return (0);
855 		}
856 	}
857 	return (1);
858 }
859 
860 static NAT64NOINLINE int
861 attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
862 {
863 	char a[INET6_ADDRSTRLEN];
864 	struct nat64lsn_host *nh;
865 
866 	I6HASH_FIND(cfg, nh, &ji->haddr);
867 	if (nh == NULL) {
868 		/* Add new host to list */
869 		nh = ji->nh;
870 		I6HASH_INSERT(cfg, nh);
871 		cfg->ihcount++;
872 		ji->nh = NULL;
873 
874 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
875 		DPRINTF(DP_OBJ, "ATTACH HOST %s %p", a, nh);
876 		/*
877 		 * Try to add portgroup.
878 		 * Note it will automatically set
879 		 * 'done' on ji if successful.
880 		 */
881 		if (attach_portgroup(cfg, ji) != 0) {
882 			DPRINTF(DP_DROPS, "%s %p failed to attach PG",
883 			    a, nh);
884 			NAT64STAT_INC(&cfg->stats, jportfails);
885 			return (1);
886 		}
887 		return (0);
888 	}
889 
890 	/*
891 	 * nh isn't NULL. This probably means we had several simultaneous
892 	 * host requests. The previous one request has already attached
893 	 * this host. Requeue attached mbuf and mark job as done, but
894 	 * leave nh and pg pointers not changed, so nat64lsn_do_request()
895 	 * will release all allocated resources.
896 	 */
897 	inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
898 	DPRINTF(DP_OBJ, "%s %p is already attached as %p",
899 	    a, ji->nh, nh);
900 	ji->done = 1;
901 	return (0);
902 }
903 
904 static NAT64NOINLINE int
905 find_pg_place_addr(const struct nat64lsn_cfg *cfg, int addr_off,
906     int nat_proto, uint16_t *aport, int *ppg_idx)
907 {
908 	int j, pg_idx;
909 
910 	pg_idx = addr_off * _ADDR_PG_COUNT +
911 	    (nat_proto - 1) * _ADDR_PG_PROTO_COUNT;
912 
913 	for (j = NAT64_MIN_CHUNK; j < _ADDR_PG_PROTO_COUNT; j++) {
914 		if (cfg->pg[pg_idx + j] != NULL)
915 			continue;
916 
917 		*aport = j * NAT64_CHUNK_SIZE;
918 		*ppg_idx = pg_idx + j;
919 		return (1);
920 	}
921 
922 	return (0);
923 }
924 
925 /*
926  * XXX: This function needs to be rewritten to
927  * use free bitmask for faster pg finding,
928  * additionally, it should take into consideration
929  * a) randomization and
930  * b) previous addresses allocated to given nat instance
931  *
932  */
933 static NAT64NOINLINE int
934 find_portgroup_place(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji,
935     uint32_t *aaddr, uint16_t *aport, int *ppg_idx)
936 {
937 	int i, nat_proto;
938 
939 	/*
940 	 * XXX: Use bitmask index to be able to find/check if IP address
941 	 * has some spare pg's
942 	 */
943 	nat_proto = ji->nat_proto;
944 
945 	/* First, try to use same address */
946 	if (ji->aaddr != 0) {
947 		i = ntohl(ji->aaddr) - cfg->prefix4;
948 		if (find_pg_place_addr(cfg, i, nat_proto, aport,
949 		    ppg_idx) != 0){
950 			/* Found! */
951 			*aaddr = htonl(cfg->prefix4 + i);
952 			return (0);
953 		}
954 	}
955 
956 	/* Next, try to use random address based on flow hash */
957 	i = ji->fhash % (1 << (32 - cfg->plen4));
958 	if (find_pg_place_addr(cfg, i, nat_proto, aport, ppg_idx) != 0) {
959 		/* Found! */
960 		*aaddr = htonl(cfg->prefix4 + i);
961 		return (0);
962 	}
963 
964 
965 	/* Last one: simply find ANY available */
966 	for (i = 0; i < (1 << (32 - cfg->plen4)); i++) {
967 		if (find_pg_place_addr(cfg, i, nat_proto, aport,
968 		    ppg_idx) != 0){
969 			/* Found! */
970 			*aaddr = htonl(cfg->prefix4 + i);
971 			return (0);
972 		}
973 	}
974 
975 	return (1);
976 }
977 
978 static NAT64NOINLINE int
979 attach_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
980 {
981 	char a[INET6_ADDRSTRLEN];
982 	struct nat64lsn_portgroup *pg;
983 	struct nat64lsn_host *nh;
984 	uint32_t aaddr;
985 	uint16_t aport;
986 	int nh_pg_idx, pg_idx;
987 
988 	pg = ji->pg;
989 
990 	/*
991 	 * Find source host and bind: we can't rely on
992 	 * pg->host
993 	 */
994 	I6HASH_FIND(cfg, nh, &ji->haddr);
995 	if (nh == NULL)
996 		return (1);
997 
998 	/* Find spare port chunk */
999 	if (find_portgroup_place(cfg, ji, &aaddr, &aport, &pg_idx) != 0) {
1000 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1001 		DPRINTF(DP_OBJ | DP_DROPS, "empty PG not found for %s", a);
1002 		return (2);
1003 	}
1004 
1005 	/* Expand PG indexes if needed */
1006 	if (nh->pg_allocated < cfg->max_chunks && ji->spare_idx != NULL) {
1007 		PORTGROUP_CHUNK(nh, nh->pg_allocated / NAT64LSN_PGIDX_CHUNK) =
1008 		    ji->spare_idx;
1009 		nh->pg_allocated += NAT64LSN_PGIDX_CHUNK;
1010 		ji->spare_idx = NULL;
1011 	}
1012 
1013 	/* Find empty index to store PG in the @nh */
1014 	if (find_nh_pg_idx(cfg, nh, &nh_pg_idx) != 0) {
1015 		inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1016 		DPRINTF(DP_OBJ | DP_DROPS, "free PG index not found for %s",
1017 		    a);
1018 		return (3);
1019 	}
1020 
1021 	cfg->pg[pg_idx] = pg;
1022 	cfg->protochunks[pg->nat_proto]++;
1023 	NAT64STAT_INC(&cfg->stats, spgcreated);
1024 
1025 	pg->aaddr = aaddr;
1026 	pg->aport = aport;
1027 	pg->host = nh;
1028 	pg->idx = pg_idx;
1029 	SET_AGE(pg->timestamp);
1030 
1031 	PORTGROUP_BYSIDX(cfg, nh, nh_pg_idx + 1) = pg;
1032 	if (nh->pg_used == nh_pg_idx)
1033 		nh->pg_used++;
1034 	SET_AGE(nh->timestamp);
1035 
1036 	ji->pg = NULL;
1037 	ji->done = 1;
1038 
1039 	return (0);
1040 }
1041 
1042 static NAT64NOINLINE void
1043 consider_del_portgroup(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1044 {
1045 	struct nat64lsn_host *nh, *nh_tmp;
1046 	struct nat64lsn_portgroup *pg, *pg_list[256];
1047 	int i, pg_lidx, idx;
1048 
1049 	/* Find source host */
1050 	I6HASH_FIND(cfg, nh, &ji->haddr);
1051 	if (nh == NULL || nh->pg_used == 0)
1052 		return;
1053 
1054 	memset(pg_list, 0, sizeof(pg_list));
1055 	pg_lidx = 0;
1056 
1057 	NAT64_LOCK(nh);
1058 
1059 	for (i = nh->pg_used - 1; i >= 0; i--) {
1060 		if ((ji->delmask[i / 64] & ((uint64_t)1 << (i % 64))) == 0)
1061 			continue;
1062 		pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
1063 
1064 		/* Check that PG isn't busy. */
1065 		if (stale_pg(cfg, pg) == 0)
1066 			continue;
1067 
1068 		/* DO delete */
1069 		pg_list[pg_lidx++] = pg;
1070 		PORTGROUP_BYSIDX(cfg, nh, i + 1) = NULL;
1071 
1072 		idx = _GET_PORTGROUP_IDX(cfg, ntohl(pg->aaddr), pg->nat_proto,
1073 		    pg->aport);
1074 		KASSERT(cfg->pg[idx] == pg, ("Non matched pg"));
1075 		cfg->pg[idx] = NULL;
1076 		cfg->protochunks[pg->nat_proto]--;
1077 		NAT64STAT_INC(&cfg->stats, spgdeleted);
1078 
1079 		/* Decrease pg_used */
1080 		while (nh->pg_used > 0 &&
1081 		    PORTGROUP_BYSIDX(cfg, nh, nh->pg_used) == NULL)
1082 			nh->pg_used--;
1083 
1084 		/* Check if on-stack buffer has ended */
1085 		if (pg_lidx == nitems(pg_list))
1086 			break;
1087 	}
1088 
1089 	NAT64_UNLOCK(nh);
1090 
1091 	if (stale_nh(cfg, nh)) {
1092 		I6HASH_REMOVE(cfg, nh, nh_tmp, &ji->haddr);
1093 		KASSERT(nh != NULL, ("Unable to find address"));
1094 		cfg->ihcount--;
1095 		ji->nh = nh;
1096 		I6HASH_FIND(cfg, nh, &ji->haddr);
1097 		KASSERT(nh == NULL, ("Failed to delete address"));
1098 	}
1099 
1100 	/* TODO: Delay freeing portgroups */
1101 	while (pg_lidx > 0) {
1102 		pg_lidx--;
1103 		NAT64STAT_INC(&cfg->stats, spgdeleted);
1104 		destroy_portgroup(pg_list[pg_lidx]);
1105 	}
1106 }
1107 
1108 /*
1109  * Main request handler.
1110  * Responsible for handling jqueue, e.g.
1111  * creating new hosts, addind/deleting portgroups.
1112  */
1113 static NAT64NOINLINE void
1114 nat64lsn_do_request(void *data)
1115 {
1116 	IPFW_RLOCK_TRACKER;
1117 	struct nat64lsn_job_head jhead;
1118 	struct nat64lsn_job_item *ji;
1119 	int jcount, nhsize;
1120 	struct nat64lsn_cfg *cfg = (struct nat64lsn_cfg *) data;
1121 	struct ip_fw_chain *ch;
1122 	int delcount;
1123 
1124 	CURVNET_SET(cfg->vp);
1125 
1126 	TAILQ_INIT(&jhead);
1127 
1128 	/* XXX: We're running unlocked here */
1129 
1130 	ch = cfg->ch;
1131 	delcount = 0;
1132 	IPFW_RLOCK(ch);
1133 
1134 	/* Grab queue */
1135 	JQUEUE_LOCK();
1136 	TAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item, next);
1137 	jcount = cfg->jlen;
1138 	cfg->jlen = 0;
1139 	JQUEUE_UNLOCK();
1140 
1141 	/* check if we need to resize hash */
1142 	nhsize = 0;
1143 	if (cfg->ihcount > cfg->ihsize && cfg->ihsize < 65536) {
1144 		nhsize = cfg->ihsize;
1145 		for ( ; cfg->ihcount > nhsize && nhsize < 65536; nhsize *= 2)
1146 			;
1147 	} else if (cfg->ihcount < cfg->ihsize * 4) {
1148 		nhsize = cfg->ihsize;
1149 		for ( ; cfg->ihcount < nhsize * 4 && nhsize > 32; nhsize /= 2)
1150 			;
1151 	}
1152 
1153 	IPFW_RUNLOCK(ch);
1154 
1155 	if (TAILQ_EMPTY(&jhead)) {
1156 		CURVNET_RESTORE();
1157 		return;
1158 	}
1159 
1160 	NAT64STAT_INC(&cfg->stats, jcalls);
1161 	DPRINTF(DP_JQUEUE, "count=%d", jcount);
1162 
1163 	/*
1164 	 * TODO:
1165 	 * What we should do here is to build a hash
1166 	 * to ensure we don't have lots of duplicate requests.
1167 	 * Skip this for now.
1168 	 *
1169 	 * TODO: Limit per-call number of items
1170 	 */
1171 
1172 	/* Pre-allocate everything for entire chain */
1173 	TAILQ_FOREACH(ji, &jhead,  next) {
1174 		switch (ji->jtype) {
1175 			case JTYPE_NEWHOST:
1176 				if (alloc_host6(cfg, ji) != 0)
1177 					NAT64STAT_INC(&cfg->stats, jhostfails);
1178 				break;
1179 			case JTYPE_NEWPORTGROUP:
1180 				if (alloc_portgroup(ji) != 0)
1181 					NAT64STAT_INC(&cfg->stats, jportfails);
1182 				break;
1183 			case JTYPE_DELPORTGROUP:
1184 				delcount += ji->delcount;
1185 				break;
1186 			default:
1187 				break;
1188 		}
1189 	}
1190 
1191 	/*
1192 	 * TODO: Alloc hew hash
1193 	 */
1194 	nhsize = 0;
1195 	if (nhsize > 0) {
1196 		/* XXX: */
1197 	}
1198 
1199 	/* Apply all changes in batch */
1200 	IPFW_UH_WLOCK(ch);
1201 	IPFW_WLOCK(ch);
1202 
1203 	TAILQ_FOREACH(ji, &jhead,  next) {
1204 		switch (ji->jtype) {
1205 			case JTYPE_NEWHOST:
1206 				if (ji->nh != NULL)
1207 					attach_host6(cfg, ji);
1208 				break;
1209 			case JTYPE_NEWPORTGROUP:
1210 				if (ji->pg != NULL &&
1211 				    attach_portgroup(cfg, ji) != 0)
1212 					NAT64STAT_INC(&cfg->stats, jportfails);
1213 				break;
1214 			case JTYPE_DELPORTGROUP:
1215 				consider_del_portgroup(cfg, ji);
1216 				break;
1217 		}
1218 	}
1219 
1220 	if (nhsize > 0) {
1221 		/* XXX: Move everything to new hash */
1222 	}
1223 
1224 	IPFW_WUNLOCK(ch);
1225 	IPFW_UH_WUNLOCK(ch);
1226 
1227 	/* Flush unused entries */
1228 	while (!TAILQ_EMPTY(&jhead)) {
1229 		ji = TAILQ_FIRST(&jhead);
1230 		TAILQ_REMOVE(&jhead, ji, next);
1231 		if (ji->nh != NULL)
1232 			destroy_host6(ji->nh);
1233 		if (ji->pg != NULL)
1234 			destroy_portgroup(ji->pg);
1235 		if (ji->m != NULL)
1236 			reinject_mbuf(cfg, ji);
1237 		if (ji->spare_idx != NULL)
1238 			uma_zfree(nat64lsn_pgidx_zone, ji->spare_idx);
1239 		free(ji, M_IPFW);
1240 	}
1241 	CURVNET_RESTORE();
1242 }
1243 
1244 static NAT64NOINLINE struct nat64lsn_job_item *
1245 nat64lsn_create_job(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id,
1246     int jtype)
1247 {
1248 	struct nat64lsn_job_item *ji;
1249 	struct in6_addr haddr;
1250 	uint8_t nat_proto;
1251 
1252 	/*
1253 	 * Do not try to lock possibly contested mutex if we're near the limit.
1254 	 * Drop packet instead.
1255 	 */
1256 	if (cfg->jlen >= cfg->jmaxlen) {
1257 		NAT64STAT_INC(&cfg->stats, jmaxlen);
1258 		return (NULL);
1259 	}
1260 
1261 	memset(&haddr, 0, sizeof(haddr));
1262 	nat_proto = 0;
1263 	if (f_id != NULL) {
1264 		haddr = f_id->src_ip6;
1265 		nat_proto = nat64lsn_proto_map[f_id->proto];
1266 
1267 		DPRINTF(DP_JQUEUE, "REQUEST pg nat_proto %d on proto %d",
1268 		    nat_proto, f_id->proto);
1269 
1270 		if (nat_proto == 0)
1271 			return (NULL);
1272 	}
1273 
1274 	ji = malloc(sizeof(struct nat64lsn_job_item), M_IPFW,
1275 	    M_NOWAIT | M_ZERO);
1276 
1277 	if (ji == NULL) {
1278 		NAT64STAT_INC(&cfg->stats, jnomem);
1279 		return (NULL);
1280 	}
1281 
1282 	ji->jtype = jtype;
1283 
1284 	if (f_id != NULL) {
1285 		ji->f_id = *f_id;
1286 		ji->haddr = haddr;
1287 		ji->nat_proto = nat_proto;
1288 	}
1289 
1290 	return (ji);
1291 }
1292 
1293 static NAT64NOINLINE void
1294 nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
1295 {
1296 
1297 	if (ji == NULL)
1298 		return;
1299 
1300 	JQUEUE_LOCK();
1301 	TAILQ_INSERT_TAIL(&cfg->jhead, ji, next);
1302 	cfg->jlen++;
1303 	NAT64STAT_INC(&cfg->stats, jrequests);
1304 
1305 	if (callout_pending(&cfg->jcallout) == 0)
1306 		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
1307 	JQUEUE_UNLOCK();
1308 }
1309 
1310 static NAT64NOINLINE void
1311 nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
1312     struct nat64lsn_job_head *jhead, int jlen)
1313 {
1314 
1315 	if (TAILQ_EMPTY(jhead))
1316 		return;
1317 
1318 	/* Attach current queue to execution one */
1319 	JQUEUE_LOCK();
1320 	TAILQ_CONCAT(&cfg->jhead, jhead, next);
1321 	cfg->jlen += jlen;
1322 	NAT64STAT_ADD(&cfg->stats, jrequests, jlen);
1323 
1324 	if (callout_pending(&cfg->jcallout) == 0)
1325 		callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);
1326 	JQUEUE_UNLOCK();
1327 }
1328 
1329 static unsigned int
1330 flow6_hash(const struct ipfw_flow_id *f_id)
1331 {
1332 	unsigned char hbuf[36];
1333 
1334 	memcpy(hbuf, &f_id->dst_ip6, 16);
1335 	memcpy(&hbuf[16], &f_id->src_ip6, 16);
1336 	memcpy(&hbuf[32], &f_id->dst_port, 2);
1337 	memcpy(&hbuf[32], &f_id->src_port, 2);
1338 
1339 	return (djb_hash(hbuf, sizeof(hbuf)));
1340 }
1341 
1342 static NAT64NOINLINE int
1343 nat64lsn_request_host(struct nat64lsn_cfg *cfg,
1344     const struct ipfw_flow_id *f_id, struct mbuf **pm)
1345 {
1346 	struct nat64lsn_job_item *ji;
1347 	struct mbuf *m;
1348 
1349 	m = *pm;
1350 	*pm = NULL;
1351 
1352 	ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWHOST);
1353 	if (ji == NULL) {
1354 		m_freem(m);
1355 		NAT64STAT_INC(&cfg->stats, dropped);
1356 		DPRINTF(DP_DROPS, "failed to create job");
1357 	} else {
1358 		ji->m = m;
1359 		/* Provide pseudo-random value based on flow */
1360 		ji->fhash = flow6_hash(f_id);
1361 		nat64lsn_enqueue_job(cfg, ji);
1362 		NAT64STAT_INC(&cfg->stats, jhostsreq);
1363 	}
1364 
1365 	return (IP_FW_PASS);
1366 }
1367 
1368 static NAT64NOINLINE int
1369 nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
1370     const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
1371     int needs_idx)
1372 {
1373 	struct nat64lsn_job_item *ji;
1374 	struct mbuf *m;
1375 
1376 	m = *pm;
1377 	*pm = NULL;
1378 
1379 	ji = nat64lsn_create_job(cfg, f_id, JTYPE_NEWPORTGROUP);
1380 	if (ji == NULL) {
1381 		m_freem(m);
1382 		NAT64STAT_INC(&cfg->stats, dropped);
1383 		DPRINTF(DP_DROPS, "failed to create job");
1384 	} else {
1385 		ji->m = m;
1386 		/* Provide pseudo-random value based on flow */
1387 		ji->fhash = flow6_hash(f_id);
1388 		ji->aaddr = aaddr;
1389 		ji->needs_idx = needs_idx;
1390 		nat64lsn_enqueue_job(cfg, ji);
1391 		NAT64STAT_INC(&cfg->stats, jportreq);
1392 	}
1393 
1394 	return (IP_FW_PASS);
1395 }
1396 
1397 static NAT64NOINLINE struct nat64lsn_state *
1398 nat64lsn_create_state(struct nat64lsn_cfg *cfg, struct nat64lsn_host *nh,
1399     int nat_proto, struct nat64lsn_state *kst, uint32_t *aaddr)
1400 {
1401 	struct nat64lsn_portgroup *pg;
1402 	struct nat64lsn_state *st;
1403 	int i, hval, off;
1404 
1405 	/* XXX: create additional bitmask for selecting proper portgroup */
1406 	for (i = 0; i < nh->pg_used; i++) {
1407 		pg = PORTGROUP_BYSIDX(cfg, nh, i + 1);
1408 		if (pg == NULL)
1409 			continue;
1410 		if (*aaddr == 0)
1411 			*aaddr = pg->aaddr;
1412 		if (pg->nat_proto != nat_proto)
1413 			continue;
1414 
1415 		off = PG_GET_FREE_IDX(pg);
1416 		if (off != 0) {
1417 			/* We have found spare state. Use it */
1418 			off--;
1419 			PG_MARK_BUSY_IDX(pg, off);
1420 			st = &pg->states[off];
1421 
1422 			/*
1423 			 * Fill in new info. Assume state was zeroed.
1424 			 * Timestamp and flags will be filled by caller.
1425 			 */
1426 			st->u.s = kst->u.s;
1427 			st->cur.idx = i + 1;
1428 			st->cur.off = off;
1429 
1430 			/* Insert into host hash table */
1431 			hval = HASH_IN4(&st->u.hkey) & (nh->hsize - 1);
1432 			st->next = nh->phash[hval];
1433 			nh->phash[hval] = st->cur;
1434 
1435 			nat64lsn_dump_state(cfg, pg, st, "ALLOC STATE", off);
1436 
1437 			NAT64STAT_INC(&cfg->stats, screated);
1438 
1439 			return (st);
1440 		}
1441 		/* Saev last used alias affress */
1442 		*aaddr = pg->aaddr;
1443 	}
1444 
1445 	return (NULL);
1446 }
1447 
1448 static NAT64NOINLINE int
1449 nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,
1450     struct mbuf **pm)
1451 {
1452 	struct pfloghdr loghdr, *logdata;
1453 	char a[INET6_ADDRSTRLEN];
1454 	struct nat64lsn_host *nh;
1455 	struct st_ptr sidx;
1456 	struct nat64lsn_state *st, kst;
1457 	struct nat64lsn_portgroup *pg;
1458 	struct icmp6_hdr *icmp6;
1459 	uint32_t aaddr;
1460 	int action, hval, nat_proto, proto;
1461 	uint16_t aport, state_ts, state_flags;
1462 
1463 	/* Check if af/protocol is supported and get it short id */
1464 	nat_proto = nat64lsn_proto_map[f_id->proto];
1465 	if (nat_proto == 0) {
1466 		/*
1467 		 * Since we can be called from jobs handler, we need
1468 		 * to free mbuf by self, do not leave this task to
1469 		 * ipfw_check_packet().
1470 		 */
1471 		NAT64STAT_INC(&cfg->stats, noproto);
1472 		m_freem(*pm);
1473 		*pm = NULL;
1474 		return (IP_FW_DENY);
1475 	}
1476 
1477 	/* Try to find host first */
1478 	I6HASH_FIND(cfg, nh, &f_id->src_ip6);
1479 
1480 	if (nh == NULL)
1481 		return (nat64lsn_request_host(cfg, f_id, pm));
1482 
1483 	/* Fill-in on-stack state structure */
1484 	kst.u.s.faddr = f_id->dst_ip6.s6_addr32[3];
1485 	kst.u.s.fport = f_id->dst_port;
1486 	kst.u.s.lport = f_id->src_port;
1487 
1488 	/* Prepare some fields we might need to update */
1489 	hval = 0;
1490 	proto = nat64_getlasthdr(*pm, &hval);
1491 	if (proto < 0) {
1492 		NAT64STAT_INC(&cfg->stats, dropped);
1493 		DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious");
1494 		m_freem(*pm);
1495 		*pm = NULL;
1496 		return (IP_FW_DENY);
1497 	}
1498 
1499 	SET_AGE(state_ts);
1500 	if (proto == IPPROTO_TCP)
1501 		state_flags = convert_tcp_flags(
1502 		    TCP(mtodo(*pm, hval))->th_flags);
1503 	else
1504 		state_flags = 0;
1505 	if (proto == IPPROTO_ICMPV6) {
1506 		/* Alter local port data */
1507 		icmp6 = mtodo(*pm, hval);
1508 		if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||
1509 		    icmp6->icmp6_type == ICMP6_ECHO_REPLY)
1510 			kst.u.s.lport = ntohs(icmp6->icmp6_id);
1511 	}
1512 
1513 	hval = HASH_IN4(&kst.u.hkey) & (nh->hsize - 1);
1514 	pg = NULL;
1515 	st = NULL;
1516 
1517 	/* OK, let's find state in host hash */
1518 	NAT64_LOCK(nh);
1519 	sidx = nh->phash[hval];
1520 	int k = 0;
1521 	while (sidx.idx != 0) {
1522 		pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
1523 		st = &pg->states[sidx.off];
1524 		//DPRINTF("SISX: %d/%d next: %d/%d", sidx.idx, sidx.off,
1525 		//st->next.idx, st->next.off);
1526 		if (st->u.hkey == kst.u.hkey && pg->nat_proto == nat_proto)
1527 			break;
1528 		if (k++ > 1000) {
1529 			DPRINTF(DP_ALL, "XXX: too long %d/%d %d/%d\n",
1530 			    sidx.idx, sidx.off, st->next.idx, st->next.off);
1531 			inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1532 			DPRINTF(DP_GENERIC, "TR host %s %p on cpu %d",
1533 			    a, nh, curcpu);
1534 			k = 0;
1535 		}
1536 		sidx = st->next;
1537 	}
1538 
1539 	if (sidx.idx == 0) {
1540 		aaddr = 0;
1541 		st = nat64lsn_create_state(cfg, nh, nat_proto, &kst, &aaddr);
1542 		if (st == NULL) {
1543 			/* No free states. Request more if we can */
1544 			if (nh->pg_used >= cfg->max_chunks) {
1545 				/* Limit reached */
1546 				NAT64STAT_INC(&cfg->stats, dropped);
1547 				inet_ntop(AF_INET6, &nh->addr, a, sizeof(a));
1548 				DPRINTF(DP_DROPS, "PG limit reached "
1549 				    " for host %s (used %u, allocated %u, "
1550 				    "limit %u)", a,
1551 				    nh->pg_used * NAT64_CHUNK_SIZE,
1552 				    nh->pg_allocated * NAT64_CHUNK_SIZE,
1553 				    cfg->max_chunks * NAT64_CHUNK_SIZE);
1554 				m_freem(*pm);
1555 				*pm = NULL;
1556 				NAT64_UNLOCK(nh);
1557 				return (IP_FW_DENY);
1558 			}
1559 			if ((nh->pg_allocated <=
1560 			    nh->pg_used + NAT64LSN_REMAININGPG) &&
1561 			    nh->pg_allocated < cfg->max_chunks)
1562 				action = 1; /* Request new indexes */
1563 			else
1564 				action = 0;
1565 			NAT64_UNLOCK(nh);
1566 			//DPRINTF("No state, unlock for %p", nh);
1567 			return (nat64lsn_request_portgroup(cfg, f_id,
1568 			    pm, aaddr, action));
1569 		}
1570 
1571 		/* We've got new state. */
1572 		sidx = st->cur;
1573 		pg = PORTGROUP_BYSIDX(cfg, nh, sidx.idx);
1574 	}
1575 
1576 	/* Okay, state found */
1577 
1578 	/* Update necessary fileds */
1579 	if (st->timestamp != state_ts)
1580 		st->timestamp = state_ts;
1581 	if ((st->flags & state_flags) != 0)
1582 		st->flags |= state_flags;
1583 
1584 	/* Copy needed state data */
1585 	aaddr = pg->aaddr;
1586 	aport = htons(pg->aport + sidx.off);
1587 
1588 	NAT64_UNLOCK(nh);
1589 
1590 	if (cfg->flags & NAT64_LOG) {
1591 		logdata = &loghdr;
1592 		nat64lsn_log(logdata, *pm, AF_INET6, pg->idx, st->cur.off);
1593 	} else
1594 		logdata = NULL;
1595 
1596 	action = nat64_do_handle_ip6(*pm, aaddr, aport, &cfg->stats, logdata);
1597 	if (action == NAT64SKIP)
1598 		return (IP_FW_PASS);
1599 	if (action == NAT64MFREE)
1600 		m_freem(*pm);
1601 	*pm = NULL;	/* mark mbuf as consumed */
1602 	return (IP_FW_DENY);
1603 }
1604 
1605 /*
1606  * Main dataplane entry point.
1607  */
1608 int
1609 ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,
1610     ipfw_insn *cmd, int *done)
1611 {
1612 	ipfw_insn *icmd;
1613 	struct nat64lsn_cfg *cfg;
1614 	int ret;
1615 
1616 	IPFW_RLOCK_ASSERT(ch);
1617 
1618 	*done = 1; /* terminate the search */
1619 	icmd = cmd + 1;
1620 	if (cmd->opcode != O_EXTERNAL_ACTION ||
1621 	    cmd->arg1 != V_nat64lsn_eid ||
1622 	    icmd->opcode != O_EXTERNAL_INSTANCE ||
1623 	    (cfg = NAT64_LOOKUP(ch, icmd)) == NULL)
1624 		return (0);
1625 
1626 	switch (args->f_id.addr_type) {
1627 	case 4:
1628 		ret = nat64lsn_translate4(cfg, &args->f_id, &args->m);
1629 		break;
1630 	case 6:
1631 		ret = nat64lsn_translate6(cfg, &args->f_id, &args->m);
1632 		break;
1633 	default:
1634 		return (0);
1635 	}
1636 	return (ret);
1637 }
1638 
1639 static int
1640 nat64lsn_ctor_host(void *mem, int size, void *arg, int flags)
1641 {
1642 	struct nat64lsn_host *nh;
1643 
1644 	nh = (struct nat64lsn_host *)mem;
1645 	memset(nh->pg_ptr, 0, sizeof(nh->pg_ptr));
1646 	memset(nh->phash, 0, sizeof(nh->phash));
1647 	return (0);
1648 }
1649 
/*
 * Constructor for the portgroup indexes zone: zeroes the whole `size'
 * bytes of the item.  `arg' and `flags' are unused.  Always returns 0.
 */
static int
nat64lsn_ctor_pgidx(void *mem, int size, void *arg, int flags)
{

	(void)memset(mem, 0, size);

	return (0);
}
1657 
1658 void
1659 nat64lsn_init_internal(void)
1660 {
1661 
1662 	memset(nat64lsn_proto_map, 0, sizeof(nat64lsn_proto_map));
1663 	/* Set up supported protocol map */
1664 	nat64lsn_proto_map[IPPROTO_TCP] = NAT_PROTO_TCP;
1665 	nat64lsn_proto_map[IPPROTO_UDP] = NAT_PROTO_UDP;
1666 	nat64lsn_proto_map[IPPROTO_ICMP] = NAT_PROTO_ICMP;
1667 	nat64lsn_proto_map[IPPROTO_ICMPV6] = NAT_PROTO_ICMP;
1668 	/* Fill in reverse proto map */
1669 	memset(nat64lsn_rproto_map, 0, sizeof(nat64lsn_rproto_map));
1670 	nat64lsn_rproto_map[NAT_PROTO_TCP] = IPPROTO_TCP;
1671 	nat64lsn_rproto_map[NAT_PROTO_UDP] = IPPROTO_UDP;
1672 	nat64lsn_rproto_map[NAT_PROTO_ICMP] = IPPROTO_ICMPV6;
1673 
1674 	JQUEUE_LOCK_INIT();
1675 	nat64lsn_host_zone = uma_zcreate("NAT64 hosts zone",
1676 	    sizeof(struct nat64lsn_host), nat64lsn_ctor_host, NULL,
1677 	    NULL, NULL, UMA_ALIGN_PTR, 0);
1678 	nat64lsn_pg_zone = uma_zcreate("NAT64 portgroups zone",
1679 	    sizeof(struct nat64lsn_portgroup), NULL, NULL, NULL, NULL,
1680 	    UMA_ALIGN_PTR, 0);
1681 	nat64lsn_pgidx_zone = uma_zcreate("NAT64 portgroup indexes zone",
1682 	    sizeof(struct nat64lsn_portgroup *) * NAT64LSN_PGIDX_CHUNK,
1683 	    nat64lsn_ctor_pgidx, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1684 }
1685 
1686 void
1687 nat64lsn_uninit_internal(void)
1688 {
1689 
1690 	JQUEUE_LOCK_DESTROY();
1691 	uma_zdestroy(nat64lsn_host_zone);
1692 	uma_zdestroy(nat64lsn_pg_zone);
1693 	uma_zdestroy(nat64lsn_pgidx_zone);
1694 }
1695 
1696 void
1697 nat64lsn_start_instance(struct nat64lsn_cfg *cfg)
1698 {
1699 
1700 	callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,
1701 	    nat64lsn_periodic, cfg);
1702 }
1703 
1704 struct nat64lsn_cfg *
1705 nat64lsn_init_instance(struct ip_fw_chain *ch, size_t numaddr)
1706 {
1707 	struct nat64lsn_cfg *cfg;
1708 
1709 	cfg = malloc(sizeof(struct nat64lsn_cfg), M_IPFW, M_WAITOK | M_ZERO);
1710 	TAILQ_INIT(&cfg->jhead);
1711 	cfg->vp = curvnet;
1712 	cfg->ch = ch;
1713 	COUNTER_ARRAY_ALLOC(cfg->stats.stats, NAT64STATS, M_WAITOK);
1714 
1715 	cfg->ihsize = NAT64LSN_HSIZE;
1716 	cfg->ih = malloc(sizeof(void *) * cfg->ihsize, M_IPFW,
1717 	    M_WAITOK | M_ZERO);
1718 
1719 	cfg->pg = malloc(sizeof(void *) * numaddr * _ADDR_PG_COUNT, M_IPFW,
1720 	    M_WAITOK | M_ZERO);
1721 
1722         callout_init(&cfg->periodic, CALLOUT_MPSAFE);
1723         callout_init(&cfg->jcallout, CALLOUT_MPSAFE);
1724 
1725 	return (cfg);
1726 }
1727 
1728 /*
1729  * Destroy all hosts callback.
1730  * Called on module unload when all activity already finished, so
1731  * can work without any locks.
1732  */
1733 static NAT64NOINLINE int
1734 nat64lsn_destroy_host(struct nat64lsn_host *nh, struct nat64lsn_cfg *cfg)
1735 {
1736 	struct nat64lsn_portgroup *pg;
1737 	int i;
1738 
1739 	for (i = nh->pg_used; i > 0; i--) {
1740 		pg = PORTGROUP_BYSIDX(cfg, nh, i);
1741 		if (pg == NULL)
1742 			continue;
1743 		cfg->pg[pg->idx] = NULL;
1744 		destroy_portgroup(pg);
1745 		nh->pg_used--;
1746 	}
1747 	destroy_host6(nh);
1748 	cfg->ihcount--;
1749 	return (0);
1750 }
1751 
1752 void
1753 nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg)
1754 {
1755 	struct nat64lsn_host *nh, *tmp;
1756 
1757 	JQUEUE_LOCK();
1758 	callout_drain(&cfg->jcallout);
1759 	JQUEUE_UNLOCK();
1760 
1761 	callout_drain(&cfg->periodic);
1762 	I6HASH_FOREACH_SAFE(cfg, nh, tmp, nat64lsn_destroy_host, cfg);
1763 	DPRINTF(DP_OBJ, "instance %s: hosts %d", cfg->name, cfg->ihcount);
1764 
1765 	COUNTER_ARRAY_FREE(cfg->stats.stats, NAT64STATS);
1766 	free(cfg->ih, M_IPFW);
1767 	free(cfg->pg, M_IPFW);
1768 	free(cfg, M_IPFW);
1769 }
1770 
1771