xref: /freebsd/sys/netpfil/pf/pf_lb.c (revision f1ddb6fb8c4d051a205dae3a848776c9d56f86ff)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2001 Daniel Hartmeier
5  * Copyright (c) 2002 - 2008 Henning Brauer
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  *    - Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    - Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following
16  *      disclaimer in the documentation and/or other materials provided
17  *      with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * Effort sponsored in part by the Defense Advanced Research Projects
33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35  *
36  *	$OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $
37  */
38 
39 #include <sys/cdefs.h>
40 #include "opt_pf.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 
44 #include <sys/param.h>
45 #include <sys/lock.h>
46 #include <sys/mbuf.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/vnet.h>
53 #include <net/pfvar.h>
54 #include <net/if_pflog.h>
55 
56 #ifdef INET
57 #include <netinet/in_var.h>
58 #endif
59 
60 #ifdef INET6
61 #include <netinet6/in6_var.h>
62 #endif
63 
64 
/*
 * Limit the amount of work we do to find a free source port for redirects that
 * introduce a state conflict.
 *
 * This per-VNET value (default 16) bounds the combined number of candidate
 * source ports probed by the upward and downward scans in the PF_RDR path of
 * pf_get_translation().
 */
#define	V_pf_rdr_srcport_rewrite_tries	VNET(pf_rdr_srcport_rewrite_tries)
VNET_DEFINE_STATIC(int, pf_rdr_srcport_rewrite_tries) = 16;
71 
/*
 * Print a debug message when the configured pf debug level is at least n.
 * x is a complete, parenthesized printf() argument list, e.g.
 *	DPFPRINTF(PF_DEBUG_MISC, ("pf: value %d\n", v));
 * The body is wrapped in do/while (0) so the expansion is a single statement
 * and cannot capture an "else" that follows the macro invocation.
 */
#define DPFPRINTF(n, x)					\
	do {						\
		if (V_pf_status.debug >= (n))		\
			printf x;			\
	} while (0)
73 
/* Forward declarations of helpers that are private to this file. */

/* Keyed hash of an address; used for PF_POOL_SRCHASH address selection. */
static void		 pf_hash(struct pf_addr *, struct pf_addr *,
			    struct pf_poolhashkey *, sa_family_t);
/* Find the first matching rule in one of the translation rulesets. */
static struct pf_krule	*pf_match_translation(struct pf_pdesc *,
			    int, struct pf_kanchor_stackframe *);
/* Pick a translated source address and port; returns 0 on success. */
static int pf_get_sport(struct pf_pdesc *, struct pf_krule *,
    struct pf_addr *, uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **,
    struct pf_srchash **, struct pf_kpool *, struct pf_udp_mapping **);
/* True only for AF_INET6 link-local addresses. */
static bool		 pf_islinklocal(const sa_family_t, const struct pf_addr *);
82 
/*
 * Three-word mixing step used by pf_hash().  Every argument must be a
 * modifiable lvalue (pf_hash() passes u_int32_t variables); all three are
 * updated in place.
 */
#define mix(a,b,c) \
	do {						\
		(a) -= (b);				\
		(a) -= (c);				\
		(a) ^= ((c) >> 13);			\
		(b) -= (c);				\
		(b) -= (a);				\
		(b) ^= ((a) << 8);			\
		(c) -= (a);				\
		(c) -= (b);				\
		(c) ^= ((b) >> 13);			\
		(a) -= (b);				\
		(a) -= (c);				\
		(a) ^= ((c) >> 12);			\
		(b) -= (c);				\
		(b) -= (a);				\
		(b) ^= ((a) << 16);			\
		(c) -= (a);				\
		(c) -= (b);				\
		(c) ^= ((b) >> 5);			\
		(a) -= (b);				\
		(a) -= (c);				\
		(a) ^= ((c) >> 3);			\
		(b) -= (c);				\
		(b) -= (a);				\
		(b) ^= ((a) << 10);			\
		(c) -= (a);				\
		(c) -= (b);				\
		(c) ^= ((b) >> 15);			\
	} while (0)
95 
96 /*
97  * hash function based on bridge_hash in if_bridge.c
98  */
99 static void
100 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
101     struct pf_poolhashkey *key, sa_family_t af)
102 {
103 	u_int32_t	a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
104 
105 	switch (af) {
106 #ifdef INET
107 	case AF_INET:
108 		a += inaddr->addr32[0];
109 		b += key->key32[1];
110 		mix(a, b, c);
111 		hash->addr32[0] = c + key->key32[2];
112 		break;
113 #endif /* INET */
114 #ifdef INET6
115 	case AF_INET6:
116 		a += inaddr->addr32[0];
117 		b += inaddr->addr32[2];
118 		mix(a, b, c);
119 		hash->addr32[0] = c;
120 		a += inaddr->addr32[1];
121 		b += inaddr->addr32[3];
122 		c += key->key32[1];
123 		mix(a, b, c);
124 		hash->addr32[1] = c;
125 		a += inaddr->addr32[2];
126 		b += inaddr->addr32[1];
127 		c += key->key32[2];
128 		mix(a, b, c);
129 		hash->addr32[2] = c;
130 		a += inaddr->addr32[3];
131 		b += inaddr->addr32[0];
132 		c += key->key32[3];
133 		mix(a, b, c);
134 		hash->addr32[3] = c;
135 		break;
136 #endif /* INET6 */
137 	}
138 }
139 
140 static struct pf_krule *
141 pf_match_translation(struct pf_pdesc *pd,
142     int rs_num, struct pf_kanchor_stackframe *anchor_stack)
143 {
144 	struct pf_krule		*r, *rm = NULL;
145 	struct pf_kruleset	*ruleset = NULL;
146 	int			 tag = -1;
147 	int			 rtableid = -1;
148 	int			 asd = 0;
149 
150 	r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
151 	while (r != NULL) {
152 		struct pf_rule_addr	*src = NULL, *dst = NULL;
153 		struct pf_addr_wrap	*xdst = NULL;
154 
155 		if (r->action == PF_BINAT && pd->dir == PF_IN) {
156 			src = &r->dst;
157 			if (r->rdr.cur != NULL)
158 				xdst = &r->rdr.cur->addr;
159 		} else {
160 			src = &r->src;
161 			dst = &r->dst;
162 		}
163 
164 		pf_counter_u64_add(&r->evaluations, 1);
165 		if (pfi_kkif_match(r->kif, pd->kif) == r->ifnot)
166 			r = r->skip[PF_SKIP_IFP];
167 		else if (r->direction && r->direction != pd->dir)
168 			r = r->skip[PF_SKIP_DIR];
169 		else if (r->af && r->af != pd->af)
170 			r = r->skip[PF_SKIP_AF];
171 		else if (r->proto && r->proto != pd->proto)
172 			r = r->skip[PF_SKIP_PROTO];
173 		else if (PF_MISMATCHAW(&src->addr, &pd->nsaddr, pd->af,
174 		    src->neg, pd->kif, M_GETFIB(pd->m)))
175 			r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
176 			    PF_SKIP_DST_ADDR];
177 		else if (src->port_op && !pf_match_port(src->port_op,
178 		    src->port[0], src->port[1], pd->nsport))
179 			r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
180 			    PF_SKIP_DST_PORT];
181 		else if (dst != NULL &&
182 		    PF_MISMATCHAW(&dst->addr, &pd->ndaddr, pd->af, dst->neg, NULL,
183 		    M_GETFIB(pd->m)))
184 			r = r->skip[PF_SKIP_DST_ADDR];
185 		else if (xdst != NULL && PF_MISMATCHAW(xdst, &pd->ndaddr, pd->af,
186 		    0, NULL, M_GETFIB(pd->m)))
187 			r = TAILQ_NEXT(r, entries);
188 		else if (dst != NULL && dst->port_op &&
189 		    !pf_match_port(dst->port_op, dst->port[0],
190 		    dst->port[1], pd->ndport))
191 			r = r->skip[PF_SKIP_DST_PORT];
192 		else if (r->match_tag && !pf_match_tag(pd->m, r, &tag,
193 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
194 			r = TAILQ_NEXT(r, entries);
195 		else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
196 		    IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd,
197 		    &pd->hdr.tcp), r->os_fingerprint)))
198 			r = TAILQ_NEXT(r, entries);
199 		else {
200 			if (r->tag)
201 				tag = r->tag;
202 			if (r->rtableid >= 0)
203 				rtableid = r->rtableid;
204 			if (r->anchor == NULL) {
205 				rm = r;
206 				if (rm->action == PF_NONAT ||
207 				    rm->action == PF_NORDR ||
208 				    rm->action == PF_NOBINAT) {
209 					rm = NULL;
210 				}
211 				break;
212 			} else
213 				pf_step_into_anchor(anchor_stack, &asd,
214 				    &ruleset, rs_num, &r, NULL, NULL);
215 		}
216 		if (r == NULL)
217 			pf_step_out_of_anchor(anchor_stack, &asd, &ruleset,
218 			    rs_num, &r, NULL, NULL);
219 	}
220 
221 	if (tag > 0 && pf_tag_packet(pd, tag))
222 		return (NULL);
223 	if (rtableid >= 0)
224 		M_SETFIB(pd->m, rtableid);
225 
226 	return (rm);
227 }
228 
229 static int
230 pf_get_sport(struct pf_pdesc *pd, struct pf_krule *r,
231     struct pf_addr *naddr, uint16_t *nport, uint16_t low,
232     uint16_t high, struct pf_ksrc_node **sn,
233     struct pf_srchash **sh, struct pf_kpool *rpool,
234     struct pf_udp_mapping **udp_mapping)
235 {
236 	struct pf_state_key_cmp	key;
237 	struct pf_addr		init_addr;
238 
239 	bzero(&init_addr, sizeof(init_addr));
240 
241 	if (! TAILQ_EMPTY(&r->nat.list) &&
242 	    pf_map_addr_sn(pd->naf, r, &pd->nsaddr, naddr, NULL, &init_addr,
243 	    sn, sh, &r->nat))
244 		return (1);
245 
246 	if (udp_mapping) {
247 		MPASS(*udp_mapping == NULL);
248 	}
249 
250 	/*
251 	 * If we are UDP and have an existing mapping we can get source port
252 	 * from the mapping. In this case we have to look up the src_node as
253 	 * pf_map_addr would.
254 	 */
255 	if (pd->proto == IPPROTO_UDP && (r->rdr.opts & PF_POOL_ENDPI)) {
256 		struct pf_udp_endpoint_cmp udp_source;
257 
258 		bzero(&udp_source, sizeof(udp_source));
259 		udp_source.af = pd->af;
260 		PF_ACPY(&udp_source.addr, &pd->nsaddr, pd->af);
261 		udp_source.port = pd->nsport;
262 		if (udp_mapping) {
263 			*udp_mapping = pf_udp_mapping_find(&udp_source);
264 			if (*udp_mapping) {
265 				PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, pd->af);
266 				*nport = (*udp_mapping)->endpoints[1].port;
267 				/* Try to find a src_node as per pf_map_addr(). */
268 				if (*sn == NULL && r->rdr.opts & PF_POOL_STICKYADDR &&
269 				    (r->rdr.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
270 					*sn = pf_find_src_node(&pd->nsaddr, r, pd->af, sh, false);
271 				if (*sn != NULL)
272 					PF_SRC_NODE_UNLOCK(*sn);
273 				return (0);
274 			} else {
275 				*udp_mapping = pf_udp_mapping_create(pd->af, &pd->nsaddr,
276 				    pd->nsport, &init_addr, 0);
277 				if (*udp_mapping == NULL)
278 					return (1);
279 			}
280 		}
281 	}
282 
283 	if (pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL, &init_addr,
284 	    sn, sh, rpool))
285 		goto failed;
286 
287 	if (pd->proto == IPPROTO_ICMP) {
288 		if (*nport == htons(ICMP_ECHO)) {
289 			low = 1;
290 			high = 65535;
291 		} else
292 			return (0);	/* Don't try to modify non-echo ICMP */
293 	}
294 #ifdef INET6
295 	if (pd->proto == IPPROTO_ICMPV6) {
296 		if (*nport == htons(ICMP6_ECHO_REQUEST)) {
297 			low = 1;
298 			high = 65535;
299 		} else
300 			return (0);	/* Don't try to modify non-echo ICMP */
301 	}
302 #endif /* INET6 */
303 
304 	bzero(&key, sizeof(key));
305 	key.af = pd->naf;
306 	key.proto = pd->proto;
307 	key.port[0] = pd->ndport;
308 	PF_ACPY(&key.addr[0], &pd->ndaddr, key.af);
309 
310 	do {
311 		PF_ACPY(&key.addr[1], naddr, key.af);
312 		if (udp_mapping && *udp_mapping)
313 			PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, pd->af);
314 
315 		/*
316 		 * port search; start random, step;
317 		 * similar 2 portloop in in_pcbbind
318 		 */
319 		if (pd->proto == IPPROTO_SCTP) {
320 			key.port[1] = pd->nsport;
321 			if (!pf_find_state_all_exists(&key, PF_IN)) {
322 				*nport = pd->nsport;
323 				return (0);
324 			} else {
325 				return (1); /* Fail mapping. */
326 			}
327 		} else if (!(pd->proto == IPPROTO_TCP || pd->proto == IPPROTO_UDP ||
328 		    pd->proto == IPPROTO_ICMP) || (low == 0 && high == 0)) {
329 			/*
330 			 * XXX bug: icmp states don't use the id on both sides.
331 			 * (traceroute -I through nat)
332 			 */
333 			key.port[1] = pd->nsport;
334 			if (!pf_find_state_all_exists(&key, PF_IN)) {
335 				*nport = pd->nsport;
336 				return (0);
337 			}
338 		} else if (low == high) {
339 			key.port[1] = htons(low);
340 			if (!pf_find_state_all_exists(&key, PF_IN)) {
341 				if (udp_mapping && *udp_mapping != NULL) {
342 					(*udp_mapping)->endpoints[1].port = htons(low);
343 					if (pf_udp_mapping_insert(*udp_mapping) == 0) {
344 						*nport = htons(low);
345 						return (0);
346 					}
347 				} else {
348 					*nport = htons(low);
349 					return (0);
350 				}
351 			}
352 		} else {
353 			uint32_t tmp;
354 			uint16_t cut;
355 
356 			if (low > high) {
357 				tmp = low;
358 				low = high;
359 				high = tmp;
360 			}
361 			/* low < high */
362 			cut = arc4random() % (1 + high - low) + low;
363 			/* low <= cut <= high */
364 			for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) {
365 				if (udp_mapping && *udp_mapping != NULL) {
366 					(*udp_mapping)->endpoints[1].port = htons(tmp);
367 					if (pf_udp_mapping_insert(*udp_mapping) == 0) {
368 						*nport = htons(tmp);
369 						return (0);
370 					}
371 				} else {
372 					key.port[1] = htons(tmp);
373 					if (!pf_find_state_all_exists(&key, PF_IN)) {
374 						*nport = htons(tmp);
375 						return (0);
376 					}
377 				}
378 			}
379 			tmp = cut;
380 			for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) {
381 				if (pd->proto == IPPROTO_UDP &&
382 				    (r->rdr.opts & PF_POOL_ENDPI &&
383 				    udp_mapping != NULL)) {
384 					(*udp_mapping)->endpoints[1].port = htons(tmp);
385 					if (pf_udp_mapping_insert(*udp_mapping) == 0) {
386 						*nport = htons(tmp);
387 						return (0);
388 					}
389 				} else {
390 					key.port[1] = htons(tmp);
391 					if (!pf_find_state_all_exists(&key, PF_IN)) {
392 						*nport = htons(tmp);
393 						return (0);
394 					}
395 				}
396 			}
397 		}
398 
399 		switch (r->rdr.opts & PF_POOL_TYPEMASK) {
400 		case PF_POOL_RANDOM:
401 		case PF_POOL_ROUNDROBIN:
402 			/*
403 			 * pick a different source address since we're out
404 			 * of free port choices for the current one.
405 			 */
406 			(*sn) = NULL;
407 			if (pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL,
408 			    &init_addr, sn, sh, &r->rdr))
409 				return (1);
410 			break;
411 		case PF_POOL_NONE:
412 		case PF_POOL_SRCHASH:
413 		case PF_POOL_BITMASK:
414 		default:
415 			return (1);
416 		}
417 	} while (! PF_AEQ(&init_addr, naddr, pd->naf) );
418 
419 failed:
420 	if (udp_mapping) {
421 		uma_zfree(V_pf_udp_mapping_z, *udp_mapping);
422 		*udp_mapping = NULL;
423 	}
424 
425 	return (1);					/* none available */
426 }
427 
428 static bool
429 pf_islinklocal(const sa_family_t af, const struct pf_addr *addr)
430 {
431 	if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr->v6))
432 		return (true);
433 	return (false);
434 }
435 
436 static int
437 pf_get_mape_sport(struct pf_pdesc *pd, struct pf_krule *r,
438     struct pf_addr *naddr, uint16_t *nport,
439     struct pf_ksrc_node **sn, struct pf_srchash **sh,
440     struct pf_udp_mapping **udp_mapping)
441 {
442 	uint16_t psmask, low, highmask;
443 	uint16_t i, ahigh, cut;
444 	int ashift, psidshift;
445 
446 	ashift = 16 - r->rdr.mape.offset;
447 	psidshift = ashift - r->rdr.mape.psidlen;
448 	psmask = r->rdr.mape.psid & ((1U << r->rdr.mape.psidlen) - 1);
449 	psmask = psmask << psidshift;
450 	highmask = (1U << psidshift) - 1;
451 
452 	ahigh = (1U << r->rdr.mape.offset) - 1;
453 	cut = arc4random() & ahigh;
454 	if (cut == 0)
455 		cut = 1;
456 
457 	for (i = cut; i <= ahigh; i++) {
458 		low = (i << ashift) | psmask;
459 		if (!pf_get_sport(pd, r,
460 		    naddr, nport, low, low | highmask, sn, sh, &r->rdr,
461 		    udp_mapping))
462 			return (0);
463 	}
464 	for (i = cut - 1; i > 0; i--) {
465 		low = (i << ashift) | psmask;
466 		if (!pf_get_sport(pd, r,
467 		    naddr, nport, low, low | highmask, sn, sh, &r->rdr,
468 		    udp_mapping))
469 			return (0);
470 	}
471 	return (1);
472 }
473 
474 u_short
475 pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
476     struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr,
477     struct pf_kpool *rpool)
478 {
479 	u_short			 reason = PFRES_MATCH;
480 	struct pf_addr		*raddr = NULL, *rmask = NULL;
481 
482 	mtx_lock(&rpool->mtx);
483 	/* Find the route using chosen algorithm. Store the found route
484 	   in src_node if it was given or found. */
485 	if (rpool->cur->addr.type == PF_ADDR_NOROUTE) {
486 		reason = PFRES_MAPFAILED;
487 		goto done_pool_mtx;
488 	}
489 	if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
490 		switch (af) {
491 #ifdef INET
492 		case AF_INET:
493 			if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
494 			    (rpool->opts & PF_POOL_TYPEMASK) !=
495 			    PF_POOL_ROUNDROBIN) {
496 				reason = PFRES_MAPFAILED;
497 				goto done_pool_mtx;
498 			}
499 			raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
500 			rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
501 			break;
502 #endif /* INET */
503 #ifdef INET6
504 		case AF_INET6:
505 			if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
506 			    (rpool->opts & PF_POOL_TYPEMASK) !=
507 			    PF_POOL_ROUNDROBIN) {
508 				reason = PFRES_MAPFAILED;
509 				goto done_pool_mtx;
510 			}
511 			raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
512 			rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
513 			break;
514 #endif /* INET6 */
515 		}
516 	} else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
517 		if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) {
518 			reason = PFRES_MAPFAILED;
519 			goto done_pool_mtx; /* unsupported */
520 		}
521 	} else {
522 		raddr = &rpool->cur->addr.v.a.addr;
523 		rmask = &rpool->cur->addr.v.a.mask;
524 	}
525 
526 	switch (rpool->opts & PF_POOL_TYPEMASK) {
527 	case PF_POOL_NONE:
528 		PF_ACPY(naddr, raddr, af);
529 		break;
530 	case PF_POOL_BITMASK:
531 		PF_POOLMASK(naddr, raddr, rmask, saddr, af);
532 		break;
533 	case PF_POOL_RANDOM:
534 		if (init_addr != NULL && PF_AZERO(init_addr, af)) {
535 			switch (af) {
536 #ifdef INET
537 			case AF_INET:
538 				rpool->counter.addr32[0] = htonl(arc4random());
539 				break;
540 #endif /* INET */
541 #ifdef INET6
542 			case AF_INET6:
543 				if (rmask->addr32[3] != 0xffffffff)
544 					rpool->counter.addr32[3] =
545 					    htonl(arc4random());
546 				else
547 					break;
548 				if (rmask->addr32[2] != 0xffffffff)
549 					rpool->counter.addr32[2] =
550 					    htonl(arc4random());
551 				else
552 					break;
553 				if (rmask->addr32[1] != 0xffffffff)
554 					rpool->counter.addr32[1] =
555 					    htonl(arc4random());
556 				else
557 					break;
558 				if (rmask->addr32[0] != 0xffffffff)
559 					rpool->counter.addr32[0] =
560 					    htonl(arc4random());
561 				break;
562 #endif /* INET6 */
563 			}
564 			PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
565 			PF_ACPY(init_addr, naddr, af);
566 
567 		} else {
568 			PF_AINC(&rpool->counter, af);
569 			PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
570 		}
571 		break;
572 	case PF_POOL_SRCHASH:
573 	    {
574 		unsigned char hash[16];
575 
576 		pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
577 		PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
578 		break;
579 	    }
580 	case PF_POOL_ROUNDROBIN:
581 	    {
582 		struct pf_kpooladdr *acur = rpool->cur;
583 
584 		if (rpool->cur->addr.type == PF_ADDR_TABLE) {
585 			if (!pfr_pool_get(rpool->cur->addr.p.tbl,
586 			    &rpool->tblidx, &rpool->counter, af, NULL))
587 				goto get_addr;
588 		} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
589 			if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
590 			    &rpool->tblidx, &rpool->counter, af, pf_islinklocal))
591 				goto get_addr;
592 		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
593 			goto get_addr;
594 
595 	try_next:
596 		if (TAILQ_NEXT(rpool->cur, entries) == NULL)
597 			rpool->cur = TAILQ_FIRST(&rpool->list);
598 		else
599 			rpool->cur = TAILQ_NEXT(rpool->cur, entries);
600 		if (rpool->cur->addr.type == PF_ADDR_TABLE) {
601 			if (pfr_pool_get(rpool->cur->addr.p.tbl,
602 			    &rpool->tblidx, &rpool->counter, af, NULL)) {
603 				/* table contains no address of type 'af' */
604 				if (rpool->cur != acur)
605 					goto try_next;
606 				reason = PFRES_MAPFAILED;
607 				goto done_pool_mtx;
608 			}
609 		} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
610 			rpool->tblidx = -1;
611 			if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
612 			    &rpool->tblidx, &rpool->counter, af, pf_islinklocal)) {
613 				/* table contains no address of type 'af' */
614 				if (rpool->cur != acur)
615 					goto try_next;
616 				reason = PFRES_MAPFAILED;
617 				goto done_pool_mtx;
618 			}
619 		} else {
620 			raddr = &rpool->cur->addr.v.a.addr;
621 			rmask = &rpool->cur->addr.v.a.mask;
622 			PF_ACPY(&rpool->counter, raddr, af);
623 		}
624 
625 	get_addr:
626 		PF_ACPY(naddr, &rpool->counter, af);
627 		if (init_addr != NULL && PF_AZERO(init_addr, af))
628 			PF_ACPY(init_addr, naddr, af);
629 		PF_AINC(&rpool->counter, af);
630 		break;
631 	    }
632 	}
633 
634 	if (nkif)
635 		*nkif = rpool->cur->kif;
636 
637 done_pool_mtx:
638 	mtx_unlock(&rpool->mtx);
639 
640 	if (reason) {
641 		counter_u64_add(V_pf_status.counters[reason], 1);
642 	}
643 
644 	return (reason);
645 }
646 
647 u_short
648 pf_map_addr_sn(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
649     struct pf_addr *naddr, struct pfi_kkif **nkif, struct pf_addr *init_addr,
650     struct pf_ksrc_node **sn, struct pf_srchash **sh, struct pf_kpool *rpool)
651 {
652 	u_short			 reason = 0;
653 
654 	KASSERT(*sn == NULL, ("*sn not NULL"));
655 
656 	/*
657 	 * If this is a sticky-address rule, try to find an existing src_node.
658 	 * Request the sh to be unlocked if sn was not found, as we never
659 	 * insert a new sn when parsing the ruleset.
660 	 */
661 	if (r->rdr.opts & PF_POOL_STICKYADDR &&
662 	    (r->rdr.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
663 		*sn = pf_find_src_node(saddr, r, af, sh, false);
664 
665 	if (*sn != NULL) {
666 		PF_SRC_NODE_LOCK_ASSERT(*sn);
667 
668 		/* If the supplied address is the same as the current one we've
669 		 * been asked before, so tell the caller that there's no other
670 		 * address to be had. */
671 		if (PF_AEQ(naddr, &(*sn)->raddr, af)) {
672 			reason = PFRES_MAPFAILED;
673 			goto done;
674 		}
675 
676 		PF_ACPY(naddr, &(*sn)->raddr, af);
677 		if (nkif)
678 			*nkif = (*sn)->rkif;
679 		if (V_pf_status.debug >= PF_DEBUG_NOISY) {
680 			printf("pf_map_addr: src tracking maps ");
681 			pf_print_host(saddr, 0, af);
682 			printf(" to ");
683 			pf_print_host(naddr, 0, af);
684 			if (nkif)
685 				printf("@%s", (*nkif)->pfik_name);
686 			printf("\n");
687 		}
688 		goto done;
689 	}
690 
691 	/*
692 	 * Source node has not been found. Find a new address and store it
693 	 * in variables given by the caller.
694 	 */
695 	if (pf_map_addr(af, r, saddr, naddr, nkif, init_addr, rpool) != 0) {
696 		/* pf_map_addr() sets reason counters on its own */
697 		goto done;
698 	}
699 
700 	if (V_pf_status.debug >= PF_DEBUG_NOISY &&
701 	    (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
702 		printf("pf_map_addr: selected address ");
703 		pf_print_host(naddr, 0, af);
704 		if (nkif)
705 			printf("@%s", (*nkif)->pfik_name);
706 		printf("\n");
707 	}
708 
709 done:
710 	if ((*sn) != NULL)
711 		PF_SRC_NODE_UNLOCK(*sn);
712 
713 	if (reason) {
714 		counter_u64_add(V_pf_status.counters[reason], 1);
715 	}
716 
717 	return (reason);
718 }
719 
720 u_short
721 pf_get_translation(struct pf_pdesc *pd, int off,
722     struct pf_state_key **skp, struct pf_state_key **nkp,
723     struct pf_kanchor_stackframe *anchor_stack, struct pf_krule **rp,
724     struct pf_udp_mapping **udp_mapping)
725 {
726 	struct pf_krule	*r = NULL;
727 	struct pf_addr	*naddr;
728 	struct pf_ksrc_node	*sn = NULL;
729 	struct pf_srchash	*sh = NULL;
730 	uint16_t	*nportp;
731 	uint16_t	 low, high;
732 	u_short		 reason;
733 
734 	PF_RULES_RASSERT();
735 	KASSERT(*skp == NULL, ("*skp not NULL"));
736 	KASSERT(*nkp == NULL, ("*nkp not NULL"));
737 
738 	*rp = NULL;
739 
740 	if (pd->dir == PF_OUT) {
741 		r = pf_match_translation(pd, PF_RULESET_BINAT, anchor_stack);
742 		if (r == NULL)
743 			r = pf_match_translation(pd, PF_RULESET_NAT, anchor_stack);
744 	} else {
745 		r = pf_match_translation(pd, PF_RULESET_RDR, anchor_stack);
746 		if (r == NULL)
747 			r = pf_match_translation(pd, PF_RULESET_BINAT, anchor_stack);
748 	}
749 
750 	if (r == NULL)
751 		return (PFRES_MAX);
752 
753 	switch (r->action) {
754 	case PF_NONAT:
755 	case PF_NOBINAT:
756 	case PF_NORDR:
757 		return (PFRES_MAX);
758 	}
759 
760 	if (pf_state_key_setup(pd, pd->nsport, pd->ndport, skp, nkp))
761 		return (PFRES_MEMORY);
762 
763 	naddr = &(*nkp)->addr[1];
764 	nportp = &(*nkp)->port[1];
765 
766 	switch (r->action) {
767 	case PF_NAT:
768 		if (pd->proto == IPPROTO_ICMP) {
769 			low = 1;
770 			high = 65535;
771 		} else {
772 			low  = r->rdr.proxy_port[0];
773 			high = r->rdr.proxy_port[1];
774 		}
775 		if (r->rdr.mape.offset > 0) {
776 			if (pf_get_mape_sport(pd, r, naddr, nportp, &sn,
777 			    &sh, udp_mapping)) {
778 				DPFPRINTF(PF_DEBUG_MISC,
779 				    ("pf: MAP-E port allocation (%u/%u/%u)"
780 				    " failed\n",
781 				    r->rdr.mape.offset,
782 				    r->rdr.mape.psidlen,
783 				    r->rdr.mape.psid));
784 				reason = PFRES_MAPFAILED;
785 				goto notrans;
786 			}
787 		} else if (pf_get_sport(pd, r, naddr, nportp, low, high, &sn,
788 		    &sh, &r->rdr, udp_mapping)) {
789 			DPFPRINTF(PF_DEBUG_MISC,
790 			    ("pf: NAT proxy port allocation (%u-%u) failed\n",
791 			    r->rdr.proxy_port[0], r->rdr.proxy_port[1]));
792 			reason = PFRES_MAPFAILED;
793 			goto notrans;
794 		}
795 		break;
796 	case PF_BINAT:
797 		switch (pd->dir) {
798 		case PF_OUT:
799 			if (r->rdr.cur->addr.type == PF_ADDR_DYNIFTL){
800 				switch (pd->af) {
801 #ifdef INET
802 				case AF_INET:
803 					if (r->rdr.cur->addr.p.dyn->
804 					    pfid_acnt4 < 1) {
805 						reason = PFRES_MAPFAILED;
806 						goto notrans;
807 					}
808 					PF_POOLMASK(naddr,
809 					    &r->rdr.cur->addr.p.dyn->
810 					    pfid_addr4,
811 					    &r->rdr.cur->addr.p.dyn->
812 					    pfid_mask4, &pd->nsaddr, AF_INET);
813 					break;
814 #endif /* INET */
815 #ifdef INET6
816 				case AF_INET6:
817 					if (r->rdr.cur->addr.p.dyn->
818 					    pfid_acnt6 < 1) {
819 						reason = PFRES_MAPFAILED;
820 						goto notrans;
821 					}
822 					PF_POOLMASK(naddr,
823 					    &r->rdr.cur->addr.p.dyn->
824 					    pfid_addr6,
825 					    &r->rdr.cur->addr.p.dyn->
826 					    pfid_mask6, &pd->nsaddr, AF_INET6);
827 					break;
828 #endif /* INET6 */
829 				}
830 			} else
831 				PF_POOLMASK(naddr,
832 				    &r->rdr.cur->addr.v.a.addr,
833 				    &r->rdr.cur->addr.v.a.mask, &pd->nsaddr,
834 				    pd->af);
835 			break;
836 		case PF_IN:
837 			if (r->src.addr.type == PF_ADDR_DYNIFTL) {
838 				switch (pd->af) {
839 #ifdef INET
840 				case AF_INET:
841 					if (r->src.addr.p.dyn->pfid_acnt4 < 1) {
842 						reason = PFRES_MAPFAILED;
843 						goto notrans;
844 					}
845 					PF_POOLMASK(naddr,
846 					    &r->src.addr.p.dyn->pfid_addr4,
847 					    &r->src.addr.p.dyn->pfid_mask4,
848 					    &pd->ndaddr, AF_INET);
849 					break;
850 #endif /* INET */
851 #ifdef INET6
852 				case AF_INET6:
853 					if (r->src.addr.p.dyn->pfid_acnt6 < 1) {
854 						reason = PFRES_MAPFAILED;
855 						goto notrans;
856 					}
857 					PF_POOLMASK(naddr,
858 					    &r->src.addr.p.dyn->pfid_addr6,
859 					    &r->src.addr.p.dyn->pfid_mask6,
860 					    &pd->ndaddr, AF_INET6);
861 					break;
862 #endif /* INET6 */
863 				}
864 			} else
865 				PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
866 				    &r->src.addr.v.a.mask, &pd->ndaddr, pd->af);
867 			break;
868 		}
869 		break;
870 	case PF_RDR: {
871 		struct pf_state_key_cmp key;
872 		int tries;
873 		uint16_t cut, low, high, nport;
874 
875 		reason = pf_map_addr_sn(pd->af, r, &pd->nsaddr, naddr, NULL,
876 		    NULL, &sn, &sh, &r->rdr);
877 		if (reason != 0)
878 			goto notrans;
879 		if ((r->rdr.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
880 			PF_POOLMASK(naddr, naddr, &r->rdr.cur->addr.v.a.mask,
881 			    &pd->ndaddr, pd->af);
882 
883 		/* Do not change SCTP ports. */
884 		if (pd->proto == IPPROTO_SCTP)
885 			break;
886 
887 		if (r->rdr.proxy_port[1]) {
888 			uint32_t	tmp_nport;
889 
890 			tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) %
891 			    (r->rdr.proxy_port[1] - r->rdr.proxy_port[0] +
892 			    1)) + r->rdr.proxy_port[0];
893 
894 			/* Wrap around if necessary. */
895 			if (tmp_nport > 65535)
896 				tmp_nport -= 65535;
897 			nport = htons((uint16_t)tmp_nport);
898 		} else if (r->rdr.proxy_port[0])
899 			nport = htons(r->rdr.proxy_port[0]);
900 		else
901 			nport = pd->ndport;
902 
903 		/*
904 		 * Update the destination port.
905 		 */
906 		*nportp = nport;
907 
908 		/*
909 		 * Do we have a source port conflict in the stack state?  Try to
910 		 * modulate the source port if so.  Note that this is racy since
911 		 * the state lookup may not find any matches here but will once
912 		 * pf_create_state() actually instantiates the state.
913 		 */
914 		bzero(&key, sizeof(key));
915 		key.af = pd->af;
916 		key.proto = pd->proto;
917 		key.port[0] = pd->nsport;
918 		PF_ACPY(&key.addr[0], &pd->nsaddr, key.af);
919 		key.port[1] = nport;
920 		PF_ACPY(&key.addr[1], naddr, key.af);
921 
922 		if (!pf_find_state_all_exists(&key, PF_OUT))
923 			break;
924 
925 		tries = 0;
926 
927 		low = 50001;	/* XXX-MJ PF_NAT_PROXY_PORT_LOW/HIGH */
928 		high = 65535;
929 		cut = arc4random() % (1 + high - low) + low;
930 		for (uint32_t tmp = cut;
931 		    tmp <= high && tmp <= UINT16_MAX &&
932 		    tries < V_pf_rdr_srcport_rewrite_tries;
933 		    tmp++, tries++) {
934 			key.port[0] = htons(tmp);
935 			if (!pf_find_state_all_exists(&key, PF_OUT)) {
936 				/* Update the source port. */
937 				(*nkp)->port[0] = htons(tmp);
938 				goto out;
939 			}
940 		}
941 		for (uint32_t tmp = cut - 1;
942 		    tmp >= low && tries < V_pf_rdr_srcport_rewrite_tries;
943 		    tmp--, tries++) {
944 			key.port[0] = htons(tmp);
945 			if (!pf_find_state_all_exists(&key, PF_OUT)) {
946 				/* Update the source port. */
947 				(*nkp)->port[0] = htons(tmp);
948 				goto out;
949 			}
950 		}
951 
952 		/*
953 		 * We failed to find a match.  Push on ahead anyway, let
954 		 * pf_state_insert() be the arbiter of whether the state
955 		 * conflict is tolerable.  In particular, with TCP connections
956 		 * the state may be reused if the TCP state is terminal.
957 		 */
958 		DPFPRINTF(PF_DEBUG_MISC,
959 		    ("pf: RDR source port allocation failed\n"));
960 		break;
961 
962 out:
963 		DPFPRINTF(PF_DEBUG_MISC,
964 		    ("pf: RDR source port allocation %u->%u\n",
965 		    ntohs(pd->nsport), ntohs((*nkp)->port[0])));
966 		break;
967 	}
968 	default:
969 		panic("%s: unknown action %u", __func__, r->action);
970 	}
971 
972 	/* Return success only if translation really happened. */
973 	if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) {
974 		*rp = r;
975 		return (PFRES_MATCH);
976 	}
977 
978 	reason = PFRES_MAX;
979 notrans:
980 	uma_zfree(V_pf_state_key_z, *nkp);
981 	uma_zfree(V_pf_state_key_z, *skp);
982 	*skp = *nkp = NULL;
983 
984 	return (reason);
985 }
986 
/*
 * Compute the translated addresses and ports for an af-to rule, i.e. a rule
 * that changes the packet's address family from pd->af to pd->naf.
 *
 * The new source address and port come from the rule's nat pool via
 * pf_get_sport().  The new destination comes from the rule's rdr pool when it
 * has one; otherwise it is derived from the original destination with
 * inet_nat64().  ICMP echo request/reply types stored in the port fields are
 * converted between their ICMPv4 and ICMPv6 values.  On success pd->nsaddr,
 * pd->ndaddr (and possibly pd->ndport / the ICMP type) are updated in place.
 *
 * Panics if the rule has no nat pool.  Returns 0 on success and -1 on
 * failure; always -1 when the kernel is not built with both INET and INET6.
 */
int
pf_get_transaddr_af(struct pf_krule *r, struct pf_pdesc *pd)
{
#if defined(INET) && defined(INET6)
	struct pf_addr	 ndaddr, nsaddr, naddr;
	u_int16_t	 nport = 0;
	int		 prefixlen = 96;
	struct pf_srchash	*sh = NULL;
	struct pf_ksrc_node	*sns = NULL;

	if (V_pf_status.debug >= PF_DEBUG_MISC) {
		printf("pf: af-to %s %s, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    TAILQ_EMPTY(&r->rdr.list) ? "nat" : "rdr");
		pf_print_host(&pd->nsaddr, pd->nsport, pd->af);
		printf(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->af);
		printf("\n");
	}

	if (TAILQ_EMPTY(&r->nat.list))
		panic("pf_get_transaddr_af: no nat pool for source address");

	/* get source address and port */
	if (pf_get_sport(pd, r, &nsaddr, &nport,
	    r->nat.proxy_port[0], r->nat.proxy_port[1], &sns, &sh, &r->nat, NULL)) {
		/* Newline-terminated like every other debug message here. */
		DPFPRINTF(PF_DEBUG_MISC,
		    ("pf: af-to NAT proxy port allocation (%u-%u) failed\n",
		    r->nat.proxy_port[0], r->nat.proxy_port[1]));
		return (-1);
	}

	/*
	 * For ICMP the "port" fields carry the ICMP type in network byte
	 * order: the destination port for inbound packets, the source port
	 * otherwise.  Translate echo request/reply to the other family.
	 */
	if ((pd->proto == IPPROTO_ICMPV6 && pd->naf == AF_INET) ||
	    (pd->proto == IPPROTO_ICMP && pd->naf == AF_INET6)) {
		u_int16_t	*typep, type;

		typep = (pd->dir == PF_IN) ? &pd->ndport : &pd->nsport;
		type = ntohs(*typep);
		if (pd->naf == AF_INET) {
			/* ICMPv6 -> ICMPv4 */
			if (type == ICMP6_ECHO_REQUEST)
				type = ICMP_ECHO;
			else if (type == ICMP6_ECHO_REPLY)
				type = ICMP_ECHOREPLY;
		} else {
			/* ICMPv4 -> ICMPv6 */
			if (type == ICMP_ECHO)
				type = ICMP6_ECHO_REQUEST;
			else if (type == ICMP_ECHOREPLY)
				type = ICMP6_ECHO_REPLY;
		}
		*typep = htons(type);
	}

	/* get the destination address and port */
	if (! TAILQ_EMPTY(&r->rdr.list)) {
		if (pf_map_addr_sn(pd->naf, r, &nsaddr, &naddr, NULL, NULL,
		    &sns, NULL, &r->rdr))
			return (-1);
		if (r->rdr.proxy_port[0])
			pd->ndport = htons(r->rdr.proxy_port[0]);

		if (pd->naf == AF_INET) {
			/* The prefix is the IPv4 rdr address */
			prefixlen = in_mask2len(
			    (struct in_addr *)&r->rdr.cur->addr.v.a.mask);
			inet_nat46(pd->naf, &pd->ndaddr, &ndaddr, &naddr,
			    prefixlen);
		} else {
			/* The prefix is the IPv6 rdr address */
			prefixlen = in6_mask2len(
			    (struct in6_addr *)&r->rdr.cur->addr.v.a.mask, NULL);
			inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &naddr,
			    prefixlen);
		}
	} else {
		if (pd->naf == AF_INET) {
			/* The prefix is the IPv6 dst address */
			prefixlen = in6_mask2len(
			    (struct in6_addr *)&r->dst.addr.v.a.mask, NULL);
			if (prefixlen < 32)
				prefixlen = 96;
			inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &pd->ndaddr,
			    prefixlen);
		} else {
			/*
			 * The prefix is the IPv6 nat address
			 * (that was stored in pd->nsaddr)
			 */
			prefixlen = in6_mask2len(
			    (struct in6_addr *)&r->nat.cur->addr.v.a.mask, NULL);
			if (prefixlen > 96)
				prefixlen = 96;
			inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &nsaddr,
			    prefixlen);
		}
	}

	PF_ACPY(&pd->nsaddr, &nsaddr, pd->naf);
	PF_ACPY(&pd->ndaddr, &ndaddr, pd->naf);

	if (V_pf_status.debug >= PF_DEBUG_MISC) {
		printf("pf: af-to %s done, prefixlen %d, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    prefixlen);
		pf_print_host(&pd->nsaddr, pd->nsport, pd->naf);
		printf(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->naf);
		printf("\n");
	}

	return (0);
#else
	return (-1);
#endif
}
1115