xref: /freebsd/sys/netpfil/pf/pf_norm.c (revision 3e5645b78f476816ca3b5acc28b29bbafbb9c444)
1 /*-
2  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
3  * Copyright 2011 Alexander Bluhm <bluhm@openbsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  *	$OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34 #include "opt_pf.h"
35 
36 #include <sys/param.h>
37 #include <sys/lock.h>
38 #include <sys/mbuf.h>
39 #include <sys/mutex.h>
40 #include <sys/refcount.h>
41 #include <sys/rwlock.h>
42 #include <sys/socket.h>
43 
44 #include <net/if.h>
45 #include <net/vnet.h>
46 #include <net/pfvar.h>
47 #include <net/if_pflog.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/ip.h>
51 #include <netinet/ip_var.h>
52 #include <netinet6/ip6_var.h>
53 #include <netinet/tcp.h>
54 #include <netinet/tcp_fsm.h>
55 #include <netinet/tcp_seq.h>
56 
57 #ifdef INET6
58 #include <netinet/ip6.h>
59 #endif /* INET6 */
60 
61 struct pf_frent {
62 	TAILQ_ENTRY(pf_frent)	fr_next;
63 	struct mbuf	*fe_m;
64 	uint16_t	fe_hdrlen;	/* ipv4 header length with ip options
65 					   ipv6, extension, fragment header */
66 	uint16_t	fe_extoff;	/* last extension header offset or 0 */
67 	uint16_t	fe_len;		/* fragment length */
68 	uint16_t	fe_off;		/* fragment offset */
69 	uint16_t	fe_mff;		/* more fragment flag */
70 };
71 
72 struct pf_fragment_cmp {
73 	struct pf_addr	frc_src;
74 	struct pf_addr	frc_dst;
75 	uint32_t	frc_id;
76 	sa_family_t	frc_af;
77 	uint8_t		frc_proto;
78 	uint8_t		frc_direction;
79 };
80 
81 struct pf_fragment {
82 	struct pf_fragment_cmp	fr_key;
83 #define fr_src	fr_key.frc_src
84 #define fr_dst	fr_key.frc_dst
85 #define fr_id	fr_key.frc_id
86 #define fr_af	fr_key.frc_af
87 #define fr_proto	fr_key.frc_proto
88 #define fr_direction	fr_key.frc_direction
89 
90 	RB_ENTRY(pf_fragment) fr_entry;
91 	TAILQ_ENTRY(pf_fragment) frag_next;
92 	uint8_t		fr_flags;	/* status flags */
93 #define PFFRAG_SEENLAST		0x0001	/* Seen the last fragment for this packet */
94 #define PFFRAG_NOBUFFER		0x0002	/* Non-buffering fragment cache */
95 #define PFFRAG_DROP		0x0004	/* Drop all fragments */
96 #define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
97 	uint16_t	fr_max;		/* fragment data max */
98 	uint32_t	fr_timeout;
99 	uint16_t	fr_maxlen;	/* maximum length of single fragment */
100 	TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
101 };
102 
103 struct pf_fragment_tag {
104 	uint16_t	ft_hdrlen;	/* header length of reassembled pkt */
105 	uint16_t	ft_extoff;	/* last extension header offset or 0 */
106 	uint16_t	ft_maxlen;	/* maximum fragment payload length */
107 	uint32_t	ft_id;		/* fragment id */
108 };
109 
110 static struct mtx pf_frag_mtx;
111 #define PF_FRAG_LOCK()		mtx_lock(&pf_frag_mtx)
112 #define PF_FRAG_UNLOCK()	mtx_unlock(&pf_frag_mtx)
113 #define PF_FRAG_ASSERT()	mtx_assert(&pf_frag_mtx, MA_OWNED)
114 
115 VNET_DEFINE(uma_zone_t, pf_state_scrub_z);	/* XXX: shared with pfsync */
116 
117 static VNET_DEFINE(uma_zone_t, pf_frent_z);
118 #define	V_pf_frent_z	VNET(pf_frent_z)
119 static VNET_DEFINE(uma_zone_t, pf_frag_z);
120 #define	V_pf_frag_z	VNET(pf_frag_z)
121 
122 TAILQ_HEAD(pf_fragqueue, pf_fragment);
123 TAILQ_HEAD(pf_cachequeue, pf_fragment);
124 static VNET_DEFINE(struct pf_fragqueue,	pf_fragqueue);
125 #define	V_pf_fragqueue			VNET(pf_fragqueue)
126 static VNET_DEFINE(struct pf_cachequeue,	pf_cachequeue);
127 #define	V_pf_cachequeue			VNET(pf_cachequeue)
128 RB_HEAD(pf_frag_tree, pf_fragment);
129 static VNET_DEFINE(struct pf_frag_tree,	pf_frag_tree);
130 #define	V_pf_frag_tree			VNET(pf_frag_tree)
131 static VNET_DEFINE(struct pf_frag_tree,	pf_cache_tree);
132 #define	V_pf_cache_tree			VNET(pf_cache_tree)
133 static int		 pf_frag_compare(struct pf_fragment *,
134 			    struct pf_fragment *);
135 static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
136 static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
137 
138 static void	pf_flush_fragments(void);
139 static void	pf_free_fragment(struct pf_fragment *);
140 static void	pf_remove_fragment(struct pf_fragment *);
141 static int	pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
142 		    struct tcphdr *, int, sa_family_t);
143 static struct pf_frent *pf_create_fragment(u_short *);
144 static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key,
145 		    struct pf_frag_tree *tree);
146 static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *,
147 		    struct pf_frent *, u_short *);
148 static int	pf_isfull_fragment(struct pf_fragment *);
149 static struct mbuf *pf_join_fragment(struct pf_fragment *);
150 #ifdef INET
151 static void	pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t);
152 static int	pf_reassemble(struct mbuf **, struct ip *, int, u_short *);
153 static struct mbuf *pf_fragcache(struct mbuf **, struct ip*,
154 		    struct pf_fragment **, int, int, int *);
155 #endif	/* INET */
156 #ifdef INET6
157 static int	pf_reassemble6(struct mbuf **, struct ip6_hdr *,
158 		    struct ip6_frag *, uint16_t, uint16_t, int, u_short *);
159 static void	pf_scrub_ip6(struct mbuf **, uint8_t);
160 #endif	/* INET6 */
161 
162 #define	DPFPRINTF(x) do {				\
163 	if (V_pf_status.debug >= PF_DEBUG_MISC) {	\
164 		printf("%s: ", __func__);		\
165 		printf x ;				\
166 	}						\
167 } while(0)
168 
169 #ifdef INET
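/*
 * Build the reassembly lookup key (addresses, IP identification,
 * protocol and direction) from an IPv4 header.
 */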
170 static void
171 pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key)
172 {
173 
174 	key->frc_src.v4 = ip->ip_src;
175 	key->frc_dst.v4 = ip->ip_dst;
176 	key->frc_af = AF_INET;
177 	key->frc_proto = ip->ip_p;
178 	key->frc_id = ip->ip_id;
179 	key->frc_direction = dir;
180 }
181 #endif	/* INET */
182 
183 void
184 pf_normalize_init(void)
185 {
186 
187 	V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
188 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
189 	V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
190 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
191 	V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
192 	    sizeof(struct pf_state_scrub),  NULL, NULL, NULL, NULL,
193 	    UMA_ALIGN_PTR, 0);
194 
195 	V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z;
196 	V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
197 	uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT);
198 	uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached");
199 
200 	mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF);
201 
202 	TAILQ_INIT(&V_pf_fragqueue);
203 	TAILQ_INIT(&V_pf_cachequeue);
204 }
205 
206 void
207 pf_normalize_cleanup(void)
208 {
209 
210 	uma_zdestroy(V_pf_state_scrub_z);
211 	uma_zdestroy(V_pf_frent_z);
212 	uma_zdestroy(V_pf_frag_z);
213 
214 	mtx_destroy(&pf_frag_mtx);
215 }
216 
217 static int
218 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
219 {
220 	int	diff;
221 
222 	if ((diff = a->fr_id - b->fr_id) != 0)
223 		return (diff);
224 	if ((diff = a->fr_proto - b->fr_proto) != 0)
225 		return (diff);
226 	if ((diff = a->fr_af - b->fr_af) != 0)
227 		return (diff);
228 	if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
229 		return (diff);
230 	if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
231 		return (diff);
232 	return (0);
233 }
234 
235 void
236 pf_purge_expired_fragments(void)
237 {
238 	struct pf_fragment	*frag;
239 	u_int32_t		 expire = time_uptime -
240 				    V_pf_default_rule.timeout[PFTM_FRAG];
241 
242 	PF_FRAG_LOCK();
243 	while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) {
244 		KASSERT((BUFFER_FRAGMENTS(frag)),
245 		    ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__));
246 		if (frag->fr_timeout > expire)
247 			break;
248 
249 		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
250 		pf_free_fragment(frag);
251 	}
252 
253 	while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) {
254 		KASSERT((!BUFFER_FRAGMENTS(frag)),
255 		    ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__));
256 		if (frag->fr_timeout > expire)
257 			break;
258 
259 		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
260 		pf_free_fragment(frag);
261 		KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) ||
262 		    TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag),
263 		    ("!(TAILQ_EMPTY() || TAILQ_LAST() == frag): %s",
264 		    __FUNCTION__));
265 	}
266 	PF_FRAG_UNLOCK();
267 }
268 
269 /*
270  * Try to flush old fragments to make space for new ones
271  */
272 static void
273 pf_flush_fragments(void)
274 {
275 	struct pf_fragment	*frag, *cache;
276 	int			 goal;
277 
278 	PF_FRAG_ASSERT();
279 
280 	goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
281 	DPFPRINTF(("trying to free %d frag entries\n", goal));
282 	while (goal < uma_zone_get_cur(V_pf_frent_z)) {
283 		frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
284 		if (frag)
285 			pf_free_fragment(frag);
286 		cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue);
287 		if (cache)
288 			pf_free_fragment(cache);
289 		if (frag == NULL && cache == NULL)
290 			break;
291 	}
292 }
293 
294 /* Frees the fragments and all associated entries */
295 static void
296 pf_free_fragment(struct pf_fragment *frag)
297 {
298 	struct pf_frent		*frent;
299 
300 	PF_FRAG_ASSERT();
301 
302 	/* Free all fragments */
303 	if (BUFFER_FRAGMENTS(frag)) {
304 		for (frent = TAILQ_FIRST(&frag->fr_queue); frent;
305 		    frent = TAILQ_FIRST(&frag->fr_queue)) {
306 			TAILQ_REMOVE(&frag->fr_queue, frent, fr_next);
307 
308 			m_freem(frent->fe_m);
309 			uma_zfree(V_pf_frent_z, frent);
310 		}
311 	} else {
312 		for (frent = TAILQ_FIRST(&frag->fr_queue); frent;
313 		    frent = TAILQ_FIRST(&frag->fr_queue)) {
314 			TAILQ_REMOVE(&frag->fr_queue, frent, fr_next);
315 
316 			KASSERT((TAILQ_EMPTY(&frag->fr_queue) ||
317 			    TAILQ_FIRST(&frag->fr_queue)->fe_off >
318 			    frent->fe_len),
319 			    ("! (TAILQ_EMPTY() || TAILQ_FIRST()->fe_off >"
320 			    " frent->fe_len): %s", __func__));
321 
322 			uma_zfree(V_pf_frent_z, frent);
323 		}
324 	}
325 
326 	pf_remove_fragment(frag);
327 }
328 
329 static struct pf_fragment *
330 pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
331 {
332 	struct pf_fragment	*frag;
333 
334 	PF_FRAG_ASSERT();
335 
336 	frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
337 	if (frag != NULL) {
338 		/* XXX Are we sure we want to update the timeout? */
339 		frag->fr_timeout = time_uptime;
340 		if (BUFFER_FRAGMENTS(frag)) {
341 			TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
342 			TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
343 		} else {
344 			TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
345 			TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next);
346 		}
347 	}
348 
349 	return (frag);
350 }
351 
352 /* Removes a fragment from the fragment queue and frees the fragment */
353 static void
354 pf_remove_fragment(struct pf_fragment *frag)
355 {
356 
357 	PF_FRAG_ASSERT();
358 
359 	if (BUFFER_FRAGMENTS(frag)) {
360 		RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
361 		TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
362 		uma_zfree(V_pf_frag_z, frag);
363 	} else {
364 		RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag);
365 		TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
366 		uma_zfree(V_pf_frag_z, frag);
367 	}
368 }
369 
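/*
 * Allocate a fragment queue entry; on failure flush old fragments and
 * retry once before giving up with PFRES_MEMORY.
 */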
370 static struct pf_frent *
371 pf_create_fragment(u_short *reason)
372 {
373 	struct pf_frent *frent;
374 
375 	PF_FRAG_ASSERT();
376 
377 	frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
378 	if (frent == NULL) {
379 		pf_flush_fragments();
380 		frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
381 		if (frent == NULL) {
382 			REASON_SET(reason, PFRES_MEMORY);
383 			return (NULL);
384 		}
385 	}
386 
387 	return (frent);
388 }
389 
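/*
 * Validate the fragment, find or create the reassembly queue for its
 * key, trim any overlap with already queued data and insert the entry
 * in offset order.  Returns NULL (and sets *reason) if the fragment is
 * bad or memory is exhausted.
 */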
390 static struct pf_fragment *
391 pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
392 		u_short *reason)
393 {
394 	struct pf_frent		*after, *next, *prev;
395 	struct pf_fragment	*frag;
396 	uint16_t		total;
397 
398 	PF_FRAG_ASSERT();
399 
400 	/* No empty fragments. */
401 	if (frent->fe_len == 0) {
402 		DPFPRINTF(("bad fragment: len 0"));
403 		goto bad_fragment;
404 	}
405 
406 	/* All fragments are 8 byte aligned. */
407 	if (frent->fe_mff && (frent->fe_len & 0x7)) {
408 		DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len));
409 		goto bad_fragment;
410 	}
411 
412 	/* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */
413 	if (frent->fe_off + frent->fe_len > IP_MAXPACKET) {
414 		DPFPRINTF(("bad fragment: max packet %d",
415 		    frent->fe_off + frent->fe_len));
416 		goto bad_fragment;
417 	}
418 
419 	DPFPRINTF((key->frc_af == AF_INET ?
420 	    "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d",
421 	    key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len));
422 
423 	/* Fully buffer all of the fragments in this fragment queue. */
424 	frag = pf_find_fragment(key, &V_pf_frag_tree);
425 
426 	/* Create a new reassembly queue for this packet. */
427 	if (frag == NULL) {
428 		frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
429 		if (frag == NULL) {
430 			pf_flush_fragments();
431 			frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
432 			if (frag == NULL) {
433 				REASON_SET(reason, PFRES_MEMORY);
434 				goto drop_fragment;
435 			}
436 		}
437 
438 		*(struct pf_fragment_cmp *)frag = *key;
439 		frag->fr_timeout = time_uptime;
440 		frag->fr_maxlen = frent->fe_len;
441 		TAILQ_INIT(&frag->fr_queue);
442 
443 		RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag);
444 		TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
445 
446 		/* We do not have a previous fragment. */
447 		TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next);
448 
449 		return (frag);
450 	}
451 
452 	KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue"));
453 
454 	/* Remember maximum fragment len for refragmentation. */
455 	if (frent->fe_len > frag->fr_maxlen)
456 		frag->fr_maxlen = frent->fe_len;
457 
458 	/* Maximum data we have seen already. */
459 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
460 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
461 
462 	/* Non-terminal fragments must have the more fragments flag. */
463 	if (frent->fe_off + frent->fe_len < total && !frent->fe_mff)
464 		goto bad_fragment;
465 
466 	/* Check if we saw the last fragment already. */
467 	if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) {
468 		if (frent->fe_off + frent->fe_len > total ||
469 		    (frent->fe_off + frent->fe_len == total && frent->fe_mff))
470 			goto bad_fragment;
471 	} else {
472 		if (frent->fe_off + frent->fe_len == total && !frent->fe_mff)
473 			goto bad_fragment;
474 	}
475 
476 	/* Find a fragment after the current one. */
477 	prev = NULL;
478 	TAILQ_FOREACH(after, &frag->fr_queue, fr_next) {
479 		if (after->fe_off > frent->fe_off)
480 			break;
481 		prev = after;
482 	}
483 
484 	KASSERT(prev != NULL || after != NULL,
485 	    ("prev != NULL || after != NULL"));
486 
487 	if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) {
488 		uint16_t precut;
489 
490 		precut = prev->fe_off + prev->fe_len - frent->fe_off;
491 		if (precut >= frent->fe_len)
492 			goto bad_fragment;
493 		DPFPRINTF(("overlap -%d", precut));
494 		m_adj(frent->fe_m, precut);
495 		frent->fe_off += precut;
496 		frent->fe_len -= precut;
497 	}
498 
499 	for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off;
500 	    after = next) {
501 		uint16_t aftercut;
502 
503 		aftercut = frent->fe_off + frent->fe_len - after->fe_off;
504 		DPFPRINTF(("adjust overlap %d", aftercut));
505 		if (aftercut < after->fe_len) {
506 			m_adj(after->fe_m, aftercut);
507 			after->fe_off += aftercut;
508 			after->fe_len -= aftercut;
509 			break;
510 		}
511 
512 		/* This fragment is completely overlapped, lose it. */
513 		next = TAILQ_NEXT(after, fr_next);
514 		m_freem(after->fe_m);
515 		TAILQ_REMOVE(&frag->fr_queue, after, fr_next);
516 		uma_zfree(V_pf_frent_z, after);
517 	}
518 
519 	if (prev == NULL)
520 		TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next);
521 	else
522 		TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next);
523 
524 	return (frag);
525 
526 bad_fragment:
527 	REASON_SET(reason, PFRES_FRAG);
528 drop_fragment:
529 	uma_zfree(V_pf_frent_z, frent);
530 	return (NULL);
531 }
532 
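/*
 * Return 1 once the final fragment has been seen and the queued data
 * has no holes, 0 otherwise.
 */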
533 static int
534 pf_isfull_fragment(struct pf_fragment *frag)
535 {
536 	struct pf_frent	*frent, *next;
537 	uint16_t off, total;
538 
539 	/* Check if we are completely reassembled */
540 	if (TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff)
541 		return (0);
542 
543 	/* Maximum data we have seen already */
544 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
545 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
546 
547 	/* Check if we have all the data */
548 	off = 0;
549 	for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = next) {
550 		next = TAILQ_NEXT(frent, fr_next);
551 
552 		off += frent->fe_len;
553 		if (off < total && (next == NULL || next->fe_off != off)) {
554 			DPFPRINTF(("missing fragment at %d, next %d, total %d",
555 			    off, next == NULL ? -1 : next->fe_off, total));
556 			return (0);
557 		}
558 	}
559 	DPFPRINTF(("%d < %d?", off, total));
560 	if (off < total)
561 		return (0);
562 	KASSERT(off == total, ("off == total"));
563 
564 	return (1);
565 }
566 
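/*
 * Concatenate the queued fragments into a single mbuf chain, keeping
 * the first fragment's header and stripping the others, then free the
 * reassembly state.
 */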
567 static struct mbuf *
568 pf_join_fragment(struct pf_fragment *frag)
569 {
570 	struct mbuf *m, *m2;
571 	struct pf_frent	*frent, *next;
572 
573 	frent = TAILQ_FIRST(&frag->fr_queue);
574 	next = TAILQ_NEXT(frent, fr_next);
575 
576 	m = frent->fe_m;
577 	m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len);
578 	uma_zfree(V_pf_frent_z, frent);
579 	for (frent = next; frent != NULL; frent = next) {
580 		next = TAILQ_NEXT(frent, fr_next);
581 
582 		m2 = frent->fe_m;
583 		/* Strip off ip header. */
584 		m_adj(m2, frent->fe_hdrlen);
585 		/* Strip off any trailing bytes. */
586 		m_adj(m2, frent->fe_len - m2->m_pkthdr.len);
587 
588 		uma_zfree(V_pf_frent_z, frent);
589 		m_cat(m, m2);
590 	}
591 
592 	/* Remove from fragment queue. */
593 	pf_remove_fragment(frag);
594 
595 	return (m);
596 }
597 
598 #ifdef INET
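/*
 * Queue an IPv4 fragment.  While fragments are still outstanding *m0 is
 * set to NULL and PF_PASS is returned; once reassembly completes *m0
 * points to the full packet.  PF_DROP is returned on error.
 */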
599 static int
600 pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason)
601 {
602 	struct mbuf		*m = *m0;
603 	struct pf_frent		*frent;
604 	struct pf_fragment	*frag;
605 	struct pf_fragment_cmp	key;
606 	uint16_t		total, hdrlen;
607 
608 	/* Get an entry for the fragment queue */
609 	if ((frent = pf_create_fragment(reason)) == NULL)
610 		return (PF_DROP);
611 
612 	frent->fe_m = m;
613 	frent->fe_hdrlen = ip->ip_hl << 2;
614 	frent->fe_extoff = 0;
615 	frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2);
616 	frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
617 	frent->fe_mff = ntohs(ip->ip_off) & IP_MF;
618 
619 	pf_ip2key(ip, dir, &key);
620 
621 	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
622 		return (PF_DROP);
623 
624 	/* The mbuf is part of the fragment entry, no direct free or access */
625 	m = *m0 = NULL;
626 
627 	if (!pf_isfull_fragment(frag))
628 		return (PF_PASS);  /* drop because *m0 is NULL, no error */
629 
630 	/* We have all the data */
631 	frent = TAILQ_FIRST(&frag->fr_queue);
632 	KASSERT(frent != NULL, ("frent != NULL"));
633 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
634 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
635 	hdrlen = frent->fe_hdrlen;
636 
637 	m = *m0 = pf_join_fragment(frag);
638 	frag = NULL;
639 
640 	if (m->m_flags & M_PKTHDR) {
641 		int plen = 0;
642 		for (m = *m0; m; m = m->m_next)
643 			plen += m->m_len;
644 		m = *m0;
645 		m->m_pkthdr.len = plen;
646 	}
647 
648 	ip = mtod(m, struct ip *);
649 	ip->ip_len = htons(hdrlen + total);
650 	ip->ip_off &= ~(IP_MF|IP_OFFMASK);
651 
652 	if (hdrlen + total > IP_MAXPACKET) {
653 		DPFPRINTF(("drop: too big: %d", total));
654 		ip->ip_len = 0;
655 		REASON_SET(reason, PFRES_SHORT);
656 		/* PF_DROP requires a valid mbuf *m0 in pf_test() */
657 		return (PF_DROP);
658 	}
659 
660 	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
661 	return (PF_PASS);
662 }
663 #endif	/* INET */
664 
665 #ifdef INET6
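/*
 * Queue an IPv6 fragment.  On completion the fragment header is
 * deleted, the protocol chain is patched up and a PF_REASSEMBLED tag
 * describing the original fragmentation is prepended so that the
 * packet can later be refragmented by pf_refragment6().
 */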
666 static int
667 pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr,
668     uint16_t hdrlen, uint16_t extoff, int dir, u_short *reason)
669 {
670 	struct mbuf		*m = *m0;
671 	struct pf_frent		*frent;
672 	struct pf_fragment	*frag;
673 	struct pf_fragment_cmp	 key;
674 	struct m_tag		*mtag;
675 	struct pf_fragment_tag	*ftag;
676 	int			 off;
677 	uint32_t		 frag_id;
678 	uint16_t		 total, maxlen;
679 	uint8_t			 proto;
680 
681 	PF_FRAG_LOCK();
682 
683 	/* Get an entry for the fragment queue. */
684 	if ((frent = pf_create_fragment(reason)) == NULL) {
685 		PF_FRAG_UNLOCK();
686 		return (PF_DROP);
687 	}
688 
689 	frent->fe_m = m;
690 	frent->fe_hdrlen = hdrlen;
691 	frent->fe_extoff = extoff;
692 	frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen;
693 	frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
694 	frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
695 
696 	key.frc_src.v6 = ip6->ip6_src;
697 	key.frc_dst.v6 = ip6->ip6_dst;
698 	key.frc_af = AF_INET6;
699 	/* Only the first fragment's protocol is relevant. */
700 	key.frc_proto = 0;
701 	key.frc_id = fraghdr->ip6f_ident;
702 	key.frc_direction = dir;
703 
704 	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
705 		PF_FRAG_UNLOCK();
706 		return (PF_DROP);
707 	}
708 
709 	/* The mbuf is part of the fragment entry, no direct free or access. */
710 	m = *m0 = NULL;
711 
712 	if (!pf_isfull_fragment(frag)) {
713 		PF_FRAG_UNLOCK();
714 		return (PF_PASS);  /* Drop because *m0 is NULL, no error. */
715 	}
716 
717 	/* We have all the data. */
718 	extoff = frent->fe_extoff;
719 	maxlen = frag->fr_maxlen;
720 	frag_id = frag->fr_id;
721 	frent = TAILQ_FIRST(&frag->fr_queue);
722 	KASSERT(frent != NULL, ("frent != NULL"));
723 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
724 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
725 	hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag);
726 
727 	m = *m0 = pf_join_fragment(frag);
728 	frag = NULL;
729 
730 	PF_FRAG_UNLOCK();
731 
732 	/* Take protocol from first fragment header. */
733 	m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off);
734 	KASSERT(m, ("%s: short mbuf chain", __func__));
735 	proto = *(mtod(m, caddr_t) + off);
736 	m = *m0;
737 
738 	/* Delete frag6 header */
739 	if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0)
740 		goto fail;
741 
742 	if (m->m_flags & M_PKTHDR) {
743 		int plen = 0;
744 		for (m = *m0; m; m = m->m_next)
745 			plen += m->m_len;
746 		m = *m0;
747 		m->m_pkthdr.len = plen;
748 	}
749 
750 	if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag),
751 	    M_NOWAIT)) == NULL)
752 		goto fail;
753 	ftag = (struct pf_fragment_tag *)(mtag + 1);
754 	ftag->ft_hdrlen = hdrlen;
755 	ftag->ft_extoff = extoff;
756 	ftag->ft_maxlen = maxlen;
757 	ftag->ft_id = frag_id;
758 	m_tag_prepend(m, mtag);
759 
760 	ip6 = mtod(m, struct ip6_hdr *);
761 	ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total);
762 	if (extoff) {
763 		/* Write protocol into next field of last extension header. */
764 		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
765 		    &off);
766 		KASSERT(m, ("%s: short mbuf chain", __func__));
767 		*(mtod(m, char *) + off) = proto;
768 		m = *m0;
769 	} else
770 		ip6->ip6_nxt = proto;
771 
772 	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
773 		DPFPRINTF(("drop: too big: %d", total));
774 		ip6->ip6_plen = 0;
775 		REASON_SET(reason, PFRES_SHORT);
776 		/* PF_DROP requires a valid mbuf *m0 in pf_test6(). */
777 		return (PF_DROP);
778 	}
779 
780 	DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen)));
781 	return (PF_PASS);
782 
783 fail:
784 	REASON_SET(reason, PFRES_MEMORY);
785 	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */
786 	return (PF_DROP);
787 }
788 #endif	/* INET6 */
789 
790 #ifdef INET
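/*
 * Non-buffering fragment cache: remember which ranges of the packet
 * have already been passed, trim data overlapping previously seen
 * ranges (or mark the packet for drop under the frag drop policy) and
 * let the fragment itself through.  Returns the possibly trimmed mbuf,
 * or NULL if the fragment was consumed.
 */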
791 static struct mbuf *
792 pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
793     int drop, int *nomem)
794 {
795 	struct mbuf		*m = *m0;
796 	struct pf_frent		*frp, *fra, *cur = NULL;
797 	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
798 	u_int16_t		 off = ntohs(h->ip_off) << 3;
799 	u_int16_t		 max = ip_len + off;
800 	int			 hosed = 0;
801 
802 	PF_FRAG_ASSERT();
803 	KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
804 	    ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
805 
806 	/* Create a new range queue for this packet */
807 	if (*frag == NULL) {
808 		*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
809 		if (*frag == NULL) {
810 			pf_flush_fragments();
811 			*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
812 			if (*frag == NULL)
813 				goto no_mem;
814 		}
815 
816 		/* Get an entry for the queue */
817 		cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
818 		if (cur == NULL) {
819 			uma_zfree(V_pf_frag_z, *frag);
820 			*frag = NULL;
821 			goto no_mem;
822 		}
823 
824 		(*frag)->fr_flags = PFFRAG_NOBUFFER;
825 		(*frag)->fr_max = 0;
826 		(*frag)->fr_src.v4 = h->ip_src;
827 		(*frag)->fr_dst.v4 = h->ip_dst;
828 		(*frag)->fr_id = h->ip_id;
829 		(*frag)->fr_timeout = time_uptime;
830 
831 		cur->fe_off = off;
832 		cur->fe_len = max; /* TODO: fe_len = max - off ? */
833 		TAILQ_INIT(&(*frag)->fr_queue);
834 		TAILQ_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);
835 
836 		RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag);
837 		TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next);
838 
839 		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
840 
841 		goto pass;
842 	}
843 
844 	/*
845 	 * Find a fragment after the current one:
846 	 *  - off contains the real shifted offset.
847 	 */
848 	frp = NULL;
849 	TAILQ_FOREACH(fra, &(*frag)->fr_queue, fr_next) {
850 		if (fra->fe_off > off)
851 			break;
852 		frp = fra;
853 	}
854 
855 	KASSERT((frp != NULL || fra != NULL),
856 	    ("!(frp != NULL || fra != NULL): %s", __FUNCTION__));
857 
858 	if (frp != NULL) {
859 		int	precut;
860 
861 		precut = frp->fe_len - off;
862 		if (precut >= ip_len) {
863 			/* Fragment is entirely a duplicate */
864 			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
865 			    h->ip_id, frp->fe_off, frp->fe_len, off, max));
866 			goto drop_fragment;
867 		}
868 		if (precut == 0) {
869 			/* They are adjacent.  Fixup cache entry */
870 			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
871 			    h->ip_id, frp->fe_off, frp->fe_len, off, max));
872 			frp->fe_len = max;
873 		} else if (precut > 0) {
874 			/* The first part of this payload overlaps with a
875 			 * fragment that has already been passed.
876 			 * Need to trim off the first part of the payload.
877 			 * But to do so easily, we need to create another
878 			 * mbuf to throw the original header into.
879 			 */
880 
881 			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
882 			    h->ip_id, precut, frp->fe_off, frp->fe_len, off,
883 			    max));
884 
885 			off += precut;
886 			max -= precut;
887 			/* Update the previous frag to encompass this one */
888 			frp->fe_len = max;
889 
890 			if (!drop) {
891 				/* XXX Optimization opportunity
892 				 * This is a very heavy way to trim the payload.
893 				 * we could do it much faster by diddling mbuf
894 				 * internals but that would be even less legible
895 				 * than this mbuf magic.  For my next trick,
896 				 * I'll pull a rabbit out of my laptop.
897 				 */
898 				*m0 = m_dup(m, M_NOWAIT);
899 				if (*m0 == NULL)
900 					goto no_mem;
901 				/* From KAME Project : We have missed this! */
902 				m_adj(*m0, (h->ip_hl << 2) -
903 				    (*m0)->m_pkthdr.len);
904 
905 				KASSERT(((*m0)->m_next == NULL),
906 				    ("(*m0)->m_next != NULL: %s",
907 				    __FUNCTION__));
908 				m_adj(m, precut + (h->ip_hl << 2));
909 				m_cat(*m0, m);
910 				m = *m0;
911 				if (m->m_flags & M_PKTHDR) {
912 					int plen = 0;
913 					struct mbuf *t;
914 					for (t = m; t; t = t->m_next)
915 						plen += t->m_len;
916 					m->m_pkthdr.len = plen;
917 				}
918 
919 
920 				h = mtod(m, struct ip *);
921 
922 				KASSERT(((int)m->m_len ==
923 				    ntohs(h->ip_len) - precut),
924 				    ("m->m_len != ntohs(h->ip_len) - precut: %s",
925 				    __FUNCTION__));
926 				h->ip_off = htons(ntohs(h->ip_off) +
927 				    (precut >> 3));
928 				h->ip_len = htons(ntohs(h->ip_len) - precut);
929 			} else {
930 				hosed++;
931 			}
932 		} else {
933 			/* There is a gap between fragments */
934 
935 			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
936 			    h->ip_id, -precut, frp->fe_off, frp->fe_len, off,
937 			    max));
938 
939 			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
940 			if (cur == NULL)
941 				goto no_mem;
942 
943 			cur->fe_off = off;
944 			cur->fe_len = max;
945 			TAILQ_INSERT_AFTER(&(*frag)->fr_queue, frp, cur, fr_next);
946 		}
947 	}
948 
949 	if (fra != NULL) {
950 		int	aftercut;
951 		int	merge = 0;
952 
953 		aftercut = max - fra->fe_off;
954 		if (aftercut == 0) {
955 			/* Adjacent fragments */
956 			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
957 			    h->ip_id, off, max, fra->fe_off, fra->fe_len));
958 			fra->fe_off = off;
959 			merge = 1;
960 		} else if (aftercut > 0) {
961 			/* Need to chop off the tail of this fragment */
962 			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
963 			    h->ip_id, aftercut, off, max, fra->fe_off,
964 			    fra->fe_len));
965 			fra->fe_off = off;
966 			max -= aftercut;
967 
968 			merge = 1;
969 
970 			if (!drop) {
971 				m_adj(m, -aftercut);
972 				if (m->m_flags & M_PKTHDR) {
973 					int plen = 0;
974 					struct mbuf *t;
975 					for (t = m; t; t = t->m_next)
976 						plen += t->m_len;
977 					m->m_pkthdr.len = plen;
978 				}
979 				h = mtod(m, struct ip *);
980 				KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut),
981 				    ("m->m_len != ntohs(h->ip_len) - aftercut: %s",
982 				    __FUNCTION__));
983 				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
984 			} else {
985 				hosed++;
986 			}
987 		} else if (frp == NULL) {
988 			/* There is a gap between fragments */
989 			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
990 			    h->ip_id, -aftercut, off, max, fra->fe_off,
991 			    fra->fe_len));
992 
993 			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
994 			if (cur == NULL)
995 				goto no_mem;
996 
997 			cur->fe_off = off;
998 			cur->fe_len = max;
999 			TAILQ_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);
1000 		}
1001 
1002 
1003 		/* Need to glue together two separate fragment descriptors */
1004 		if (merge) {
1005 			if (cur && fra->fe_off <= cur->fe_len) {
1006 				/* Need to merge in a previous 'cur' */
1007 				DPFPRINTF(("fragcache[%d]: adjacent(merge "
1008 				    "%d-%d) %d-%d (%d-%d)\n",
1009 				    h->ip_id, cur->fe_off, cur->fe_len, off,
1010 				    max, fra->fe_off, fra->fe_len));
1011 				fra->fe_off = cur->fe_off;
1012 				TAILQ_REMOVE(&(*frag)->fr_queue, cur, fr_next);
1013 				uma_zfree(V_pf_frent_z, cur);
1014 				cur = NULL;
1015 
1016 			} else if (frp && fra->fe_off <= frp->fe_len) {
1017 				/* Need to merge in a modified 'frp' */
1018 				KASSERT((cur == NULL), ("cur != NULL: %s",
1019 				    __FUNCTION__));
1020 				DPFPRINTF(("fragcache[%d]: adjacent(merge "
1021 				    "%d-%d) %d-%d (%d-%d)\n",
1022 				    h->ip_id, frp->fe_off, frp->fe_len, off,
1023 				    max, fra->fe_off, fra->fe_len));
1024 				fra->fe_off = frp->fe_off;
1025 				TAILQ_REMOVE(&(*frag)->fr_queue, frp, fr_next);
1026 				uma_zfree(V_pf_frent_z, frp);
1027 				frp = NULL;
1028 
1029 			}
1030 		}
1031 	}
1032 
1033 	if (hosed) {
1034 		/*
1035 		 * We must keep tracking the overall fragment even when
1036 		 * we're going to drop it anyway so that we know when to
1037 		 * free the overall descriptor.  Thus we drop the frag late.
1038 		 */
1039 		goto drop_fragment;
1040 	}
1041 
1042 
1043  pass:
1044 	/* Update maximum data size */
1045 	if ((*frag)->fr_max < max)
1046 		(*frag)->fr_max = max;
1047 
1048 	/* This is the last segment */
1049 	if (!mff)
1050 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1051 
1052 	/* Check if we are completely reassembled */
1053 	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
1054 	    TAILQ_FIRST(&(*frag)->fr_queue)->fe_off == 0 &&
1055 	    TAILQ_FIRST(&(*frag)->fr_queue)->fe_len == (*frag)->fr_max) {
1056 		/* Remove from fragment queue */
1057 		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
1058 		    (*frag)->fr_max));
1059 		pf_free_fragment(*frag);
1060 		*frag = NULL;
1061 	}
1062 
1063 	return (m);
1064 
1065  no_mem:
1066 	*nomem = 1;
1067 
1068 	/* Still need to pay attention to !IP_MF */
1069 	if (!mff && *frag != NULL)
1070 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1071 
1072 	m_freem(m);
1073 	return (NULL);
1074 
1075  drop_fragment:
1076 
1077 	/* Still need to pay attention to !IP_MF */
1078 	if (!mff && *frag != NULL)
1079 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
1080 
1081 	if (drop) {
1082 		/* This fragment has been deemed bad.  Don't reass */
1083 		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
1084 			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
1085 			    h->ip_id));
1086 		(*frag)->fr_flags |= PFFRAG_DROP;
1087 	}
1088 
1089 	m_freem(m);
1090 	return (NULL);
1091 }
1092 #endif	/* INET */
1093 
1094 #ifdef INET6
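/*
 * Cut a previously reassembled IPv6 packet back into fragments no
 * larger than the original maximum fragment size and send them out
 * with ip6_forward().
 */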
1095 int
1096 pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag)
1097 {
1098 	struct mbuf		*m = *m0, *t;
1099 	struct pf_fragment_tag	*ftag = (struct pf_fragment_tag *)(mtag + 1);
1100 	struct pf_pdesc		 pd;
1101 	uint32_t		 frag_id;
1102 	uint16_t		 hdrlen, extoff, maxlen;
1103 	uint8_t			 proto;
1104 	int			 error, action;
1105 
1106 	hdrlen = ftag->ft_hdrlen;
1107 	extoff = ftag->ft_extoff;
1108 	maxlen = ftag->ft_maxlen;
1109 	frag_id = ftag->ft_id;
1110 	m_tag_delete(m, mtag);
1111 	mtag = NULL;
1112 	ftag = NULL;
1113 
1114 	if (extoff) {
1115 		int off;
1116 
1117 		/* Use protocol from next field of last extension header */
1118 		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
1119 		    &off);
1120 		KASSERT((m != NULL), ("pf_refragment6: short mbuf chain"));
1121 		proto = *(mtod(m, caddr_t) + off);
1122 		*(mtod(m, char *) + off) = IPPROTO_FRAGMENT;
1123 		m = *m0;
1124 	} else {
1125 		struct ip6_hdr *hdr;
1126 
1127 		hdr = mtod(m, struct ip6_hdr *);
1128 		proto = hdr->ip6_nxt;
1129 		hdr->ip6_nxt = IPPROTO_FRAGMENT;
1130 	}
1131 
1132 	/*
1133 	 * Maxlen may be less than 8 if there was only a single
1134 	 * fragment.  As it was fragmented before, add a fragment
1135 	 * header also for a single fragment.  If total or maxlen
1136 	 * is less than 8, ip6_fragment() will return EMSGSIZE and
1137 	 * we drop the packet.
1138 	 */
1139 	error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id);
1140 	m = (*m0)->m_nextpkt;
1141 	(*m0)->m_nextpkt = NULL;
1142 	if (error == 0) {
1143 		/* The first mbuf contains the unfragmented packet. */
1144 		m_freem(*m0);
1145 		*m0 = NULL;
1146 		action = PF_PASS;
1147 	} else {
1148 		/* Drop expects an mbuf to free. */
1149 		DPFPRINTF(("refragment error %d", error));
1150 		action = PF_DROP;
1151 	}
1152 	for (t = m; m; m = t) {
1153 		t = m->m_nextpkt;
1154 		m->m_nextpkt = NULL;
1155 		m->m_flags |= M_SKIP_FIREWALL;
1156 		memset(&pd, 0, sizeof(pd));
1157 		pd.pf_mtag = pf_find_mtag(m);
1158 		if (error == 0)
1159 			ip6_forward(m, 0);
1160 		else
1161 			m_freem(m);
1162 	}
1163 
1164 	return (action);
1165 }
1166 #endif /* INET6 */
1167 
1168 #ifdef INET
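/*
 * IPv4 normalization entry point: match the packet against the scrub
 * ruleset, sanity check the header, and depending on the rule flags
 * either fully reassemble fragments or run them through the
 * non-buffering fragment cache.
 */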
1169 int
1170 pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
1171     struct pf_pdesc *pd)
1172 {
1173 	struct mbuf		*m = *m0;
1174 	struct pf_rule		*r;
1175 	struct pf_fragment	*frag = NULL;
1176 	struct pf_fragment_cmp	key;
1177 	struct ip		*h = mtod(m, struct ip *);
1178 	int			 mff = (ntohs(h->ip_off) & IP_MF);
1179 	int			 hlen = h->ip_hl << 2;
1180 	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1181 	u_int16_t		 max;
1182 	int			 ip_len;
1183 	int			 ip_off;
1184 	int			 tag = -1;
1185 	int			 verdict;
1186 
1187 	PF_RULES_RASSERT();
1188 
1189 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1190 	while (r != NULL) {
1191 		r->evaluations++;
1192 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1193 			r = r->skip[PF_SKIP_IFP].ptr;
1194 		else if (r->direction && r->direction != dir)
1195 			r = r->skip[PF_SKIP_DIR].ptr;
1196 		else if (r->af && r->af != AF_INET)
1197 			r = r->skip[PF_SKIP_AF].ptr;
1198 		else if (r->proto && r->proto != h->ip_p)
1199 			r = r->skip[PF_SKIP_PROTO].ptr;
1200 		else if (PF_MISMATCHAW(&r->src.addr,
1201 		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
1202 		    r->src.neg, kif, M_GETFIB(m)))
1203 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1204 		else if (PF_MISMATCHAW(&r->dst.addr,
1205 		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
1206 		    r->dst.neg, NULL, M_GETFIB(m)))
1207 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1208 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
1209 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
1210 			r = TAILQ_NEXT(r, entries);
1211 		else
1212 			break;
1213 	}
1214 
1215 	if (r == NULL || r->action == PF_NOSCRUB)
1216 		return (PF_PASS);
1217 	else {
1218 		r->packets[dir == PF_OUT]++;
1219 		r->bytes[dir == PF_OUT] += pd->tot_len;
1220 	}
1221 
1222 	/* Check for illegal packets */
1223 	if (hlen < (int)sizeof(struct ip))
1224 		goto drop;
1225 
1226 	if (hlen > ntohs(h->ip_len))
1227 		goto drop;
1228 
1229 	/* Clear IP_DF if the rule uses the no-df option */
1230 	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
1231 		u_int16_t ip_off = h->ip_off;
1232 
1233 		h->ip_off &= htons(~IP_DF);
1234 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1235 	}
1236 
1237 	/* We will need other tests here */
1238 	if (!fragoff && !mff)
1239 		goto no_fragment;
1240 
1241 	/* We're dealing with a fragment now. Don't allow fragments
1242 	 * with IP_DF to enter the cache. If the flag was cleared by
1243 	 * no-df above, fine. Otherwise drop it.
1244 	 */
1245 	if (h->ip_off & htons(IP_DF)) {
1246 		DPFPRINTF(("IP_DF\n"));
1247 		goto bad;
1248 	}
1249 
1250 	ip_len = ntohs(h->ip_len) - hlen;
1251 	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1252 
1253 	/* All fragments are 8 byte aligned */
1254 	if (mff && (ip_len & 0x7)) {
1255 		DPFPRINTF(("mff and %d\n", ip_len));
1256 		goto bad;
1257 	}
1258 
1259 	/* Respect maximum length */
1260 	if (fragoff + ip_len > IP_MAXPACKET) {
1261 		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
1262 		goto bad;
1263 	}
1264 	max = fragoff + ip_len;
1265 
1266 	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
1267 
1268 		/* Fully buffer all of the fragments */
1269 		PF_FRAG_LOCK();
1270 
1271 		pf_ip2key(h, dir, &key);
1272 		frag = pf_find_fragment(&key, &V_pf_frag_tree);
1273 
1274 		/* Check if we saw the last fragment already */
1275 		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1276 		    max > frag->fr_max)
1277 			goto bad;
1278 
1279 		/* Might return a completely reassembled mbuf, or NULL */
1280 		DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
1281 		verdict = pf_reassemble(m0, h, dir, reason);
1282 		PF_FRAG_UNLOCK();
1283 
1284 		if (verdict != PF_PASS)
1285 			return (PF_DROP);
1286 
1287 		m = *m0;
1288 		if (m == NULL)
1289 			return (PF_DROP);
1290 
1291 		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1292 			goto drop;
1293 
1294 		h = mtod(m, struct ip *);
1295 	} else {
1296 		/* non-buffering fragment cache (drops or masks overlaps) */
1297 		int	nomem = 0;
1298 
1299 		if (dir == PF_OUT && pd->pf_mtag &&
1300 		    pd->pf_mtag->flags & PF_TAG_FRAGCACHE) {
1301 			/*
1302 			 * Already passed the fragment cache in the
1303 			 * input direction.  If we continued, it would
1304 			 * appear to be a dup and would be dropped.
1305 			 */
1306 			goto fragment_pass;
1307 		}
1308 
1309 		PF_FRAG_LOCK();
1310 		pf_ip2key(h, dir, &key);
1311 		frag = pf_find_fragment(&key, &V_pf_cache_tree);
1312 
1313 		/* Check if we saw the last fragment already */
1314 		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1315 		    max > frag->fr_max) {
1316 			if (r->rule_flag & PFRULE_FRAGDROP)
1317 				frag->fr_flags |= PFFRAG_DROP;
1318 			goto bad;
1319 		}
1320 
1321 		*m0 = m = pf_fragcache(m0, h, &frag, mff,
1322 		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
1323 		PF_FRAG_UNLOCK();
1324 		if (m == NULL) {
1325 			if (nomem)
1326 				goto no_mem;
1327 			goto drop;
1328 		}
1329 
1330 		if (dir == PF_IN) {
1331 			/* Use mtag from copied and trimmed mbuf chain. */
1332 			pd->pf_mtag = pf_get_mtag(m);
1333 			if (pd->pf_mtag == NULL) {
1334 				m_freem(m);
1335 				*m0 = NULL;
1336 				goto no_mem;
1337 			}
1338 			pd->pf_mtag->flags |= PF_TAG_FRAGCACHE;
1339 		}
1340 
1341 		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1342 			goto drop;
1343 		goto fragment_pass;
1344 	}
1345 
1346  no_fragment:
1347 	/* At this point, only IP_DF is allowed in ip_off */
1348 	if (h->ip_off & ~htons(IP_DF)) {
1349 		u_int16_t ip_off = h->ip_off;
1350 
1351 		h->ip_off &= htons(IP_DF);
1352 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1353 	}
1354 
1355 	/* No return here: fall through to fragment_pass. */
1356 
1357  fragment_pass:
1358 	pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
1359 
1360 	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1361 		pd->flags |= PFDESC_IP_REAS;
1362 	return (PF_PASS);
1363 
1364  no_mem:
1365 	REASON_SET(reason, PFRES_MEMORY);
1366 	if (r != NULL && r->log)
1367 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1368 		    1);
1369 	return (PF_DROP);
1370 
1371  drop:
1372 	REASON_SET(reason, PFRES_NORM);
1373 	if (r != NULL && r->log)
1374 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1375 		    1);
1376 	return (PF_DROP);
1377 
1378  bad:
1379 	DPFPRINTF(("dropping bad fragment\n"));
1380 
1381 	/* Free associated fragments */
1382 	if (frag != NULL) {
1383 		pf_free_fragment(frag);
1384 		PF_FRAG_UNLOCK();
1385 	}
1386 
1387 	REASON_SET(reason, PFRES_FRAG);
1388 	if (r != NULL && r->log)
1389 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1390 		    1);
1391 
1392 	return (PF_DROP);
1393 }
1394 #endif
1395 
1396 #ifdef INET6
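/*
 * IPv6 normalization entry point: walk the extension header chain,
 * validate option and jumbo payload lengths, and hand fragments to
 * pf_reassemble6().
 */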
1397 int
1398 pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
1399     u_short *reason, struct pf_pdesc *pd)
1400 {
1401 	struct mbuf		*m = *m0;
1402 	struct pf_rule		*r;
1403 	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
1404 	int			 extoff;
1405 	int			 off;
1406 	struct ip6_ext		 ext;
1407 	struct ip6_opt		 opt;
1408 	struct ip6_opt_jumbo	 jumbo;
1409 	struct ip6_frag		 frag;
1410 	u_int32_t		 jumbolen = 0, plen;
1411 	int			 optend;
1412 	int			 ooff;
1413 	u_int8_t		 proto;
1414 	int			 terminal;
1415 
1416 	PF_RULES_RASSERT();
1417 
1418 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1419 	while (r != NULL) {
1420 		r->evaluations++;
1421 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1422 			r = r->skip[PF_SKIP_IFP].ptr;
1423 		else if (r->direction && r->direction != dir)
1424 			r = r->skip[PF_SKIP_DIR].ptr;
1425 		else if (r->af && r->af != AF_INET6)
1426 			r = r->skip[PF_SKIP_AF].ptr;
1427 #if 0 /* header chain! */
1428 		else if (r->proto && r->proto != h->ip6_nxt)
1429 			r = r->skip[PF_SKIP_PROTO].ptr;
1430 #endif
1431 		else if (PF_MISMATCHAW(&r->src.addr,
1432 		    (struct pf_addr *)&h->ip6_src, AF_INET6,
1433 		    r->src.neg, kif, M_GETFIB(m)))
1434 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1435 		else if (PF_MISMATCHAW(&r->dst.addr,
1436 		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
1437 		    r->dst.neg, NULL, M_GETFIB(m)))
1438 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1439 		else
1440 			break;
1441 	}
1442 
1443 	if (r == NULL || r->action == PF_NOSCRUB)
1444 		return (PF_PASS);
1445 	else {
1446 		r->packets[dir == PF_OUT]++;
1447 		r->bytes[dir == PF_OUT] += pd->tot_len;
1448 	}
1449 
1450 	/* Check for illegal packets */
1451 	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
1452 		goto drop;
1453 
1454 	extoff = 0;
1455 	off = sizeof(struct ip6_hdr);
1456 	proto = h->ip6_nxt;
1457 	terminal = 0;
1458 	do {
1459 		switch (proto) {
1460 		case IPPROTO_FRAGMENT:
1461 			goto fragment;
1462 			break;
1463 		case IPPROTO_AH:
1464 		case IPPROTO_ROUTING:
1465 		case IPPROTO_DSTOPTS:
1466 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1467 			    NULL, AF_INET6))
1468 				goto shortpkt;
1469 			extoff = off;
1470 			if (proto == IPPROTO_AH)
1471 				off += (ext.ip6e_len + 2) * 4;
1472 			else
1473 				off += (ext.ip6e_len + 1) * 8;
1474 			proto = ext.ip6e_nxt;
1475 			break;
1476 		case IPPROTO_HOPOPTS:
1477 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1478 			    NULL, AF_INET6))
1479 				goto shortpkt;
1480 			extoff = off;
1481 			optend = off + (ext.ip6e_len + 1) * 8;
1482 			ooff = off + sizeof(ext);
1483 			do {
1484 				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
1485 				    sizeof(opt.ip6o_type), NULL, NULL,
1486 				    AF_INET6))
1487 					goto shortpkt;
1488 				if (opt.ip6o_type == IP6OPT_PAD1) {
1489 					ooff++;
1490 					continue;
1491 				}
1492 				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
1493 				    NULL, NULL, AF_INET6))
1494 					goto shortpkt;
1495 				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
1496 					goto drop;
1497 				switch (opt.ip6o_type) {
1498 				case IP6OPT_JUMBO:
1499 					if (h->ip6_plen != 0)
1500 						goto drop;
1501 					if (!pf_pull_hdr(m, ooff, &jumbo,
1502 					    sizeof(jumbo), NULL, NULL,
1503 					    AF_INET6))
1504 						goto shortpkt;
1505 					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
1506 					    sizeof(jumbolen));
1507 					jumbolen = ntohl(jumbolen);
1508 					if (jumbolen <= IPV6_MAXPACKET)
1509 						goto drop;
1510 					if (sizeof(struct ip6_hdr) + jumbolen !=
1511 					    m->m_pkthdr.len)
1512 						goto drop;
1513 					break;
1514 				default:
1515 					break;
1516 				}
1517 				ooff += sizeof(opt) + opt.ip6o_len;
1518 			} while (ooff < optend);
1519 
1520 			off = optend;
1521 			proto = ext.ip6e_nxt;
1522 			break;
1523 		default:
1524 			terminal = 1;
1525 			break;
1526 		}
1527 	} while (!terminal);
1528 
1529 	/* jumbo payload option must be present, or plen > 0 */
1530 	if (ntohs(h->ip6_plen) == 0)
1531 		plen = jumbolen;
1532 	else
1533 		plen = ntohs(h->ip6_plen);
1534 	if (plen == 0)
1535 		goto drop;
1536 	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
1537 		goto shortpkt;
1538 
1539 	pf_scrub_ip6(&m, r->min_ttl);
1540 
1541 	return (PF_PASS);
1542 
1543  fragment:
1544 	/* Jumbo payload packets cannot be fragmented. */
1545 	plen = ntohs(h->ip6_plen);
1546 	if (plen == 0 || jumbolen)
1547 		goto drop;
1548 	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
1549 		goto shortpkt;
1550 
1551 	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
1552 		goto shortpkt;
1553 
1554 	/* Offset now points to data portion. */
1555 	off += sizeof(frag);
1556 
1557 	/* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */
1558 	if (pf_reassemble6(m0, h, &frag, off, extoff, dir, reason) != PF_PASS)
1559 		return (PF_DROP);
1560 	m = *m0;
1561 	if (m == NULL)
1562 		return (PF_DROP);
1563 
1564 	pd->flags |= PFDESC_IP_REAS;
1565 	return (PF_PASS);
1566 
1567  shortpkt:
1568 	REASON_SET(reason, PFRES_SHORT);
1569 	if (r != NULL && r->log)
1570 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
1571 		    1);
1572 	return (PF_DROP);
1573 
1574  drop:
1575 	REASON_SET(reason, PFRES_NORM);
1576 	if (r != NULL && r->log)
1577 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
1578 		    1);
1579 	return (PF_DROP);
1580 }
1581 #endif /* INET6 */
1582 
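/*
 * TCP normalization: drop segments with illegal flag combinations or
 * header lengths, clear reserved bits and stray urgent pointers, and
 * enforce a maximum MSS when the rule requests it.
 */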
1583 int
1584 pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
1585     int off, void *h, struct pf_pdesc *pd)
1586 {
1587 	struct pf_rule	*r, *rm = NULL;
1588 	struct tcphdr	*th = pd->hdr.tcp;
1589 	int		 rewrite = 0;
1590 	u_short		 reason;
1591 	u_int8_t	 flags;
1592 	sa_family_t	 af = pd->af;
1593 
1594 	PF_RULES_RASSERT();
1595 
1596 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1597 	while (r != NULL) {
1598 		r->evaluations++;
1599 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1600 			r = r->skip[PF_SKIP_IFP].ptr;
1601 		else if (r->direction && r->direction != dir)
1602 			r = r->skip[PF_SKIP_DIR].ptr;
1603 		else if (r->af && r->af != af)
1604 			r = r->skip[PF_SKIP_AF].ptr;
1605 		else if (r->proto && r->proto != pd->proto)
1606 			r = r->skip[PF_SKIP_PROTO].ptr;
1607 		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
1608 		    r->src.neg, kif, M_GETFIB(m)))
1609 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1610 		else if (r->src.port_op && !pf_match_port(r->src.port_op,
1611 			    r->src.port[0], r->src.port[1], th->th_sport))
1612 			r = r->skip[PF_SKIP_SRC_PORT].ptr;
1613 		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
1614 		    r->dst.neg, NULL, M_GETFIB(m)))
1615 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1616 		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
1617 			    r->dst.port[0], r->dst.port[1], th->th_dport))
1618 			r = r->skip[PF_SKIP_DST_PORT].ptr;
1619 		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
1620 			    pf_osfp_fingerprint(pd, m, off, th),
1621 			    r->os_fingerprint))
1622 			r = TAILQ_NEXT(r, entries);
1623 		else {
1624 			rm = r;
1625 			break;
1626 		}
1627 	}
1628 
1629 	if (rm == NULL || rm->action == PF_NOSCRUB)
1630 		return (PF_PASS);
1631 	else {
1632 		r->packets[dir == PF_OUT]++;
1633 		r->bytes[dir == PF_OUT] += pd->tot_len;
1634 	}
1635 
1636 	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
1637 		pd->flags |= PFDESC_TCP_NORM;
1638 
1639 	flags = th->th_flags;
1640 	if (flags & TH_SYN) {
1641 		/* Illegal packet */
1642 		if (flags & TH_RST)
1643 			goto tcp_drop;
1644 
1645 		if (flags & TH_FIN)
1646 			goto tcp_drop;
1647 	} else {
1648 		/* Illegal packet */
1649 		if (!(flags & (TH_ACK|TH_RST)))
1650 			goto tcp_drop;
1651 	}
1652 
1653 	if (!(flags & TH_ACK)) {
1654 		/* These flags are only valid if ACK is set */
1655 		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
1656 			goto tcp_drop;
1657 	}
1658 
1659 	/* Check for illegal header length */
1660 	if (th->th_off < (sizeof(struct tcphdr) >> 2))
1661 		goto tcp_drop;
1662 
1663 	/* If flags changed, or reserved data set, then adjust */
1664 	if (flags != th->th_flags || th->th_x2 != 0) {
1665 		u_int16_t	ov, nv;
1666 
1667 		ov = *(u_int16_t *)(&th->th_ack + 1);
1668 		th->th_flags = flags;
1669 		th->th_x2 = 0;
1670 		nv = *(u_int16_t *)(&th->th_ack + 1);
1671 
1672 		th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
1673 		rewrite = 1;
1674 	}
1675 
1676 	/* Remove urgent pointer, if TH_URG is not set */
1677 	if (!(flags & TH_URG) && th->th_urp) {
1678 		th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
1679 		th->th_urp = 0;
1680 		rewrite = 1;
1681 	}
1682 
1683 	/* Process options */
1684 	if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
1685 		rewrite = 1;
1686 
1687 	/* copy back packet headers if we sanitized */
1688 	if (rewrite)
1689 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
1690 
1691 	return (PF_PASS);
1692 
1693  tcp_drop:
1694 	REASON_SET(&reason, PFRES_NORM);
1695 	if (rm != NULL && r->log)
1696 		PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd,
1697 		    1);
1698 	return (PF_DROP);
1699 }
1700 
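/*
 * Allocate scrub state for a peer and, on the initial SYN, record its
 * TTL and TCP timestamp option values so later segments can be
 * modulated and PAWS-checked.
 */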
1701 int
1702 pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
1703     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
1704 {
1705 	u_int32_t tsval, tsecr;
1706 	u_int8_t hdr[60];
1707 	u_int8_t *opt;
1708 
1709 	KASSERT((src->scrub == NULL),
1710 	    ("pf_normalize_tcp_init: src->scrub != NULL"));
1711 
1712 	src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT);
1713 	if (src->scrub == NULL)
1714 		return (1);
1715 
1716 	switch (pd->af) {
1717 #ifdef INET
1718 	case AF_INET: {
1719 		struct ip *h = mtod(m, struct ip *);
1720 		src->scrub->pfss_ttl = h->ip_ttl;
1721 		break;
1722 	}
1723 #endif /* INET */
1724 #ifdef INET6
1725 	case AF_INET6: {
1726 		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1727 		src->scrub->pfss_ttl = h->ip6_hlim;
1728 		break;
1729 	}
1730 #endif /* INET6 */
1731 	}
1732 
1733 
1734 	/*
1735 	 * All normalizations below are only begun if we see the start of
1736 	 * the connections.  They must all set an enabled bit in pfss_flags
1737 	 */
1738 	if ((th->th_flags & TH_SYN) == 0)
1739 		return (0);
1740 
1741 
1742 	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
1743 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1744 		/* Diddle with TCP options */
1745 		int hlen;
1746 		opt = hdr + sizeof(struct tcphdr);
1747 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1748 		while (hlen >= TCPOLEN_TIMESTAMP) {
1749 			switch (*opt) {
1750 			case TCPOPT_EOL:	/* FALLTHROUGH */
1751 			case TCPOPT_NOP:
1752 				opt++;
1753 				hlen--;
1754 				break;
1755 			case TCPOPT_TIMESTAMP:
1756 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1757 					src->scrub->pfss_flags |=
1758 					    PFSS_TIMESTAMP;
1759 					src->scrub->pfss_ts_mod =
1760 					    htonl(arc4random());
1761 
1762 					/* note PFSS_PAWS not set yet */
1763 					memcpy(&tsval, &opt[2],
1764 					    sizeof(u_int32_t));
1765 					memcpy(&tsecr, &opt[6],
1766 					    sizeof(u_int32_t));
1767 					src->scrub->pfss_tsval0 = ntohl(tsval);
1768 					src->scrub->pfss_tsval = ntohl(tsval);
1769 					src->scrub->pfss_tsecr = ntohl(tsecr);
1770 					getmicrouptime(&src->scrub->pfss_last);
1771 				}
1772 				/* FALLTHROUGH */
1773 			default:
1774 				hlen -= MAX(opt[1], 2);
1775 				opt += MAX(opt[1], 2);
1776 				break;
1777 			}
1778 		}
1779 	}
1780 
1781 	return (0);
1782 }
1783 
1784 void
1785 pf_normalize_tcp_cleanup(struct pf_state *state)
1786 {
1787 	if (state->src.scrub)
1788 		uma_zfree(V_pf_state_scrub_z, state->src.scrub);
1789 	if (state->dst.scrub)
1790 		uma_zfree(V_pf_state_scrub_z, state->dst.scrub);
1791 
1792 	/* Someday... flush the TCP segment reassembly descriptors. */
1793 }
1794 
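/*
 * Per-segment stateful normalization: enforce the minimum TTL seen on
 * the connection, modulate TCP timestamps and apply PAWS-style checks
 * against stale or duplicated timestamps.
 */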
1795 int
1796 pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
1797     u_short *reason, struct tcphdr *th, struct pf_state *state,
1798     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
1799 {
1800 	struct timeval uptime;
1801 	u_int32_t tsval, tsecr;
1802 	u_int tsval_from_last;
1803 	u_int8_t hdr[60];
1804 	u_int8_t *opt;
1805 	int copyback = 0;
1806 	int got_ts = 0;
1807 
1808 	KASSERT((src->scrub || dst->scrub),
1809 	    ("%s: neither src->scrub nor dst->scrub", __func__));
1810 
1811 	/*
1812 	 * Enforce the minimum TTL seen for this connection.  Negate a common
1813 	 * technique to evade an intrusion detection system and confuse
1814 	 * firewall state code.
1815 	 */
1816 	switch (pd->af) {
1817 #ifdef INET
1818 	case AF_INET: {
1819 		if (src->scrub) {
1820 			struct ip *h = mtod(m, struct ip *);
1821 			if (h->ip_ttl > src->scrub->pfss_ttl)
1822 				src->scrub->pfss_ttl = h->ip_ttl;
1823 			h->ip_ttl = src->scrub->pfss_ttl;
1824 		}
1825 		break;
1826 	}
1827 #endif /* INET */
1828 #ifdef INET6
1829 	case AF_INET6: {
1830 		if (src->scrub) {
1831 			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1832 			if (h->ip6_hlim > src->scrub->pfss_ttl)
1833 				src->scrub->pfss_ttl = h->ip6_hlim;
1834 			h->ip6_hlim = src->scrub->pfss_ttl;
1835 		}
1836 		break;
1837 	}
1838 #endif /* INET6 */
1839 	}
1840 
1841 	if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
1842 	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
1843 	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
1844 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1845 		/* Diddle with TCP options */
1846 		int hlen;
1847 		opt = hdr + sizeof(struct tcphdr);
1848 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1849 		while (hlen >= TCPOLEN_TIMESTAMP) {
1850 			switch (*opt) {
1851 			case TCPOPT_EOL:	/* FALLTHROUGH */
1852 			case TCPOPT_NOP:
1853 				opt++;
1854 				hlen--;
1855 				break;
1856 			case TCPOPT_TIMESTAMP:
1857 				/* Modulate the timestamps.  Unmodulated
1858 				 * timestamps can be used for NAT detection,
1859 				 * OS uptime determination or reboot detection.
1860 				 */
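				/*
				 * Concretely: this peer's outgoing TSval is
				 * shifted by the per-state random offset
				 * pfss_ts_mod on the way out, while the TSecr,
				 * which echoes a value we already shifted when
				 * the other peer sent it, has that peer's
				 * offset subtracted again so each host still
				 * sees its own unmodified clock echoed back.
				 */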
1861 
1862 				if (got_ts) {
1863 					/* Huh?  Multiple timestamps!? */
1864 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
1865 						DPFPRINTF(("multiple TS??"));
1866 						pf_print_state(state);
1867 						printf("\n");
1868 					}
1869 					REASON_SET(reason, PFRES_TS);
1870 					return (PF_DROP);
1871 				}
1872 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1873 					memcpy(&tsval, &opt[2],
1874 					    sizeof(u_int32_t));
1875 					if (tsval && src->scrub &&
1876 					    (src->scrub->pfss_flags &
1877 					    PFSS_TIMESTAMP)) {
1878 						tsval = ntohl(tsval);
1879 						pf_change_a(&opt[2],
1880 						    &th->th_sum,
1881 						    htonl(tsval +
1882 						    src->scrub->pfss_ts_mod),
1883 						    0);
1884 						copyback = 1;
1885 					}
1886 
1887 					/* Modulate TS reply iff valid (!0) */
1888 					memcpy(&tsecr, &opt[6],
1889 					    sizeof(u_int32_t));
1890 					if (tsecr && dst->scrub &&
1891 					    (dst->scrub->pfss_flags &
1892 					    PFSS_TIMESTAMP)) {
1893 						tsecr = ntohl(tsecr)
1894 						    - dst->scrub->pfss_ts_mod;
1895 						pf_change_a(&opt[6],
1896 						    &th->th_sum, htonl(tsecr),
1897 						    0);
1898 						copyback = 1;
1899 					}
1900 					got_ts = 1;
1901 				}
1902 				/* FALLTHROUGH */
1903 			default:
1904 				hlen -= MAX(opt[1], 2);
1905 				opt += MAX(opt[1], 2);
1906 				break;
1907 			}
1908 		}
1909 		if (copyback) {
1910 			/* Copy back the options; the caller copies back the header */
1911 			*writeback = 1;
1912 			m_copyback(m, off + sizeof(struct tcphdr),
1913 			    (th->th_off << 2) - sizeof(struct tcphdr), hdr +
1914 			    sizeof(struct tcphdr));
1915 		}
1916 	}
1917 
1918 
1919 	/*
1920 	 * Must invalidate PAWS checks on connections idle for too long.
1921 	 * The fastest allowed timestamp clock ticks once every 1ms.  That
1922 	 * turns out to be about 24 days before it wraps.  XXX Right now our
1923 	 * lowerbound TS echo check only works for the first 12 days of a
1924 	 * connection, when the TS has exhausted half its 32bit space.
1925 	 */
1926 #define TS_MAX_IDLE	(24*24*60*60)
1927 #define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
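/*
 * With the fastest allowed 1ms clock a timestamp advances about 86.4 million
 * ticks per day, so half of the 32bit space (2^31 ticks) is used up after
 * roughly 24.8 days and a quarter (2^30) after roughly 12.4 days; the two
 * limits above round those figures down to whole days of wall clock time.
 */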
1928 
1929 	getmicrouptime(&uptime);
1930 	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
1931 	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
1932 	    time_uptime - state->creation > TS_MAX_CONN))  {
1933 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1934 			DPFPRINTF(("src idled out of PAWS\n"));
1935 			pf_print_state(state);
1936 			printf("\n");
1937 		}
1938 		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
1939 		    | PFSS_PAWS_IDLED;
1940 	}
1941 	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
1942 	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
1943 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1944 			DPFPRINTF(("dst idled out of PAWS\n"));
1945 			pf_print_state(state);
1946 			printf("\n");
1947 		}
1948 		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
1949 		    | PFSS_PAWS_IDLED;
1950 	}
1951 
1952 	if (got_ts && src->scrub && dst->scrub &&
1953 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
1954 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
1955 		/* Validate that the timestamps are "in-window".
1956 		 * RFC1323 describes TCP Timestamp options that allow
1957 		 * measurement of RTT (round trip time) and PAWS
1958 		 * (protection against wrapped sequence numbers).  PAWS
1959 		 * gives us a set of rules for rejecting packets on
1960 		 * long fat pipes (packets that were somehow delayed
1961 		 * in transit longer than the time it took to send the
1962 		 * full TCP sequence space of 4GB).  We can use these
1963 		 * rules and infer a few others that will let us treat
1964 		 * the 32bit timestamp and the 32bit echoed timestamp
1965 		 * as sequence numbers to prevent a blind attacker from
1966 		 * inserting packets into a connection.
1967 		 *
1968 		 * RFC1323 tells us:
1969 		 *  - The timestamp on this packet must be greater than
1970 		 *    or equal to the last value echoed by the other
1971 		 *    endpoint.  The RFC says those will be discarded
1972 		 *    since it is a dup that has already been acked.
1973 		 *    This gives us a lowerbound on the timestamp.
1974 		 *        timestamp >= other last echoed timestamp
1975 		 *  - The timestamp will be less than or equal to
1976 		 *    the last timestamp plus the time between the
1977 		 *    last packet and now.  The RFC defines the max
1978 		 *    clock rate as 1ms.  We will allow clocks to be
1979 		 *    up to 10% fast and will allow a total difference
1980 		 *    or 30 seconds due to a route change.  And this
1981 		 *    of 30 seconds due to a route change.  And this
1982 		 *        timestamp <= last timestamp + max ticks
1983 		 *    We have to be careful here.  Windows will send an
1984 		 *    initial timestamp of zero and then initialize it
1985 		 *    to a random value after the 3whs; presumably to
1986 		 *    avoid a DoS by having to call an expensive RNG
1987 		 *    during a SYN flood.  Proof MS has at least one
1988 		 *    good security geek.
1989 		 *
1990 		 *  - The TCP timestamp option must also echo the other
1991 		 *    endpoint's timestamp.  The timestamp echoed is the
1992 		 *    one carried on the earliest unacknowledged segment
1993 		 *    on the left edge of the sequence window.  The RFC
1994 		 *    states that the host will reject any echoed
1995 		 *    timestamps that were larger than any ever sent.
1996 		 *    This gives us an upperbound on the TS echo.
1997 		 *        tsecr <= largest_tsval
1998 		 *  - The lowerbound on the TS echo is a little more
1999 		 *    tricky to determine.  The other endpoint's echoed
2000 		 *    values will not decrease.  But there may be
2001 		 *    network conditions that re-order packets and
2002 		 *    cause our view of them to decrease.  For now the
2003 		 *    only lowerbound we can safely determine is that
2004 		 *    the TS echo will never be less than the original
2005 		 *    TS.  XXX There is probably a better lowerbound.
2006 		 *    Remove TS_MAX_CONN with better lowerbound check.
2007 		 *        tescr >= other original TS
2008 		 *        tsecr >= other original TS
2009 		 * It is also important to note that the fastest
2010 		 * timestamp clock of 1ms will wrap its 32bit space in
2011 		 * 24 days.  So we just disable TS checking after 24
2012 		 * days of idle time.  We actually must use a 12d
2013 		 * connection limit until we can come up with a better
2014 		 * lowerbound to the TS echo check.
2015 		 */
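		/*
		 * In terms of the scrub state kept here, the rules above boil
		 * down to requiring, roughly,
		 *     dst's pfss_tsecr  <= tsval <= src's pfss_tsval + max ticks
		 *     dst's pfss_tsval0 <= tsecr <= dst's pfss_tsval
		 * which is what the big conditional below checks once both
		 * sides are ESTABLISHED.
		 */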
2016 		struct timeval delta_ts;
2017 		int ts_fudge;
2018 
2019 
2020 		/*
2021 		 * PFTM_TS_DIFF is how many seconds of leeway to allow in a
2022 		 * host's timestamp.  The timestamp can jump further ahead than
2023 		 * the idle time allows if the previous packet was delayed in
2024 		 * transit for much longer than this packet.
2025 		 */
2026 		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
2027 			ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
2028 
2029 		/* Calculate max ticks since the last timestamp */
2030 #define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
2031 #define TS_MICROSECS	1000000		/* microseconds per second */
2032 		delta_ts = uptime;
2033 		timevalsub(&delta_ts, &src->scrub->pfss_last);
2034 		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
2035 		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
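		/*
		 * For example, if 2.5 seconds have elapsed since pfss_last
		 * and ts_fudge is 30 seconds, this allows
		 * (2 + 30) * 1100 + 500000 / (1000000 / 1100) = 35750 ticks
		 * of timestamp advance before the upperbound check trips.
		 */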
2036 
2037 		if ((src->state >= TCPS_ESTABLISHED &&
2038 		    dst->state >= TCPS_ESTABLISHED) &&
2039 		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
2040 		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
2041 		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
2042 		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
2043 			/* Bad RFC1323 implementation or an insertion attack.
2044 			 *
2045 			 * - Solaris 2.6 and 2.7 are known to send another ACK
2046 			 *   after the FIN,FIN|ACK,ACK closing that carries
2047 			 *   an old timestamp.
2048 			 */
2049 
2050 			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
2051 			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
2052 			    SEQ_GT(tsval, src->scrub->pfss_tsval +
2053 			    tsval_from_last) ? '1' : ' ',
2054 			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
2055 			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
2056 			DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
2057 			    "idle: %jus %lums\n",
2058 			    tsval, tsecr, tsval_from_last,
2059 			    (uintmax_t)delta_ts.tv_sec,
2060 			    delta_ts.tv_usec / 1000));
2061 			DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
2062 			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
2063 			DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
2064 			    "\n", dst->scrub->pfss_tsval,
2065 			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
2066 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
2067 				pf_print_state(state);
2068 				pf_print_flags(th->th_flags);
2069 				printf("\n");
2070 			}
2071 			REASON_SET(reason, PFRES_TS);
2072 			return (PF_DROP);
2073 		}
2074 
2075 		/* XXX I'd really like to require tsecr but it's optional */
2076 
2077 	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
2078 	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
2079 	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
2080 	    src->scrub && dst->scrub &&
2081 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
2082 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
2083 		/* Didn't send a timestamp.  Timestamps aren't really useful
2084 		 * when:
2085 		 *  - connection opening or closing (often not even sent).
2086 		 *    but we must not let an attacker put a FIN on a
2087 		 *    data packet to sneak it through our ESTABLISHED check.
2088 		 *  - on a TCP reset.  RFC suggests not even looking at TS.
2089 		 *  - on an empty ACK.  The TS will not be echoed so it will
2090 		 *    probably not help keep the RTT calculation in sync and
2091 		 *    there isn't as much danger if the sequence numbers
2092 		 *    get wrapped.  So some stacks don't include TS on empty
2093 		 *    ACKs :-(
2094 		 *
2095 		 * To minimize the disruption to mostly RFC1323 conformant
2096 		 * stacks, we will only require timestamps on data packets.
2097 		 *
2098 		 * And what do ya know, we cannot require timestamps on data
2099 		 * packets.  There appear to be devices that do legitimate
2100 		 * TCP connection hijacking.  There are HTTP devices that allow
2101 		 * a 3whs (with timestamps) and then buffer the HTTP request.
2102 		 * If the intermediate device has the HTTP response cached, it
2103 		 * will spoof the response but not bother timestamping its
2104 		 * packets.  So we can look for the presence of a timestamp in
2105 		 * the first data packet and if there, require it in all future
2106 		 * packets.
2107 		 */
2108 
2109 		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
2110 			/*
2111 			 * Hey!  Someone tried to sneak a packet in.  Or the
2112 			 * stack changed its RFC1323 behavior?!?!
2113 			 */
2114 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
2115 				DPFPRINTF(("Did not receive expected RFC1323 "
2116 				    "timestamp\n"));
2117 				pf_print_state(state);
2118 				pf_print_flags(th->th_flags);
2119 				printf("\n");
2120 			}
2121 			REASON_SET(reason, PFRES_TS);
2122 			return (PF_DROP);
2123 		}
2124 	}
2125 
2126 
2127 	/*
2128 	 * We will note whether a host sends its data packets with or without
2129 	 * timestamps, and require all data packets to contain a timestamp
2130 	 * if the first one does.  PAWS implicitly requires that all data packets be
2131 	 * timestamped.  But I think there are middle-man devices that hijack
2132 	 * TCP streams immediately after the 3whs and don't timestamp their
2133 	 * packets (seen in a WWW accelerator or cache).
2134 	 */
2135 	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
2136 	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
2137 		if (got_ts)
2138 			src->scrub->pfss_flags |= PFSS_DATA_TS;
2139 		else {
2140 			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
2141 			if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
2142 			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
2143 				/* Don't warn if other host rejected RFC1323 */
2144 				DPFPRINTF(("Broken RFC1323 stack did not "
2145 				    "timestamp data packet. Disabled PAWS "
2146 				    "security.\n"));
2147 				pf_print_state(state);
2148 				pf_print_flags(th->th_flags);
2149 				printf("\n");
2150 			}
2151 		}
2152 	}
2153 
2154 
2155 	/*
2156 	 * Update PAWS values
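	 * Record this peer's newest timestamp and echo, and the lowest
	 * timestamp it ever sent (pfss_tsval0).  PFSS_PAWS is only turned
	 * on once a nonzero echo has been seen from this peer, i.e. once
	 * both directions have demonstrated working timestamps.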
2157 	 */
2158 	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
2159 	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
2160 		getmicrouptime(&src->scrub->pfss_last);
2161 		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
2162 		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2163 			src->scrub->pfss_tsval = tsval;
2164 
2165 		if (tsecr) {
2166 			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
2167 			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2168 				src->scrub->pfss_tsecr = tsecr;
2169 
2170 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
2171 			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
2172 			    src->scrub->pfss_tsval0 == 0)) {
2173 				/* tsval0 MUST be the lowest timestamp */
2174 				src->scrub->pfss_tsval0 = tsval;
2175 			}
2176 
2177 			/* Only fully initialized after a TS gets echoed */
2178 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
2179 				src->scrub->pfss_flags |= PFSS_PAWS;
2180 		}
2181 	}
2182 
2183 	/* I have a dream....  TCP segment reassembly.... */
2184 	return (0);
2185 }
2186 
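/*
 * Walk the TCP options and clamp any MSS option that exceeds the rule's
 * max-mss, fixing up the TCP checksum incrementally.  Returns nonzero when
 * the options were rewritten and copied back into the mbuf.
 */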
2187 static int
2188 pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
2189     int off, sa_family_t af)
2190 {
2191 	u_int16_t	*mss;
2192 	int		 thoff;
2193 	int		 opt, cnt, optlen = 0;
2194 	int		 rewrite = 0;
2195 	u_char		 opts[TCP_MAXOLEN];
2196 	u_char		*optp = opts;
2197 
2198 	thoff = th->th_off << 2;
2199 	cnt = thoff - sizeof(struct tcphdr);
2200 
2201 	if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
2202 	    NULL, NULL, af))
2203 		return (rewrite);
2204 
2205 	for (; cnt > 0; cnt -= optlen, optp += optlen) {
2206 		opt = optp[0];
2207 		if (opt == TCPOPT_EOL)
2208 			break;
2209 		if (opt == TCPOPT_NOP)
2210 			optlen = 1;
2211 		else {
2212 			if (cnt < 2)
2213 				break;
2214 			optlen = optp[1];
2215 			if (optlen < 2 || optlen > cnt)
2216 				break;
2217 		}
2218 		switch (opt) {
2219 		case TCPOPT_MAXSEG:
2220 			mss = (u_int16_t *)(optp + 2);
2221 			if ((ntohs(*mss)) > r->max_mss) {
2222 				th->th_sum = pf_cksum_fixup(th->th_sum,
2223 				    *mss, htons(r->max_mss), 0);
2224 				*mss = htons(r->max_mss);
2225 				rewrite = 1;
2226 			}
2227 			break;
2228 		default:
2229 			break;
2230 		}
2231 	}
2232 
2233 	if (rewrite)
2234 		m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
2235 
2236 	return (rewrite);
2237 }
2238 
2239 #ifdef INET
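/*
 * Apply the IPv4 scrub actions: clear the DF bit (no-df), raise the TTL to
 * the configured minimum (min-ttl), overwrite the TOS byte (set-tos) and
 * assign a fresh IP ID to non-fragments (random-id), adjusting the header
 * checksum incrementally for each change.
 */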
2240 static void
2241 pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
2242 {
2243 	struct mbuf		*m = *m0;
2244 	struct ip		*h = mtod(m, struct ip *);
2245 
2246 	/* Clear IP_DF if no-df was requested */
2247 	if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
2248 		u_int16_t ip_off = h->ip_off;
2249 
2250 		h->ip_off &= htons(~IP_DF);
2251 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
2252 	}
2253 
2254 	/* Enforce a minimum ttl, may cause endless packet loops */
2255 	if (min_ttl && h->ip_ttl < min_ttl) {
2256 		u_int16_t ip_ttl = h->ip_ttl;
2257 
2258 		h->ip_ttl = min_ttl;
2259 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
2260 	}
2261 
2262 	/* Enforce tos */
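	/*
	 * The incremental checksum fixup operates on 16-bit words, so take
	 * the first word of the header (ip_v/ip_hl together with ip_tos)
	 * before and after the change.
	 */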
2263 	if (flags & PFRULE_SET_TOS) {
2264 		u_int16_t	ov, nv;
2265 
2266 		ov = *(u_int16_t *)h;
2267 		h->ip_tos = tos;
2268 		nv = *(u_int16_t *)h;
2269 
2270 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
2271 	}
2272 
2273 	/* random-id, but not for fragments */
2274 	if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
2275 		uint16_t ip_id = h->ip_id;
2276 
2277 		ip_fillid(h);
2278 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
2279 	}
2280 }
2281 #endif /* INET */
2282 
2283 #ifdef INET6
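/*
 * IPv6 scrubbing only enforces a minimum hop limit; there is no header
 * checksum to repair and no DF bit or IP ID to randomize.
 */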
2284 static void
2285 pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
2286 {
2287 	struct mbuf		*m = *m0;
2288 	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
2289 
2290 	/* Enforce a minimum ttl, may cause endless packet loops */
2291 	if (min_ttl && h->ip6_hlim < min_ttl)
2292 		h->ip6_hlim = min_ttl;
2293 }
2294 #endif
2295