xref: /freebsd/sys/netpfil/pf/pf_norm.c (revision ddd5b8e9b4d8957fce018c520657cdfa4ecffad3)
1 /*-
2  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  *	$OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_pf.h"
34 
35 #include <sys/param.h>
36 #include <sys/lock.h>
37 #include <sys/mbuf.h>
38 #include <sys/mutex.h>
39 #include <sys/refcount.h>
40 #include <sys/rwlock.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/vnet.h>
45 #include <net/pfvar.h>
46 #include <net/pf_mtag.h>
47 #include <net/if_pflog.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/ip.h>
51 #include <netinet/ip_var.h>
52 #include <netinet/tcp.h>
53 #include <netinet/tcp_fsm.h>
54 #include <netinet/tcp_seq.h>
55 
56 #ifdef INET6
57 #include <netinet/ip6.h>
58 #endif /* INET6 */
59 
60 struct pf_frent {
61 	LIST_ENTRY(pf_frent) fr_next;
62 	union {
63 		struct {
64 			struct ip *_fr_ip;
65 			struct mbuf *_fr_m;
66 		} _frag;
67 		struct {
68 			uint16_t _fr_off;
69 			uint16_t _fr_end;
70 		} _cache;
71 	} _u;
72 };
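/*
 * Accessor shorthands for the union above: _frag is used when fragments
 * are fully buffered for reassembly, _cache by the non-buffering
 * fragment cache, which only records the byte range [fr_off, fr_end)
 * already seen.
 */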
73 #define	fr_ip	_u._frag._fr_ip
74 #define	fr_m	_u._frag._fr_m
75 #define	fr_off	_u._cache._fr_off
76 #define	fr_end	_u._cache._fr_end
77 
78 struct pf_fragment {
79 	RB_ENTRY(pf_fragment) fr_entry;
80 	TAILQ_ENTRY(pf_fragment) frag_next;
81 	struct in_addr	fr_src;
82 	struct in_addr	fr_dst;
83 	u_int8_t	fr_p;		/* protocol of this fragment */
84 	u_int8_t	fr_flags;	/* status flags */
85 #define PFFRAG_SEENLAST	0x0001		/* Seen the last fragment for this packet */
86 #define PFFRAG_NOBUFFER	0x0002		/* Non-buffering fragment cache */
87 #define PFFRAG_DROP	0x0004		/* Drop all fragments */
88 #define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))
89 	u_int16_t	fr_id;		/* fragment id for reassembly */
90 	u_int16_t	fr_max;		/* fragment data max */
91 	u_int32_t	fr_timeout;
92 	LIST_HEAD(, pf_frent) fr_queue;
93 };
94 
95 static struct mtx pf_frag_mtx;
96 #define PF_FRAG_LOCK()		mtx_lock(&pf_frag_mtx)
97 #define PF_FRAG_UNLOCK()	mtx_unlock(&pf_frag_mtx)
98 #define PF_FRAG_ASSERT()	mtx_assert(&pf_frag_mtx, MA_OWNED)
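/*
 * The fragment mutex is global and shared by all VNETs; the queues,
 * trees and UMA zones below are per-VNET.
 */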
99 
100 VNET_DEFINE(uma_zone_t, pf_state_scrub_z);	/* XXX: shared with pfsync */
101 
102 static VNET_DEFINE(uma_zone_t, pf_frent_z);
103 #define	V_pf_frent_z	VNET(pf_frent_z)
104 static VNET_DEFINE(uma_zone_t, pf_frag_z);
105 #define	V_pf_frag_z	VNET(pf_frag_z)
106 
107 TAILQ_HEAD(pf_fragqueue, pf_fragment);
108 TAILQ_HEAD(pf_cachequeue, pf_fragment);
109 static VNET_DEFINE(struct pf_fragqueue,	pf_fragqueue);
110 #define	V_pf_fragqueue			VNET(pf_fragqueue)
111 static VNET_DEFINE(struct pf_cachequeue,	pf_cachequeue);
112 #define	V_pf_cachequeue			VNET(pf_cachequeue)
113 RB_HEAD(pf_frag_tree, pf_fragment);
114 static VNET_DEFINE(struct pf_frag_tree,	pf_frag_tree);
115 #define	V_pf_frag_tree			VNET(pf_frag_tree)
116 static VNET_DEFINE(struct pf_frag_tree,	pf_cache_tree);
117 #define	V_pf_cache_tree			VNET(pf_cache_tree)
118 static int		 pf_frag_compare(struct pf_fragment *,
119 			    struct pf_fragment *);
120 static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
121 static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
122 
123 /* Private prototypes */
124 static void		 pf_free_fragment(struct pf_fragment *);
125 static void		 pf_remove_fragment(struct pf_fragment *);
126 static int		 pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
127 			    struct tcphdr *, int, sa_family_t);
128 #ifdef INET
129 static void		 pf_ip2key(struct pf_fragment *, struct ip *);
130 static void		 pf_scrub_ip(struct mbuf **, u_int32_t, u_int8_t,
131 			    u_int8_t);
132 static void		 pf_flush_fragments(void);
133 static struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *);
134 static struct mbuf	*pf_reassemble(struct mbuf **, struct pf_fragment **,
135 			    struct pf_frent *, int);
136 static struct mbuf	*pf_fragcache(struct mbuf **, struct ip*,
137 			    struct pf_fragment **, int, int, int *);
138 #endif /* INET */
139 #ifdef INET6
140 static void		 pf_scrub_ip6(struct mbuf **, u_int8_t);
141 #endif
142 #define	DPFPRINTF(x) do {				\
143 	if (V_pf_status.debug >= PF_DEBUG_MISC) {	\
144 		printf("%s: ", __func__);		\
145 		printf x ;				\
146 	}						\
147 } while (0)
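/*
 * Note the double parentheses in DPFPRINTF((...)): they let a complete
 * printf() argument list be passed as a single macro argument.
 */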
148 
149 void
150 pf_normalize_init(void)
151 {
152 
153 	V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
154 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
155 	V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
156 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
157 	V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
158 	    sizeof(struct pf_state_scrub),  NULL, NULL, NULL, NULL,
159 	    UMA_ALIGN_PTR, 0);
160 
161 	V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z;
162 	V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
163 	uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT);
164 	uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached");
165 
166 	mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF);
167 
168 	TAILQ_INIT(&V_pf_fragqueue);
169 	TAILQ_INIT(&V_pf_cachequeue);
170 }
171 
172 void
173 pf_normalize_cleanup(void)
174 {
175 
176 	uma_zdestroy(V_pf_state_scrub_z);
177 	uma_zdestroy(V_pf_frent_z);
178 	uma_zdestroy(V_pf_frag_z);
179 
180 	mtx_destroy(&pf_frag_mtx);
181 }
182 
183 static int
184 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
185 {
186 	int	diff;
187 
188 	if ((diff = a->fr_id - b->fr_id))
189 		return (diff);
190 	else if ((diff = a->fr_p - b->fr_p))
191 		return (diff);
192 	else if (a->fr_src.s_addr < b->fr_src.s_addr)
193 		return (-1);
194 	else if (a->fr_src.s_addr > b->fr_src.s_addr)
195 		return (1);
196 	else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
197 		return (-1);
198 	else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
199 		return (1);
200 	return (0);
201 }
202 
203 void
204 pf_purge_expired_fragments(void)
205 {
206 	struct pf_fragment	*frag;
207 	u_int32_t		 expire = time_uptime -
208 				    V_pf_default_rule.timeout[PFTM_FRAG];
209 
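	/*
	 * Both queues are kept in LRU order (pf_find_fragment() moves
	 * hits to the head), so scanning from the tail visits the
	 * oldest entries first.
	 */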
210 	PF_FRAG_LOCK();
211 	while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) {
212 		KASSERT((BUFFER_FRAGMENTS(frag)),
213 		    ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__));
214 		if (frag->fr_timeout > expire)
215 			break;
216 
217 		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
218 		pf_free_fragment(frag);
219 	}
220 
221 	while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) {
222 		KASSERT((!BUFFER_FRAGMENTS(frag)),
223 		    ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__));
224 		if (frag->fr_timeout > expire)
225 			break;
226 
227 		DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
228 		pf_free_fragment(frag);
229 		KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) ||
230 		    TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag),
231 		    ("!(TAILQ_EMPTY() || TAILQ_LAST() == frag): %s",
232 		    __FUNCTION__));
233 	}
234 	PF_FRAG_UNLOCK();
235 }
236 
237 #ifdef INET
238 /*
239  * Try to flush old fragments to make space for new ones
240  */
241 static void
242 pf_flush_fragments(void)
243 {
244 	struct pf_fragment	*frag, *cache;
245 	int			 goal;
246 
247 	PF_FRAG_ASSERT();
248 
249 	goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
250 	DPFPRINTF(("trying to free %d frag entries\n", goal));
251 	while (goal < uma_zone_get_cur(V_pf_frent_z)) {
252 		frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
253 		if (frag)
254 			pf_free_fragment(frag);
255 		cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue);
256 		if (cache)
257 			pf_free_fragment(cache);
258 		if (frag == NULL && cache == NULL)
259 			break;
260 	}
261 }
262 #endif /* INET */
263 
264 /* Frees the fragment and all associated entries */
265 static void
266 pf_free_fragment(struct pf_fragment *frag)
267 {
268 	struct pf_frent		*frent;
269 
270 	PF_FRAG_ASSERT();
271 
272 	/* Free all fragments */
273 	if (BUFFER_FRAGMENTS(frag)) {
274 		for (frent = LIST_FIRST(&frag->fr_queue); frent;
275 		    frent = LIST_FIRST(&frag->fr_queue)) {
276 			LIST_REMOVE(frent, fr_next);
277 
278 			m_freem(frent->fr_m);
279 			uma_zfree(V_pf_frent_z, frent);
280 		}
281 	} else {
282 		for (frent = LIST_FIRST(&frag->fr_queue); frent;
283 		    frent = LIST_FIRST(&frag->fr_queue)) {
284 			LIST_REMOVE(frent, fr_next);
285 
286 			KASSERT((LIST_EMPTY(&frag->fr_queue) ||
287 			    LIST_FIRST(&frag->fr_queue)->fr_off >
288 			    frent->fr_end),
289 			    ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
290 			    " frent->fr_end): %s", __func__));
291 
292 			uma_zfree(V_pf_frent_z, frent);
293 		}
294 	}
295 
296 	pf_remove_fragment(frag);
297 }
298 
299 #ifdef INET
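/*
 * Fill in the RB-tree lookup key: IPv4 fragments belong to the same
 * datagram when they share (ip_id, protocol, source, destination),
 * per RFC 791.
 */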
300 static void
301 pf_ip2key(struct pf_fragment *key, struct ip *ip)
302 {
303 	key->fr_p = ip->ip_p;
304 	key->fr_id = ip->ip_id;
305 	key->fr_src.s_addr = ip->ip_src.s_addr;
306 	key->fr_dst.s_addr = ip->ip_dst.s_addr;
307 }
308 
309 static struct pf_fragment *
310 pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
311 {
312 	struct pf_fragment	 key;
313 	struct pf_fragment	*frag;
314 
315 	PF_FRAG_ASSERT();
316 
317 	pf_ip2key(&key, ip);
318 
319 	frag = RB_FIND(pf_frag_tree, tree, &key);
320 	if (frag != NULL) {
321 		/* XXX Are we sure we want to update the timeout? */
322 		frag->fr_timeout = time_uptime;
323 		if (BUFFER_FRAGMENTS(frag)) {
324 			TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
325 			TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
326 		} else {
327 			TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
328 			TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next);
329 		}
330 	}
331 
332 	return (frag);
333 }
334 #endif /* INET */
335 
336 /* Removes a fragment from the fragment queue and frees the fragment */
337 
338 static void
339 pf_remove_fragment(struct pf_fragment *frag)
340 {
341 
342 	PF_FRAG_ASSERT();
343 
344 	if (BUFFER_FRAGMENTS(frag)) {
345 		RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
346 		TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
347 		uma_zfree(V_pf_frag_z, frag);
348 	} else {
349 		RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag);
350 		TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
351 		uma_zfree(V_pf_frag_z, frag);
352 	}
353 }
354 
355 #ifdef INET
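/* Byte offset of a buffered fragment; the ip_off field counts 8-byte units. */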
356 #define FR_IP_OFF(fr)	((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
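/*
 * Insert frent into the sorted queue of *frag, trimming overlapping
 * data so each byte is kept exactly once.  Returns the reassembled
 * packet once all of the data has arrived, otherwise NULL.
 */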
357 static struct mbuf *
358 pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
359     struct pf_frent *frent, int mff)
360 {
361 	struct mbuf	*m = *m0, *m2;
362 	struct pf_frent	*frea, *next;
363 	struct pf_frent	*frep = NULL;
364 	struct ip	*ip = frent->fr_ip;
365 	int		 hlen = ip->ip_hl << 2;
366 	u_int16_t	 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
367 	u_int16_t	 ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
368 	u_int16_t	 max = ip_len + off;
369 
370 	PF_FRAG_ASSERT();
371 	KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
372 	    ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
373 
374 	/* Strip off ip header */
375 	m->m_data += hlen;
376 	m->m_len -= hlen;
377 
378 	/* Create a new reassembly queue for this packet */
379 	if (*frag == NULL) {
380 		*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
381 		if (*frag == NULL) {
382 			pf_flush_fragments();
383 			*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
384 			if (*frag == NULL)
385 				goto drop_fragment;
386 		}
387 
388 		(*frag)->fr_flags = 0;
389 		(*frag)->fr_max = 0;
390 		(*frag)->fr_src = frent->fr_ip->ip_src;
391 		(*frag)->fr_dst = frent->fr_ip->ip_dst;
392 		(*frag)->fr_p = frent->fr_ip->ip_p;
393 		(*frag)->fr_id = frent->fr_ip->ip_id;
394 		(*frag)->fr_timeout = time_uptime;
395 		LIST_INIT(&(*frag)->fr_queue);
396 
397 		RB_INSERT(pf_frag_tree, &V_pf_frag_tree, *frag);
398 		TAILQ_INSERT_HEAD(&V_pf_fragqueue, *frag, frag_next);
399 
400 		/* We do not have a previous fragment */
401 		frep = NULL;
402 		goto insert;
403 	}
404 
405 	/*
406 	 * Find a fragment after the current one:
407 	 *  - off contains the real shifted offset.
408 	 */
409 	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
410 		if (FR_IP_OFF(frea) > off)
411 			break;
412 		frep = frea;
413 	}
414 
415 	KASSERT((frep != NULL || frea != NULL),
416 	    ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));
417 
418 	if (frep != NULL &&
419 	    FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
420 	    4 > off)
421 	{
422 		u_int16_t	precut;
423 
424 		precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
425 		    frep->fr_ip->ip_hl * 4 - off;
426 		if (precut >= ip_len)
427 			goto drop_fragment;
428 		m_adj(frent->fr_m, precut);
429 		DPFPRINTF(("overlap -%d\n", precut));
430 		/* Enforce 8 byte boundaries */
431 		ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
432 		off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
433 		ip_len -= precut;
434 		ip->ip_len = htons(ip_len);
435 	}
436 
437 	for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
438 	    frea = next)
439 	{
440 		u_int16_t	aftercut;
441 
442 		aftercut = ip_len + off - FR_IP_OFF(frea);
443 		DPFPRINTF(("adjust overlap %d\n", aftercut));
444 		if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
445 		    * 4)
446 		{
447 			frea->fr_ip->ip_len =
448 			    htons(ntohs(frea->fr_ip->ip_len) - aftercut);
449 			frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
450 			    (aftercut >> 3));
451 			m_adj(frea->fr_m, aftercut);
452 			break;
453 		}
454 
455 		/* This fragment is completely overlapped, lose it */
456 		next = LIST_NEXT(frea, fr_next);
457 		m_freem(frea->fr_m);
458 		LIST_REMOVE(frea, fr_next);
459 		uma_zfree(V_pf_frent_z, frea);
460 	}
461 
462  insert:
463 	/* Update maximum data size */
464 	if ((*frag)->fr_max < max)
465 		(*frag)->fr_max = max;
466 	/* This is the last segment */
467 	if (!mff)
468 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
469 
470 	if (frep == NULL)
471 		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
472 	else
473 		LIST_INSERT_AFTER(frep, frent, fr_next);
474 
475 	/* Check if we are completely reassembled */
476 	if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
477 		return (NULL);
478 
479 	/* Check if we have all the data */
480 	off = 0;
481 	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
482 		next = LIST_NEXT(frep, fr_next);
483 
484 		off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
485 		if (off < (*frag)->fr_max &&
486 		    (next == NULL || FR_IP_OFF(next) != off))
487 		{
488 			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
489 			    off, next == NULL ? -1 : FR_IP_OFF(next),
490 			    (*frag)->fr_max));
491 			return (NULL);
492 		}
493 	}
494 	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
495 	if (off < (*frag)->fr_max)
496 		return (NULL);
497 
498 	/* We have all the data */
499 	frent = LIST_FIRST(&(*frag)->fr_queue);
500 	KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__));
501 	if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
502 		DPFPRINTF(("drop: too big: %d\n", off));
503 		pf_free_fragment(*frag);
504 		*frag = NULL;
505 		return (NULL);
506 	}
507 	next = LIST_NEXT(frent, fr_next);
508 
509 	/* Magic from ip_input */
510 	ip = frent->fr_ip;
511 	m = frent->fr_m;
512 	m2 = m->m_next;
513 	m->m_next = NULL;
514 	m_cat(m, m2);
515 	uma_zfree(V_pf_frent_z, frent);
516 	for (frent = next; frent != NULL; frent = next) {
517 		next = LIST_NEXT(frent, fr_next);
518 
519 		m2 = frent->fr_m;
520 		uma_zfree(V_pf_frent_z, frent);
521 		m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags;
522 		m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data;
523 		m_cat(m, m2);
524 	}
525 
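	/* Fold carry bits of the summed csum_data back into 16 bits. */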
526 	while (m->m_pkthdr.csum_data & 0xffff0000)
527 		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
528 		    (m->m_pkthdr.csum_data >> 16);
529 	ip->ip_src = (*frag)->fr_src;
530 	ip->ip_dst = (*frag)->fr_dst;
531 
532 	/* Remove from fragment queue */
533 	pf_remove_fragment(*frag);
534 	*frag = NULL;
535 
536 	hlen = ip->ip_hl << 2;
537 	ip->ip_len = htons(off + hlen);
538 	m->m_len += hlen;
539 	m->m_data -= hlen;
540 
541 	/* some debugging cruft by sklower, below, will go away soon */
542 	/* XXX this should be done elsewhere */
543 	if (m->m_flags & M_PKTHDR) {
544 		int plen = 0;
545 		for (m2 = m; m2; m2 = m2->m_next)
546 			plen += m2->m_len;
547 		m->m_pkthdr.len = plen;
548 	}
549 
550 	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
551 	return (m);
552 
553  drop_fragment:
554 	/* Oops - fail safe - drop packet */
555 	uma_zfree(V_pf_frent_z, frent);
556 	m_freem(m);
557 	return (NULL);
558 }
559 
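/*
 * Non-buffering fragment cache: track which byte ranges of the datagram
 * have already been passed, trim duplicate or overlapping data from the
 * mbuf in place, and let the fragment through without reassembling it.
 * Under PFRULE_FRAGDROP, an overlap marks the whole datagram for
 * dropping.
 */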
560 static struct mbuf *
561 pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
562     int drop, int *nomem)
563 {
564 	struct mbuf		*m = *m0;
565 	struct pf_frent		*frp, *fra, *cur = NULL;
566 	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
567 	u_int16_t		 off = ntohs(h->ip_off) << 3;
568 	u_int16_t		 max = ip_len + off;
569 	int			 hosed = 0;
570 
571 	PF_FRAG_ASSERT();
572 	KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
573 	    ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
574 
575 	/* Create a new range queue for this packet */
576 	if (*frag == NULL) {
577 		*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
578 		if (*frag == NULL) {
579 			pf_flush_fragments();
580 			*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
581 			if (*frag == NULL)
582 				goto no_mem;
583 		}
584 
585 		/* Get an entry for the queue */
586 		cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
587 		if (cur == NULL) {
588 			uma_zfree(V_pf_frag_z, *frag);
589 			*frag = NULL;
590 			goto no_mem;
591 		}
592 
593 		(*frag)->fr_flags = PFFRAG_NOBUFFER;
594 		(*frag)->fr_max = 0;
595 		(*frag)->fr_src = h->ip_src;
596 		(*frag)->fr_dst = h->ip_dst;
597 		(*frag)->fr_p = h->ip_p;
598 		(*frag)->fr_id = h->ip_id;
599 		(*frag)->fr_timeout = time_uptime;
600 
601 		cur->fr_off = off;
602 		cur->fr_end = max;
603 		LIST_INIT(&(*frag)->fr_queue);
604 		LIST_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);
605 
606 		RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag);
607 		TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next);
608 
609 		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
610 
611 		goto pass;
612 	}
613 
614 	/*
615 	 * Find a fragment after the current one:
616 	 *  - off contains the real shifted offset.
617 	 */
618 	frp = NULL;
619 	LIST_FOREACH(fra, &(*frag)->fr_queue, fr_next) {
620 		if (fra->fr_off > off)
621 			break;
622 		frp = fra;
623 	}
624 
625 	KASSERT((frp != NULL || fra != NULL),
626 	    ("!(frp != NULL || fra != NULL): %s", __FUNCTION__));
627 
628 	if (frp != NULL) {
629 		int	precut;
630 
631 		precut = frp->fr_end - off;
632 		if (precut >= ip_len) {
633 			/* Fragment is entirely a duplicate */
634 			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
635 			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
636 			goto drop_fragment;
637 		}
638 		if (precut == 0) {
639 			/* They are adjacent.  Fixup cache entry */
640 			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
641 			    h->ip_id, frp->fr_off, frp->fr_end, off, max));
642 			frp->fr_end = max;
643 		} else if (precut > 0) {
644 			/* The first part of this payload overlaps with a
645 			 * fragment that has already been passed.
646 			 * Need to trim off the first part of the payload.
647 			 * But to do so easily, we need to create another
648 			 * mbuf to throw the original header into.
649 			 */
650 
651 			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
652 			    h->ip_id, precut, frp->fr_off, frp->fr_end, off,
653 			    max));
654 
655 			off += precut;
656 			max -= precut;
657 			/* Update the previous frag to encompass this one */
658 			frp->fr_end = max;
659 
660 			if (!drop) {
661 				/* XXX Optimization opportunity
662 				 * This is a very heavy way to trim the payload.
663 				 * we could do it much faster by diddling mbuf
664 				 * internals but that would be even less legible
665 				 * than this mbuf magic.  For my next trick,
666 				 * I'll pull a rabbit out of my laptop.
667 				 */
668 				*m0 = m_dup(m, M_NOWAIT);
669 				if (*m0 == NULL)
670 					goto no_mem;
671 				/* From KAME Project : We have missed this! */
672 				m_adj(*m0, (h->ip_hl << 2) -
673 				    (*m0)->m_pkthdr.len);
674 
675 				KASSERT(((*m0)->m_next == NULL),
676 				    ("(*m0)->m_next != NULL: %s",
677 				    __FUNCTION__));
678 				m_adj(m, precut + (h->ip_hl << 2));
679 				m_cat(*m0, m);
680 				m = *m0;
681 				if (m->m_flags & M_PKTHDR) {
682 					int plen = 0;
683 					struct mbuf *t;
684 					for (t = m; t; t = t->m_next)
685 						plen += t->m_len;
686 					m->m_pkthdr.len = plen;
687 				}
688 
689 
690 				h = mtod(m, struct ip *);
691 
692 				KASSERT(((int)m->m_len ==
693 				    ntohs(h->ip_len) - precut),
694 				    ("m->m_len != ntohs(h->ip_len) - precut: %s",
695 				    __FUNCTION__));
696 				h->ip_off = htons(ntohs(h->ip_off) +
697 				    (precut >> 3));
698 				h->ip_len = htons(ntohs(h->ip_len) - precut);
699 			} else {
700 				hosed++;
701 			}
702 		} else {
703 			/* There is a gap between fragments */
704 
705 			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
706 			    h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
707 			    max));
708 
709 			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
710 			if (cur == NULL)
711 				goto no_mem;
712 
713 			cur->fr_off = off;
714 			cur->fr_end = max;
715 			LIST_INSERT_AFTER(frp, cur, fr_next);
716 		}
717 	}
718 
719 	if (fra != NULL) {
720 		int	aftercut;
721 		int	merge = 0;
722 
723 		aftercut = max - fra->fr_off;
724 		if (aftercut == 0) {
725 			/* Adjacent fragments */
726 			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
727 			    h->ip_id, off, max, fra->fr_off, fra->fr_end));
728 			fra->fr_off = off;
729 			merge = 1;
730 		} else if (aftercut > 0) {
731 			/* Need to chop off the tail of this fragment */
732 			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
733 			    h->ip_id, aftercut, off, max, fra->fr_off,
734 			    fra->fr_end));
735 			fra->fr_off = off;
736 			max -= aftercut;
737 
738 			merge = 1;
739 
740 			if (!drop) {
741 				m_adj(m, -aftercut);
742 				if (m->m_flags & M_PKTHDR) {
743 					int plen = 0;
744 					struct mbuf *t;
745 					for (t = m; t; t = t->m_next)
746 						plen += t->m_len;
747 					m->m_pkthdr.len = plen;
748 				}
749 				h = mtod(m, struct ip *);
750 				KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut),
751 				    ("m->m_len != ntohs(h->ip_len) - aftercut: %s",
752 				    __FUNCTION__));
753 				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
754 			} else {
755 				hosed++;
756 			}
757 		} else if (frp == NULL) {
758 			/* There is a gap between fragments */
759 			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
760 			    h->ip_id, -aftercut, off, max, fra->fr_off,
761 			    fra->fr_end));
762 
763 			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
764 			if (cur == NULL)
765 				goto no_mem;
766 
767 			cur->fr_off = off;
768 			cur->fr_end = max;
769 			LIST_INSERT_BEFORE(fra, cur, fr_next);
770 		}
771 
772 
773 		/* Need to glue together two separate fragment descriptors */
774 		if (merge) {
775 			if (cur && fra->fr_off <= cur->fr_end) {
776 				/* Need to merge in a previous 'cur' */
777 				DPFPRINTF(("fragcache[%d]: adjacent(merge "
778 				    "%d-%d) %d-%d (%d-%d)\n",
779 				    h->ip_id, cur->fr_off, cur->fr_end, off,
780 				    max, fra->fr_off, fra->fr_end));
781 				fra->fr_off = cur->fr_off;
782 				LIST_REMOVE(cur, fr_next);
783 				uma_zfree(V_pf_frent_z, cur);
784 				cur = NULL;
785 
786 			} else if (frp && fra->fr_off <= frp->fr_end) {
787 				/* Need to merge in a modified 'frp' */
788 				KASSERT((cur == NULL), ("cur != NULL: %s",
789 				    __FUNCTION__));
790 				DPFPRINTF(("fragcache[%d]: adjacent(merge "
791 				    "%d-%d) %d-%d (%d-%d)\n",
792 				    h->ip_id, frp->fr_off, frp->fr_end, off,
793 				    max, fra->fr_off, fra->fr_end));
794 				fra->fr_off = frp->fr_off;
795 				LIST_REMOVE(frp, fr_next);
796 				uma_zfree(V_pf_frent_z, frp);
797 				frp = NULL;
798 
799 			}
800 		}
801 	}
802 
803 	if (hosed) {
804 		/*
805 		 * We must keep tracking the overall fragment even when
806 		 * we're going to drop it anyway so that we know when to
807 		 * free the overall descriptor.  Thus we drop the frag late.
808 		 */
809 		goto drop_fragment;
810 	}
811 
812 
813  pass:
814 	/* Update maximum data size */
815 	if ((*frag)->fr_max < max)
816 		(*frag)->fr_max = max;
817 
818 	/* This is the last segment */
819 	if (!mff)
820 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
821 
822 	/* Check if we are completely reassembled */
823 	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
824 	    LIST_FIRST(&(*frag)->fr_queue)->fr_off == 0 &&
825 	    LIST_FIRST(&(*frag)->fr_queue)->fr_end == (*frag)->fr_max) {
826 		/* Remove from fragment queue */
827 		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
828 		    (*frag)->fr_max));
829 		pf_free_fragment(*frag);
830 		*frag = NULL;
831 	}
832 
833 	return (m);
834 
835  no_mem:
836 	*nomem = 1;
837 
838 	/* Still need to pay attention to !IP_MF */
839 	if (!mff && *frag != NULL)
840 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
841 
842 	m_freem(m);
843 	return (NULL);
844 
845  drop_fragment:
846 
847 	/* Still need to pay attention to !IP_MF */
848 	if (!mff && *frag != NULL)
849 		(*frag)->fr_flags |= PFFRAG_SEENLAST;
850 
851 	if (drop) {
852 		/* This fragment has been deemed bad.  Don't reassemble. */
853 		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
854 			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
855 			    h->ip_id));
856 		(*frag)->fr_flags |= PFFRAG_DROP;
857 	}
858 
859 	m_freem(m);
860 	return (NULL);
861 }
862 
863 int
864 pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
865     struct pf_pdesc *pd)
866 {
867 	struct mbuf		*m = *m0;
868 	struct pf_rule		*r;
869 	struct pf_frent		*frent;
870 	struct pf_fragment	*frag = NULL;
871 	struct ip		*h = mtod(m, struct ip *);
872 	int			 mff = (ntohs(h->ip_off) & IP_MF);
873 	int			 hlen = h->ip_hl << 2;
874 	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
875 	u_int16_t		 max;
876 	int			 ip_len;
877 	int			 ip_off;
878 	int			 tag = -1;
879 
880 	PF_RULES_RASSERT();
881 
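	/*
	 * Find the first matching scrub rule; the precomputed skip
	 * steps let us jump over runs of rules that cannot match.
	 */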
882 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
883 	while (r != NULL) {
884 		r->evaluations++;
885 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
886 			r = r->skip[PF_SKIP_IFP].ptr;
887 		else if (r->direction && r->direction != dir)
888 			r = r->skip[PF_SKIP_DIR].ptr;
889 		else if (r->af && r->af != AF_INET)
890 			r = r->skip[PF_SKIP_AF].ptr;
891 		else if (r->proto && r->proto != h->ip_p)
892 			r = r->skip[PF_SKIP_PROTO].ptr;
893 		else if (PF_MISMATCHAW(&r->src.addr,
894 		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
895 		    r->src.neg, kif, M_GETFIB(m)))
896 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
897 		else if (PF_MISMATCHAW(&r->dst.addr,
898 		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
899 		    r->dst.neg, NULL, M_GETFIB(m)))
900 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
901 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
902 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
903 			r = TAILQ_NEXT(r, entries);
904 		else
905 			break;
906 	}
907 
908 	if (r == NULL || r->action == PF_NOSCRUB)
909 		return (PF_PASS);
910 	else {
911 		r->packets[dir == PF_OUT]++;
912 		r->bytes[dir == PF_OUT] += pd->tot_len;
913 	}
914 
915 	/* Check for illegal packets */
916 	if (hlen < (int)sizeof(struct ip))
917 		goto drop;
918 
919 	if (hlen > ntohs(h->ip_len))
920 		goto drop;
921 
922 	/* Clear IP_DF if the rule uses the no-df option */
923 	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
924 		u_int16_t ip_off = h->ip_off;
925 
926 		h->ip_off &= htons(~IP_DF);
927 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
928 	}
929 
930 	/* We will need other tests here */
931 	if (!fragoff && !mff)
932 		goto no_fragment;
933 
934 	/* We're dealing with a fragment now. Don't allow fragments
935 	 * with IP_DF to enter the cache. If the flag was cleared by
936 	 * no-df above, fine. Otherwise drop it.
937 	 */
938 	if (h->ip_off & htons(IP_DF)) {
939 		DPFPRINTF(("IP_DF\n"));
940 		goto bad;
941 	}
942 
943 	ip_len = ntohs(h->ip_len) - hlen;
944 	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
945 
946 	/* All fragments but the last must be 8 byte aligned */
947 	if (mff && (ip_len & 0x7)) {
948 		DPFPRINTF(("mff and %d\n", ip_len));
949 		goto bad;
950 	}
951 
952 	/* Respect maximum length */
953 	if (fragoff + ip_len > IP_MAXPACKET) {
954 		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
955 		goto bad;
956 	}
957 	max = fragoff + ip_len;
958 
959 	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
960 
961 		/* Fully buffer all of the fragments */
962 		PF_FRAG_LOCK();
963 		frag = pf_find_fragment(h, &V_pf_frag_tree);
964 
965 		/* Check if we saw the last fragment already */
966 		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
967 		    max > frag->fr_max)
968 			goto bad;
969 
970 		/* Get an entry for the fragment queue */
971 		frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
972 		if (frent == NULL) {
973 			PF_FRAG_UNLOCK();
974 			REASON_SET(reason, PFRES_MEMORY);
975 			return (PF_DROP);
976 		}
977 		frent->fr_ip = h;
978 		frent->fr_m = m;
979 
980 		/* Might return a completely reassembled mbuf, or NULL */
981 		DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
982 		*m0 = m = pf_reassemble(m0, &frag, frent, mff);
983 		PF_FRAG_UNLOCK();
984 
985 		if (m == NULL)
986 			return (PF_DROP);
987 
988 		/* use mtag from concatenated mbuf chain */
989 		pd->pf_mtag = pf_find_mtag(m);
990 #ifdef DIAGNOSTIC
991 		if (pd->pf_mtag == NULL) {
992 			printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
993 			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
994 				m_freem(m);
995 				*m0 = NULL;
996 				goto no_mem;
997 			}
998 		}
999 #endif
1000 		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1001 			goto drop;
1002 
1003 		h = mtod(m, struct ip *);
1004 	} else {
1005 		/* non-buffering fragment cache (drops or masks overlaps) */
1006 		int	nomem = 0;
1007 
1008 		if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) {
1009 			/*
1010 			 * Already passed the fragment cache in the
1011 			 * input direction.  If we continued, it would
1012 			 * appear to be a dup and would be dropped.
1013 			 */
1014 			goto fragment_pass;
1015 		}
1016 
1017 		PF_FRAG_LOCK();
1018 		frag = pf_find_fragment(h, &V_pf_cache_tree);
1019 
1020 		/* Check if we saw the last fragment already */
1021 		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1022 		    max > frag->fr_max) {
1023 			if (r->rule_flag & PFRULE_FRAGDROP)
1024 				frag->fr_flags |= PFFRAG_DROP;
1025 			goto bad;
1026 		}
1027 
1028 		*m0 = m = pf_fragcache(m0, h, &frag, mff,
1029 		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
1030 		PF_FRAG_UNLOCK();
1031 		if (m == NULL) {
1032 			if (nomem)
1033 				goto no_mem;
1034 			goto drop;
1035 		}
1036 
1037 		/* use mtag from copied and trimmed mbuf chain */
1038 		pd->pf_mtag = pf_find_mtag(m);
1039 #ifdef DIAGNOSTIC
1040 		if (pd->pf_mtag == NULL) {
1041 			printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
1042 			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
1043 				m_freem(m);
1044 				*m0 = NULL;
1045 				goto no_mem;
1046 			}
1047 		}
1048 #endif
1049 		if (dir == PF_IN)
1050 			pd->pf_mtag->flags |= PF_TAG_FRAGCACHE;
1051 
1052 		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1053 			goto drop;
1054 		goto fragment_pass;
1055 	}
1056 
1057  no_fragment:
1058 	/* At this point, only IP_DF is allowed in ip_off */
1059 	if (h->ip_off & ~htons(IP_DF)) {
1060 		u_int16_t ip_off = h->ip_off;
1061 
1062 		h->ip_off &= htons(IP_DF);
1063 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1064 	}
1065 
1066 	/* not missing a return here */
1067 
1068  fragment_pass:
1069 	pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
1070 
1071 	if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1072 		pd->flags |= PFDESC_IP_REAS;
1073 	return (PF_PASS);
1074 
1075  no_mem:
1076 	REASON_SET(reason, PFRES_MEMORY);
1077 	if (r != NULL && r->log)
1078 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1079 		    1);
1080 	return (PF_DROP);
1081 
1082  drop:
1083 	REASON_SET(reason, PFRES_NORM);
1084 	if (r != NULL && r->log)
1085 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1086 		    1);
1087 	return (PF_DROP);
1088 
1089  bad:
1090 	DPFPRINTF(("dropping bad fragment\n"));
1091 
1092 	/* Free associated fragments */
1093 	if (frag != NULL) {
1094 		pf_free_fragment(frag);
1095 		PF_FRAG_UNLOCK();
1096 	}
1097 
1098 	REASON_SET(reason, PFRES_FRAG);
1099 	if (r != NULL && r->log)
1100 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
1101 		    1);
1102 
1103 	return (PF_DROP);
1104 }
1105 #endif
1106 
1107 #ifdef INET6
1108 int
1109 pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
1110     u_short *reason, struct pf_pdesc *pd)
1111 {
1112 	struct mbuf		*m = *m0;
1113 	struct pf_rule		*r;
1114 	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
1115 	int			 off;
1116 	struct ip6_ext		 ext;
1117 	struct ip6_opt		 opt;
1118 	struct ip6_opt_jumbo	 jumbo;
1119 	struct ip6_frag		 frag;
1120 	u_int32_t		 jumbolen = 0, plen;
1121 	u_int16_t		 fragoff = 0;
1122 	int			 optend;
1123 	int			 ooff;
1124 	u_int8_t		 proto;
1125 	int			 terminal;
1126 
1127 	PF_RULES_RASSERT();
1128 
1129 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1130 	while (r != NULL) {
1131 		r->evaluations++;
1132 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1133 			r = r->skip[PF_SKIP_IFP].ptr;
1134 		else if (r->direction && r->direction != dir)
1135 			r = r->skip[PF_SKIP_DIR].ptr;
1136 		else if (r->af && r->af != AF_INET6)
1137 			r = r->skip[PF_SKIP_AF].ptr;
1138 #if 0 /* header chain! */
1139 		else if (r->proto && r->proto != h->ip6_nxt)
1140 			r = r->skip[PF_SKIP_PROTO].ptr;
1141 #endif
1142 		else if (PF_MISMATCHAW(&r->src.addr,
1143 		    (struct pf_addr *)&h->ip6_src, AF_INET6,
1144 		    r->src.neg, kif, M_GETFIB(m)))
1145 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1146 		else if (PF_MISMATCHAW(&r->dst.addr,
1147 		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
1148 		    r->dst.neg, NULL, M_GETFIB(m)))
1149 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1150 		else
1151 			break;
1152 	}
1153 
1154 	if (r == NULL || r->action == PF_NOSCRUB)
1155 		return (PF_PASS);
1156 	else {
1157 		r->packets[dir == PF_OUT]++;
1158 		r->bytes[dir == PF_OUT] += pd->tot_len;
1159 	}
1160 
1161 	/* Check for illegal packets */
1162 	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
1163 		goto drop;
1164 
1165 	off = sizeof(struct ip6_hdr);
1166 	proto = h->ip6_nxt;
1167 	terminal = 0;
1168 	do {
1169 		switch (proto) {
1170 		case IPPROTO_FRAGMENT:
1171 			goto fragment;
1172 			break;
1173 		case IPPROTO_AH:
1174 		case IPPROTO_ROUTING:
1175 		case IPPROTO_DSTOPTS:
1176 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1177 			    NULL, AF_INET6))
1178 				goto shortpkt;
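			/*
			 * AH measures its length in 32-bit words minus
			 * two; the other extension headers use 8-byte
			 * units, not counting the first 8 bytes.
			 */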
1179 			if (proto == IPPROTO_AH)
1180 				off += (ext.ip6e_len + 2) * 4;
1181 			else
1182 				off += (ext.ip6e_len + 1) * 8;
1183 			proto = ext.ip6e_nxt;
1184 			break;
1185 		case IPPROTO_HOPOPTS:
1186 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
1187 			    NULL, AF_INET6))
1188 				goto shortpkt;
1189 			optend = off + (ext.ip6e_len + 1) * 8;
1190 			ooff = off + sizeof(ext);
1191 			do {
1192 				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
1193 				    sizeof(opt.ip6o_type), NULL, NULL,
1194 				    AF_INET6))
1195 					goto shortpkt;
1196 				if (opt.ip6o_type == IP6OPT_PAD1) {
1197 					ooff++;
1198 					continue;
1199 				}
1200 				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
1201 				    NULL, NULL, AF_INET6))
1202 					goto shortpkt;
1203 				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
1204 					goto drop;
1205 				switch (opt.ip6o_type) {
1206 				case IP6OPT_JUMBO:
1207 					if (h->ip6_plen != 0)
1208 						goto drop;
1209 					if (!pf_pull_hdr(m, ooff, &jumbo,
1210 					    sizeof(jumbo), NULL, NULL,
1211 					    AF_INET6))
1212 						goto shortpkt;
1213 					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
1214 					    sizeof(jumbolen));
1215 					jumbolen = ntohl(jumbolen);
1216 					if (jumbolen <= IPV6_MAXPACKET)
1217 						goto drop;
1218 					if (sizeof(struct ip6_hdr) + jumbolen !=
1219 					    m->m_pkthdr.len)
1220 						goto drop;
1221 					break;
1222 				default:
1223 					break;
1224 				}
1225 				ooff += sizeof(opt) + opt.ip6o_len;
1226 			} while (ooff < optend);
1227 
1228 			off = optend;
1229 			proto = ext.ip6e_nxt;
1230 			break;
1231 		default:
1232 			terminal = 1;
1233 			break;
1234 		}
1235 	} while (!terminal);
1236 
1237 	/* jumbo payload option must be present, or plen > 0 */
1238 	if (ntohs(h->ip6_plen) == 0)
1239 		plen = jumbolen;
1240 	else
1241 		plen = ntohs(h->ip6_plen);
1242 	if (plen == 0)
1243 		goto drop;
1244 	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
1245 		goto shortpkt;
1246 
1247 	pf_scrub_ip6(&m, r->min_ttl);
1248 
1249 	return (PF_PASS);
1250 
1251  fragment:
1252 	if (ntohs(h->ip6_plen) == 0 || jumbolen)
1253 		goto drop;
1254 	plen = ntohs(h->ip6_plen);
1255 
1256 	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
1257 		goto shortpkt;
1258 	fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
1259 	if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
1260 		goto badfrag;
1261 
1262 	/* do something about it */
1263 	/* remember to set pd->flags |= PFDESC_IP_REAS */
1264 	return (PF_PASS);
1265 
1266  shortpkt:
1267 	REASON_SET(reason, PFRES_SHORT);
1268 	if (r != NULL && r->log)
1269 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
1270 		    1);
1271 	return (PF_DROP);
1272 
1273  drop:
1274 	REASON_SET(reason, PFRES_NORM);
1275 	if (r != NULL && r->log)
1276 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
1277 		    1);
1278 	return (PF_DROP);
1279 
1280  badfrag:
1281 	REASON_SET(reason, PFRES_FRAG);
1282 	if (r != NULL && r->log)
1283 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
1284 		    1);
1285 	return (PF_DROP);
1286 }
1287 #endif /* INET6 */
1288 
1289 int
1290 pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
1291     int off, void *h, struct pf_pdesc *pd)
1292 {
1293 	struct pf_rule	*r, *rm = NULL;
1294 	struct tcphdr	*th = pd->hdr.tcp;
1295 	int		 rewrite = 0;
1296 	u_short		 reason;
1297 	u_int8_t	 flags;
1298 	sa_family_t	 af = pd->af;
1299 
1300 	PF_RULES_RASSERT();
1301 
1302 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1303 	while (r != NULL) {
1304 		r->evaluations++;
1305 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
1306 			r = r->skip[PF_SKIP_IFP].ptr;
1307 		else if (r->direction && r->direction != dir)
1308 			r = r->skip[PF_SKIP_DIR].ptr;
1309 		else if (r->af && r->af != af)
1310 			r = r->skip[PF_SKIP_AF].ptr;
1311 		else if (r->proto && r->proto != pd->proto)
1312 			r = r->skip[PF_SKIP_PROTO].ptr;
1313 		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
1314 		    r->src.neg, kif, M_GETFIB(m)))
1315 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1316 		else if (r->src.port_op && !pf_match_port(r->src.port_op,
1317 			    r->src.port[0], r->src.port[1], th->th_sport))
1318 			r = r->skip[PF_SKIP_SRC_PORT].ptr;
1319 		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
1320 		    r->dst.neg, NULL, M_GETFIB(m)))
1321 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
1322 		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
1323 			    r->dst.port[0], r->dst.port[1], th->th_dport))
1324 			r = r->skip[PF_SKIP_DST_PORT].ptr;
1325 		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
1326 			    pf_osfp_fingerprint(pd, m, off, th),
1327 			    r->os_fingerprint))
1328 			r = TAILQ_NEXT(r, entries);
1329 		else {
1330 			rm = r;
1331 			break;
1332 		}
1333 	}
1334 
1335 	if (rm == NULL || rm->action == PF_NOSCRUB)
1336 		return (PF_PASS);
1337 	else {
1338 		r->packets[dir == PF_OUT]++;
1339 		r->bytes[dir == PF_OUT] += pd->tot_len;
1340 	}
1341 
1342 	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
1343 		pd->flags |= PFDESC_TCP_NORM;
1344 
1345 	flags = th->th_flags;
1346 	if (flags & TH_SYN) {
1347 		/* Illegal packet */
1348 		if (flags & TH_RST)
1349 			goto tcp_drop;
1350 
1351 		if (flags & TH_FIN)
1352 			flags &= ~TH_FIN;
1353 	} else {
1354 		/* Illegal packet */
1355 		if (!(flags & (TH_ACK|TH_RST)))
1356 			goto tcp_drop;
1357 	}
1358 
1359 	if (!(flags & TH_ACK)) {
1360 		/* These flags are only valid if ACK is set */
1361 		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
1362 			goto tcp_drop;
1363 	}
1364 
1365 	/* Check for illegal header length */
1366 	if (th->th_off < (sizeof(struct tcphdr) >> 2))
1367 		goto tcp_drop;
1368 
1369 	/* If flags changed, or reserved data set, then adjust */
1370 	if (flags != th->th_flags || th->th_x2 != 0) {
1371 		u_int16_t	ov, nv;
1372 
1373 		ov = *(u_int16_t *)(&th->th_ack + 1);
1374 		th->th_flags = flags;
1375 		th->th_x2 = 0;
1376 		nv = *(u_int16_t *)(&th->th_ack + 1);
1377 
1378 		th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
1379 		rewrite = 1;
1380 	}
1381 
1382 	/* Remove urgent pointer, if TH_URG is not set */
1383 	if (!(flags & TH_URG) && th->th_urp) {
1384 		th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
1385 		th->th_urp = 0;
1386 		rewrite = 1;
1387 	}
1388 
1389 	/* Process options */
1390 	if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
1391 		rewrite = 1;
1392 
1393 	/* copy back packet headers if we sanitized */
1394 	if (rewrite)
1395 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
1396 
1397 	return (PF_PASS);
1398 
1399  tcp_drop:
1400 	REASON_SET(&reason, PFRES_NORM);
1401 	if (rm != NULL && r->log)
1402 		PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd,
1403 		    1);
1404 	return (PF_DROP);
1405 }
1406 
1407 int
1408 pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
1409     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
1410 {
1411 	u_int32_t tsval, tsecr;
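	/* 60 bytes: the largest possible TCP header (data offset 15 << 2) */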
1412 	u_int8_t hdr[60];
1413 	u_int8_t *opt;
1414 
1415 	KASSERT((src->scrub == NULL),
1416 	    ("pf_normalize_tcp_init: src->scrub != NULL"));
1417 
1418 	src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT);
1419 	if (src->scrub == NULL)
1420 		return (1);
1421 
1422 	switch (pd->af) {
1423 #ifdef INET
1424 	case AF_INET: {
1425 		struct ip *h = mtod(m, struct ip *);
1426 		src->scrub->pfss_ttl = h->ip_ttl;
1427 		break;
1428 	}
1429 #endif /* INET */
1430 #ifdef INET6
1431 	case AF_INET6: {
1432 		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1433 		src->scrub->pfss_ttl = h->ip6_hlim;
1434 		break;
1435 	}
1436 #endif /* INET6 */
1437 	}
1438 
1439 
1440 	/*
1441 	 * All normalizations below are only begun if we see the start of
1442 	 * the connection.  They must all set an enabled bit in pfss_flags.
1443 	 */
1444 	if ((th->th_flags & TH_SYN) == 0)
1445 		return (0);
1446 
1447 
1448 	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
1449 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1450 		/* Diddle with TCP options */
1451 		int hlen;
1452 		opt = hdr + sizeof(struct tcphdr);
1453 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1454 		while (hlen >= TCPOLEN_TIMESTAMP) {
1455 			switch (*opt) {
1456 			case TCPOPT_EOL:	/* FALLTHROUGH */
1457 			case TCPOPT_NOP:
1458 				opt++;
1459 				hlen--;
1460 				break;
1461 			case TCPOPT_TIMESTAMP:
1462 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1463 					src->scrub->pfss_flags |=
1464 					    PFSS_TIMESTAMP;
1465 					src->scrub->pfss_ts_mod =
1466 					    htonl(arc4random());
1467 
1468 					/* note PFSS_PAWS not set yet */
1469 					memcpy(&tsval, &opt[2],
1470 					    sizeof(u_int32_t));
1471 					memcpy(&tsecr, &opt[6],
1472 					    sizeof(u_int32_t));
1473 					src->scrub->pfss_tsval0 = ntohl(tsval);
1474 					src->scrub->pfss_tsval = ntohl(tsval);
1475 					src->scrub->pfss_tsecr = ntohl(tsecr);
1476 					getmicrouptime(&src->scrub->pfss_last);
1477 				}
1478 				/* FALLTHROUGH */
1479 			default:
1480 				hlen -= MAX(opt[1], 2);
1481 				opt += MAX(opt[1], 2);
1482 				break;
1483 			}
1484 		}
1485 	}
1486 
1487 	return (0);
1488 }
1489 
1490 void
1491 pf_normalize_tcp_cleanup(struct pf_state *state)
1492 {
1493 	if (state->src.scrub)
1494 		uma_zfree(V_pf_state_scrub_z, state->src.scrub);
1495 	if (state->dst.scrub)
1496 		uma_zfree(V_pf_state_scrub_z, state->dst.scrub);
1497 
1498 	/* Someday... flush the TCP segment reassembly descriptors. */
1499 }
1500 
1501 int
1502 pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
1503     u_short *reason, struct tcphdr *th, struct pf_state *state,
1504     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
1505 {
1506 	struct timeval uptime;
1507 	u_int32_t tsval, tsecr;
1508 	u_int tsval_from_last;
1509 	u_int8_t hdr[60];
1510 	u_int8_t *opt;
1511 	int copyback = 0;
1512 	int got_ts = 0;
1513 
1514 	KASSERT((src->scrub || dst->scrub),
1515 	    ("%s: src->scrub == NULL && dst->scrub == NULL", __func__));
1516 
1517 	/*
1518 	 * Enforce the minimum TTL seen for this connection.  Negate a common
1519 	 * technique to evade an intrusion detection system and confuse
1520 	 * firewall state code.
1521 	 */
1522 	switch (pd->af) {
1523 #ifdef INET
1524 	case AF_INET: {
1525 		if (src->scrub) {
1526 			struct ip *h = mtod(m, struct ip *);
1527 			if (h->ip_ttl > src->scrub->pfss_ttl)
1528 				src->scrub->pfss_ttl = h->ip_ttl;
1529 			h->ip_ttl = src->scrub->pfss_ttl;
1530 		}
1531 		break;
1532 	}
1533 #endif /* INET */
1534 #ifdef INET6
1535 	case AF_INET6: {
1536 		if (src->scrub) {
1537 			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
1538 			if (h->ip6_hlim > src->scrub->pfss_ttl)
1539 				src->scrub->pfss_ttl = h->ip6_hlim;
1540 			h->ip6_hlim = src->scrub->pfss_ttl;
1541 		}
1542 		break;
1543 	}
1544 #endif /* INET6 */
1545 	}
1546 
1547 	if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
1548 	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
1549 	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
1550 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
1551 		/* Diddle with TCP options */
1552 		int hlen;
1553 		opt = hdr + sizeof(struct tcphdr);
1554 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
1555 		while (hlen >= TCPOLEN_TIMESTAMP) {
1556 			switch (*opt) {
1557 			case TCPOPT_EOL:	/* FALLTHROUGH */
1558 			case TCPOPT_NOP:
1559 				opt++;
1560 				hlen--;
1561 				break;
1562 			case TCPOPT_TIMESTAMP:
1563 				/* Modulate the timestamps.  Can be used for
1564 				 * NAT detection, OS uptime determination or
1565 				 * reboot detection.
1566 				 */
1567 
1568 				if (got_ts) {
1569 					/* Huh?  Multiple timestamps!? */
1570 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
1571 						DPFPRINTF(("multiple TS??"));
1572 						pf_print_state(state);
1573 						printf("\n");
1574 					}
1575 					REASON_SET(reason, PFRES_TS);
1576 					return (PF_DROP);
1577 				}
1578 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
1579 					memcpy(&tsval, &opt[2],
1580 					    sizeof(u_int32_t));
1581 					if (tsval && src->scrub &&
1582 					    (src->scrub->pfss_flags &
1583 					    PFSS_TIMESTAMP)) {
1584 						tsval = ntohl(tsval);
1585 						pf_change_a(&opt[2],
1586 						    &th->th_sum,
1587 						    htonl(tsval +
1588 						    src->scrub->pfss_ts_mod),
1589 						    0);
1590 						copyback = 1;
1591 					}
1592 
1593 					/* Modulate TS reply iff valid (!0) */
1594 					memcpy(&tsecr, &opt[6],
1595 					    sizeof(u_int32_t));
1596 					if (tsecr && dst->scrub &&
1597 					    (dst->scrub->pfss_flags &
1598 					    PFSS_TIMESTAMP)) {
1599 						tsecr = ntohl(tsecr)
1600 						    - dst->scrub->pfss_ts_mod;
1601 						pf_change_a(&opt[6],
1602 						    &th->th_sum, htonl(tsecr),
1603 						    0);
1604 						copyback = 1;
1605 					}
1606 					got_ts = 1;
1607 				}
1608 				/* FALLTHROUGH */
1609 			default:
1610 				hlen -= MAX(opt[1], 2);
1611 				opt += MAX(opt[1], 2);
1612 				break;
1613 			}
1614 		}
1615 		if (copyback) {
1616 			/* Copyback the options, caller copies back the header */
1617 			*writeback = 1;
1618 			m_copyback(m, off + sizeof(struct tcphdr),
1619 			    (th->th_off << 2) - sizeof(struct tcphdr), hdr +
1620 			    sizeof(struct tcphdr));
1621 		}
1622 	}
1623 
1624 
1625 	/*
1626 	 * Must invalidate PAWS checks on connections idle for too long.
1627 	 * The fastest allowed timestamp clock is 1ms.  That turns out to
1628 	 * be about 24 days before it wraps.  XXX Right now our lowerbound
1629 	 * TS echo check only works for the first 12 days of a connection,
1630 	 * by which time the TS may have exhausted half its 32bit space
1631 	 */
1632 #define TS_MAX_IDLE	(24*24*60*60)
1633 #define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
1634 
1635 	getmicrouptime(&uptime);
1636 	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
1637 	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
1638 	    time_uptime - state->creation > TS_MAX_CONN))  {
1639 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1640 			DPFPRINTF(("src idled out of PAWS\n"));
1641 			pf_print_state(state);
1642 			printf("\n");
1643 		}
1644 		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
1645 		    | PFSS_PAWS_IDLED;
1646 	}
1647 	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
1648 	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
1649 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
1650 			DPFPRINTF(("dst idled out of PAWS\n"));
1651 			pf_print_state(state);
1652 			printf("\n");
1653 		}
1654 		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
1655 		    | PFSS_PAWS_IDLED;
1656 	}
1657 
1658 	if (got_ts && src->scrub && dst->scrub &&
1659 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
1660 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
1661 		/* Validate that the timestamps are "in-window".
1662 		 * RFC1323 describes TCP Timestamp options that allow
1663 		 * measurement of RTT (round trip time) and PAWS
1664 		 * (protection against wrapped sequence numbers).  PAWS
1665 		 * gives us a set of rules for rejecting packets on
1666 		 * long fat pipes (packets that were somehow delayed
1667 		 * in transit longer than the time it took to send the
1668 		 * full TCP sequence space of 4Gb).  We can use these
1669 		 * rules and infer a few others that will let us treat
1670 		 * the 32bit timestamp and the 32bit echoed timestamp
1671 		 * as sequence numbers to prevent a blind attacker from
1672 		 * inserting packets into a connection.
1673 		 *
1674 		 * RFC1323 tells us:
1675 		 *  - The timestamp on this packet must be greater than
1676 		 *    or equal to the last value echoed by the other
1677 		 *    endpoint.  The RFC says those will be discarded
1678 		 *    since it is a dup that has already been acked.
1679 		 *    This gives us a lowerbound on the timestamp.
1680 		 *        timestamp >= other last echoed timestamp
1681 		 *  - The timestamp will be less than or equal to
1682 		 *    the last timestamp plus the time between the
1683 		 *    last packet and now.  The RFC defines the max
1684 		 *    clock rate as 1ms.  We will allow clocks to be
1685 		 *    up to 10% fast and will allow a total difference
1686 		 *    of 30 seconds due to a route change.  And this
1687 		 *    gives us an upperbound on the timestamp.
1688 		 *        timestamp <= last timestamp + max ticks
1689 		 *    We have to be careful here.  Windows will send an
1690 		 *    initial timestamp of zero and then initialize it
1691 		 *    to a random value after the 3whs; presumably to
1692 		 *    avoid a DoS by having to call an expensive RNG
1693 		 *    during a SYN flood.  Proof MS has at least one
1694 		 *    good security geek.
1695 		 *
1696 		 *  - The TCP timestamp option must also echo the other
1697 		 *    endpoint's timestamp.  The timestamp echoed is the
1698 		 *    one carried on the earliest unacknowledged segment
1699 		 *    on the left edge of the sequence window.  The RFC
1700 		 *    states that the host will reject any echoed
1701 		 *    timestamps that were larger than any ever sent.
1702 		 *    This gives us an upperbound on the TS echo.
1703 		 *        tsecr <= largest_tsval
1704 		 *  - The lowerbound on the TS echo is a little more
1705 		 *    tricky to determine.  The other endpoint's echoed
1706 		 *    values will not decrease.  But there may be
1707 		 *    network conditions that re-order packets and
1708 		 *    cause our view of them to decrease.  For now the
1709 		 *    only lowerbound we can safely determine is that
1710 		 *    the TS echo will never be less than the original
1711 		 *    TS.  XXX There is probably a better lowerbound.
1712 		 *    Remove TS_MAX_CONN with better lowerbound check.
1713 		 *        tsecr >= other original TS
1714 		 *
1715 		 * It is also important to note that the fastest
1716 		 * timestamp clock of 1ms will wrap its 32bit space in
1717 		 * 24 days.  So we just disable TS checking after 24
1718 		 * days of idle time.  We actually must use a 12d
1719 		 * connection limit until we can come up with a better
1720 		 * lowerbound to the TS echo check.
1721 		 */
1722 		struct timeval delta_ts;
1723 		int ts_fudge;
1724 
1725 
1726 		/*
1727 		 * PFTM_TS_DIFF is how many seconds of leeway to allow
1728 		 * a host's timestamp.  This can happen if the previous
1729 		 * packet got delayed in transit for much longer than
1730 		 * this packet.
1731 		 */
1732 		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
1733 			ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
1734 
1735 		/* Calculate max ticks since the last timestamp */
1736 #define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
1737 #define TS_MICROSECS	1000000		/* microseconds per second */
1738 		delta_ts = uptime;
1739 		timevalsub(&delta_ts, &src->scrub->pfss_last);
1740 		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
1741 		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
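		/*
		 * Example: 2 seconds idle with a 10 second PFTM_TS_DIFF
		 * fudge allows the timestamp to advance by up to
		 * (2 + 10) * 1100 = 13200 ticks.
		 */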
1742 
1743 		if ((src->state >= TCPS_ESTABLISHED &&
1744 		    dst->state >= TCPS_ESTABLISHED) &&
1745 		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
1746 		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
1747 		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
1748 		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
1749 			/* Bad RFC1323 implementation or an insertion attack.
1750 			 *
1751 			 * - Solaris 2.6 and 2.7 are known to send another ACK
1752 			 *   after the FIN,FIN|ACK,ACK closing that carries
1753 			 *   an old timestamp.
1754 			 */
1755 
1756 			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
1757 			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
1758 			    SEQ_GT(tsval, src->scrub->pfss_tsval +
1759 			    tsval_from_last) ? '1' : ' ',
1760 			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
1761 			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
1762 			DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
1763 			    "idle: %jus %lums\n",
1764 			    tsval, tsecr, tsval_from_last,
1765 			    (uintmax_t)delta_ts.tv_sec,
1766 			    delta_ts.tv_usec / 1000));
1767 			DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
1768 			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
1769 			DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
1770 			    "\n", dst->scrub->pfss_tsval,
1771 			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
1772 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
1773 				pf_print_state(state);
1774 				pf_print_flags(th->th_flags);
1775 				printf("\n");
1776 			}
1777 			REASON_SET(reason, PFRES_TS);
1778 			return (PF_DROP);
1779 		}
1780 
1781 		/* XXX I'd really like to require tsecr but it's optional */
1782 
1783 	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
1784 	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
1785 	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
1786 	    src->scrub && dst->scrub &&
1787 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
1788 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
1789 		/* Didn't send a timestamp.  Timestamps aren't really useful
1790 		 * when:
1791 		 *  - the connection is opening or closing (often not even
1792 		 *    sent), but we must not let an attacker put a FIN on
1793 		 *    a data packet to sneak it through our ESTABLISHED check.
1794 		 *  - on a TCP reset.  RFC suggests not even looking at TS.
1795 		 *  - on an empty ACK.  The TS will not be echoed so it will
1796 		 *    probably not help keep the RTT calculation in sync and
1797 		 *    there isn't as much danger when the sequence numbers
1798 		 *    have wrapped.  So some stacks don't include TS on empty
1799 		 *    ACKs :-(
1800 		 *
1801 		 * To minimize the disruption to mostly RFC1323 conformant
1802 		 * stacks, we will only require timestamps on data packets.
1803 		 *
1804 		 * And what do ya know, we cannot require timestamps on data
1805 		 * packets.  There appear to be devices that do legitimate
1806 		 * TCP connection hijacking.  There are HTTP devices that allow
1807 		 * a 3whs (with timestamps) and then buffer the HTTP request.
1808 		 * If the intermediate device has the HTTP response cached, it
1809 		 * will spoof the response but not bother timestamping its
1810 		 * packets.  So we can look for the presence of a timestamp in
1811 		 * the first data packet and if there, require it in all future
1812 		 * packets.
1813 		 */
1814 
1815 		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
1816 			/*
1817 			 * Hey!  Someone tried to sneak a packet in.  Or the
1818 			 * stack changed its RFC1323 behavior?!?!
1819 			 */
1820 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
1821 				DPFPRINTF(("Did not receive expected RFC1323 "
1822 				    "timestamp\n"));
1823 				pf_print_state(state);
1824 				pf_print_flags(th->th_flags);
1825 				printf("\n");
1826 			}
1827 			REASON_SET(reason, PFRES_TS);
1828 			return (PF_DROP);
1829 		}
1830 	}
1831 
1832 
1833 	/*
1834 	 * We will note whether a host sends its data packets with or
1835 	 * without timestamps, and require all data packets to contain a
1836 	 * timestamp if the first one does.  PAWS implicitly requires that
1837 	 * all data packets be timestamped.  But I think there are middle-man
1838 	 * devices that hijack TCP streams immediately after the 3whs and
1839 	 * don't timestamp their packets (seen in a WWW accelerator or cache).
1840 	 */
1841 	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
1842 	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
1843 		if (got_ts)
1844 			src->scrub->pfss_flags |= PFSS_DATA_TS;
1845 		else {
1846 			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
1847 			if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
1848 			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
1849 				/* Don't warn if other host rejected RFC1323 */
1850 				DPFPRINTF(("Broken RFC1323 stack did not "
1851 				    "timestamp data packet. Disabled PAWS "
1852 				    "security.\n"));
1853 				pf_print_state(state);
1854 				pf_print_flags(th->th_flags);
1855 				printf("\n");
1856 			}
1857 		}
1858 	}
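	/*
	 * Illustrative trace of the heuristic above (hypothetical): if the
	 * first data segment from a host carries a timestamp, PFSS_DATA_TS
	 * is latched and the check earlier in this function will drop any
	 * later data segment that omits one; if the first data segment
	 * lacks a timestamp, PFSS_DATA_NOTS is latched and timestamps are
	 * no longer expected on data from that host.
	 */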
1859 
1860 
1861 	/*
1862 	 * Update PAWS values
1863 	 */
1864 	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
1865 	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
1866 		getmicrouptime(&src->scrub->pfss_last);
1867 		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
1868 		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1869 			src->scrub->pfss_tsval = tsval;
1870 
1871 		if (tsecr) {
1872 			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
1873 			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
1874 				src->scrub->pfss_tsecr = tsecr;
1875 
1876 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
1877 			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
1878 			    src->scrub->pfss_tsval0 == 0)) {
1879 				/* tsval0 MUST be the lowest timestamp */
1880 				src->scrub->pfss_tsval0 = tsval;
1881 			}
1882 
1883 			/* Only fully initialized after a TS gets echoed */
1884 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
1885 				src->scrub->pfss_flags |= PFSS_PAWS;
1886 		}
1887 	}
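	/*
	 * Note on the update sequence above: pfss_tsval tracks the largest
	 * tsval seen (updated unconditionally until PFSS_PAWS is set),
	 * pfss_tsval0 records the lowest tsval observed before PAWS is
	 * armed, and PFSS_PAWS itself is only set once a nonzero tsecr has
	 * been seen, i.e. once one of our timestamps has been echoed.
	 */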
1888 
1889 	/* I have a dream....  TCP segment reassembly.... */
1890 	return (0);
1891 }
1892 
1893 static int
1894 pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
1895     int off, sa_family_t af)
1896 {
1897 	u_int16_t	*mss;
1898 	int		 thoff;
1899 	int		 opt, cnt, optlen = 0;
1900 	int		 rewrite = 0;
1901 	u_char		 opts[TCP_MAXOLEN];
1902 	u_char		*optp = opts;
1903 
1904 	thoff = th->th_off << 2;
1905 	cnt = thoff - sizeof(struct tcphdr);
1906 
1907 	if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
1908 	    NULL, NULL, af))
1909 		return (rewrite);
1910 
1911 	for (; cnt > 0; cnt -= optlen, optp += optlen) {
1912 		opt = optp[0];
1913 		if (opt == TCPOPT_EOL)
1914 			break;
1915 		if (opt == TCPOPT_NOP)
1916 			optlen = 1;
1917 		else {
1918 			if (cnt < 2)
1919 				break;
1920 			optlen = optp[1];
1921 			if (optlen < 2 || optlen > cnt)
1922 				break;
1923 		}
1924 		switch (opt) {
1925 		case TCPOPT_MAXSEG:
1926 			mss = (u_int16_t *)(optp + 2);
1927 			if ((ntohs(*mss)) > r->max_mss) {
1928 				th->th_sum = pf_cksum_fixup(th->th_sum,
1929 				    *mss, htons(r->max_mss), 0);
1930 				*mss = htons(r->max_mss);
1931 				rewrite = 1;
1932 			}
1933 			break;
1934 		default:
1935 			break;
1936 		}
1937 	}
1938 
1939 	if (rewrite)
1940 		m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
1941 
1942 	return (rewrite);
1943 }
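/*
 * Illustrative sketch (not part of the build): the option walk above can
 * be exercised on a bare options buffer without the mbuf and checksum
 * plumbing.  The hypothetical helper below mirrors the loop in
 * pf_normalize_tcpopt(), but additionally checks optlen against
 * TCPOLEN_MAXSEG and uses bcopy() to avoid unaligned 16-bit loads:
 *
 *	static int
 *	clamp_mss(u_char *optp, int cnt, u_int16_t max_mss)
 *	{
 *		u_int16_t mss;
 *		int opt, optlen, rewrite = 0;
 *
 *		for (; cnt > 0; cnt -= optlen, optp += optlen) {
 *			opt = optp[0];
 *			if (opt == TCPOPT_EOL)
 *				break;
 *			if (opt == TCPOPT_NOP)
 *				optlen = 1;
 *			else {
 *				if (cnt < 2)
 *					break;
 *				optlen = optp[1];
 *				if (optlen < 2 || optlen > cnt)
 *					break;
 *			}
 *			if (opt == TCPOPT_MAXSEG && optlen == TCPOLEN_MAXSEG) {
 *				bcopy(optp + 2, &mss, sizeof(mss));
 *				if (ntohs(mss) > max_mss) {
 *					mss = htons(max_mss);
 *					bcopy(&mss, optp + 2, sizeof(mss));
 *					rewrite = 1;
 *				}
 *			}
 *		}
 *		return (rewrite);
 *	}
 */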
1944 
1945 #ifdef INET
1946 static void
1947 pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
1948 {
1949 	struct mbuf		*m = *m0;
1950 	struct ip		*h = mtod(m, struct ip *);
1951 
1952 	/* Clear IP_DF if no-df was requested */
1953 	if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
1954 		u_int16_t ip_off = h->ip_off;
1955 
1956 		h->ip_off &= htons(~IP_DF);
1957 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
1958 	}
1959 
1960 	/* Enforce a minimum ttl; this may cause endless packet loops */
1961 	if (min_ttl && h->ip_ttl < min_ttl) {
1962 		u_int16_t ip_ttl = h->ip_ttl;
1963 
1964 		h->ip_ttl = min_ttl;
1965 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1966 	}
1967 
1968 	/* Enforce tos */
1969 	if (flags & PFRULE_SET_TOS) {
1970 		u_int16_t	ov, nv;
1971 
1972 		ov = *(u_int16_t *)h;
1973 		h->ip_tos = tos;
1974 		nv = *(u_int16_t *)h;
1975 
1976 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
1977 	}
1978 
1979 	/* random-id, but not for fragments */
1980 	if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
1981 		u_int16_t ip_id = h->ip_id;
1982 
1983 		h->ip_id = ip_randomid();
1984 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
1985 	}
1986 }
1987 #endif /* INET */
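/*
 * The pf_cksum_fixup() calls in pf_scrub_ip() rely on incremental
 * (RFC 1624 style) checksum updates: when a single 16-bit word of the
 * header changes from 'old' to 'new', the ones-complement sum is patched
 * without rescanning the header.  A minimal sketch of the idea, as a
 * hypothetical standalone helper:
 *
 *	static u_int16_t
 *	cksum_adjust(u_int16_t cksum, u_int16_t old, u_int16_t new)
 *	{
 *		u_int32_t l;
 *
 *		l = cksum + old - new;		// add back old, subtract new
 *		l = (l >> 16) + (l & 0xffff);	// fold the carry
 *		return (l & 0xffff);
 *	}
 *
 * For example, when clearing IP_DF changes the ip_off word from 0x4000
 * to 0x0000 and the stored checksum is 0xb1e6, the patched checksum is
 * 0xb1e6 + 0x4000 - 0x0000 = 0xf1e6 (no carry to fold in this case).
 */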
1988 
1989 #ifdef INET6
1990 static void
1991 pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
1992 {
1993 	struct mbuf		*m = *m0;
1994 	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
1995 
1996 	/* Enforce a minimum hop limit; this may cause endless packet loops */
1997 	if (min_ttl && h->ip6_hlim < min_ttl)
1998 		h->ip6_hlim = min_ttl;
1999 }
2000 #endif
2001