xref: /titanic_52/usr/src/uts/common/inet/ipf/ip_state.c (revision e1dfad11282f1a85298f6361995ffb5d098c2630)
1 /*
2  * Copyright (C) 1995-2003 by Darren Reed.
3  *
4  * See the IPFILTER.LICENCE file for details on licencing.
5  *
6  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
7  * Use is subject to license terms.
8  */
9 
10 #pragma ident	"%Z%%M%	%I%	%E% SMI"
11 
12 #if defined(KERNEL) || defined(_KERNEL)
13 # undef KERNEL
14 # undef _KERNEL
15 # define        KERNEL	1
16 # define        _KERNEL	1
17 #endif
18 #include <sys/errno.h>
19 #include <sys/types.h>
20 #include <sys/param.h>
21 #include <sys/file.h>
22 #if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
23     defined(_KERNEL)
24 # include "opt_ipfilter_log.h"
25 #endif
26 #if defined(_KERNEL) && defined(__FreeBSD_version) && \
27     (__FreeBSD_version >= 400000) && !defined(KLD_MODULE)
28 #include "opt_inet6.h"
29 #endif
30 #if !defined(_KERNEL) && !defined(__KERNEL__)
31 # include <stdio.h>
32 # include <stdlib.h>
33 # include <string.h>
34 # define _KERNEL
35 # ifdef __OpenBSD__
36 struct file;
37 # endif
38 # include <sys/uio.h>
39 # undef _KERNEL
40 #endif
41 #if defined(_KERNEL) && (__FreeBSD_version >= 220000)
42 # include <sys/filio.h>
43 # include <sys/fcntl.h>
44 # if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
45 #  include "opt_ipfilter.h"
46 # endif
47 #else
48 # include <sys/ioctl.h>
49 #endif
50 #include <sys/time.h>
51 #if !defined(linux)
52 # include <sys/protosw.h>
53 #endif
54 #include <sys/socket.h>
55 #if defined(_KERNEL)
56 # include <sys/systm.h>
57 # if !defined(__SVR4) && !defined(__svr4__)
58 #  include <sys/mbuf.h>
59 # endif
60 #endif
61 #if defined(__SVR4) || defined(__svr4__)
62 # include <sys/filio.h>
63 # include <sys/byteorder.h>
64 # ifdef _KERNEL
65 #  include <sys/dditypes.h>
66 # endif
67 # include <sys/stream.h>
68 # include <sys/kmem.h>
69 #endif
70 
71 #include <net/if.h>
72 #ifdef sun
73 # include <net/af.h>
74 #endif
75 #include <net/route.h>
76 #include <netinet/in.h>
77 #include <netinet/in_systm.h>
78 #include <netinet/ip.h>
79 #include <netinet/tcp.h>
80 #if !defined(linux)
81 # include <netinet/ip_var.h>
82 #endif
83 #if !defined(__hpux) && !defined(linux)
84 # include <netinet/tcp_fsm.h>
85 #endif
86 #include <netinet/udp.h>
87 #include <netinet/ip_icmp.h>
88 #include "netinet/ip_compat.h"
89 #include <netinet/tcpip.h>
90 #include "netinet/ip_fil.h"
91 #include "netinet/ip_nat.h"
92 #include "netinet/ip_frag.h"
93 #include "netinet/ip_state.h"
94 #include "netinet/ip_proxy.h"
95 #include "netinet/ipf_stack.h"
96 #ifdef	IPFILTER_SYNC
97 #include "netinet/ip_sync.h"
98 #endif
99 #ifdef	IPFILTER_SCAN
100 #include "netinet/ip_scan.h"
101 #endif
102 #ifdef	USE_INET6
103 #include <netinet/icmp6.h>
104 #endif
105 #if (__FreeBSD_version >= 300000)
106 # include <sys/malloc.h>
107 # if defined(_KERNEL) && !defined(IPFILTER_LKM)
108 #  include <sys/libkern.h>
109 #  include <sys/systm.h>
110 # endif
111 #endif
112 /* END OF INCLUDES */
113 
114 
115 #if !defined(lint)
116 static const char sccsid[] = "@(#)ip_state.c	1.8 6/5/96 (C) 1993-2000 Darren Reed";
117 static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.186.2.36 2005/08/11 19:58:03 darrenr Exp $";
118 #endif
119 
120 #ifdef	USE_INET6
121 static ipstate_t *fr_checkicmp6matchingstate __P((fr_info_t *));
122 #endif
123 static ipstate_t *fr_matchsrcdst __P((fr_info_t *, ipstate_t *, i6addr_t *,
124 				      i6addr_t *, tcphdr_t *, u_32_t));
125 static ipstate_t *fr_checkicmpmatchingstate __P((fr_info_t *));
126 static int fr_state_flush __P((int, int, ipf_stack_t *));
127 static ips_stat_t *fr_statetstats __P((ipf_stack_t *));
128 static void fr_delstate __P((ipstate_t *, int, ipf_stack_t *));
129 static int fr_state_remove __P((caddr_t, ipf_stack_t *));
130 static void fr_ipsmove __P((ipstate_t *, u_int, ipf_stack_t *));
131 static int fr_tcpstate __P((fr_info_t *, tcphdr_t *, ipstate_t *));
132 static int fr_tcpoptions __P((fr_info_t *, tcphdr_t *, tcpdata_t *));
133 static ipstate_t *fr_stclone __P((fr_info_t *, tcphdr_t *, ipstate_t *));
134 static void fr_fixinisn __P((fr_info_t *, ipstate_t *));
135 static void fr_fixoutisn __P((fr_info_t *, ipstate_t *));
136 static void fr_checknewisn __P((fr_info_t *, ipstate_t *));
137 static int fr_stateiter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
138 
139 int fr_stputent __P((caddr_t, ipf_stack_t *));
140 int fr_stgetent __P((caddr_t, ipf_stack_t *));
141 
#define	ONE_DAY		IPF_TTLVAL(1 * 86400)	/* 1 day */
#define	FIVE_DAYS	(5 * ONE_DAY)
/*
 * Second-level hash: perturb x with the per-stack seed selected by x and
 * reduce the result to a valid index into the state table.  Both macro
 * arguments are parenthesized so that any expression (e.g. "&stack") may be
 * passed for ifs.  Note that x is evaluated twice.
 */
#define	DOUBLE_HASH(x, ifs)	\
    (((x) + (ifs)->ifs_ips_seed[(x) % (ifs)->ifs_fr_statesize]) % \
     (ifs)->ifs_fr_statesize)
146 
147 
148 
149 /* ------------------------------------------------------------------------ */
150 /* Function:    fr_stateinit                                                */
151 /* Returns:     int - 0 == success, -1 == failure                           */
152 /* Parameters:  Nil                                                         */
153 /*                                                                          */
154 /* Initialise all the global variables used within the state code.          */
155 /* This action also includes initiailising locks.                           */
156 /* ------------------------------------------------------------------------ */
157 int fr_stateinit(ifs)
158 ipf_stack_t *ifs;
159 {
160 	int i;
161 
162 	KMALLOCS(ifs->ifs_ips_table, ipstate_t **,
163 		 ifs->ifs_fr_statesize * sizeof(ipstate_t *));
164 	if (ifs->ifs_ips_table == NULL)
165 		return -1;
166 	bzero((char *)ifs->ifs_ips_table,
167 	      ifs->ifs_fr_statesize * sizeof(ipstate_t *));
168 
169 	KMALLOCS(ifs->ifs_ips_seed, u_long *,
170 		 ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_seed));
171 	if (ifs->ifs_ips_seed == NULL)
172 		return -2;
173 	for (i = 0; i < ifs->ifs_fr_statesize; i++) {
174 		/*
175 		 * XXX - ips_seed[X] should be a random number of sorts.
176 		 */
177 #if  (__FreeBSD_version >= 400000)
178 		ifs->ifs_ips_seed[i] = arc4random();
179 #else
180 		ifs->ifs_ips_seed[i] = ((u_long)ifs->ifs_ips_seed + i) *
181 		    ifs->ifs_fr_statesize;
182 		ifs->ifs_ips_seed[i] ^= 0xa5a55a5a;
183 		ifs->ifs_ips_seed[i] *= (u_long)ifs->ifs_ips_seed;
184 		ifs->ifs_ips_seed[i] ^= 0x5a5aa5a5;
185 		ifs->ifs_ips_seed[i] *= ifs->ifs_fr_statemax;
186 #endif
187 	}
188 
189 	/* fill icmp reply type table */
190 	for (i = 0; i <= ICMP_MAXTYPE; i++)
191 		icmpreplytype4[i] = -1;
192 	icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY;
193 	icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY;
194 	icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY;
195 	icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY;
196 #ifdef	USE_INET6
197 	/* fill icmp reply type table */
198 	for (i = 0; i <= ICMP6_MAXTYPE; i++)
199 		icmpreplytype6[i] = -1;
200 	icmpreplytype6[ICMP6_ECHO_REQUEST] = ICMP6_ECHO_REPLY;
201 	icmpreplytype6[ICMP6_MEMBERSHIP_QUERY] = ICMP6_MEMBERSHIP_REPORT;
202 	icmpreplytype6[ICMP6_NI_QUERY] = ICMP6_NI_REPLY;
203 	icmpreplytype6[ND_ROUTER_SOLICIT] = ND_ROUTER_ADVERT;
204 	icmpreplytype6[ND_NEIGHBOR_SOLICIT] = ND_NEIGHBOR_ADVERT;
205 #endif
206 
207 	KMALLOCS(ifs->ifs_ips_stats.iss_bucketlen, u_long *,
208 		 ifs->ifs_fr_statesize * sizeof(u_long));
209 	if (ifs->ifs_ips_stats.iss_bucketlen == NULL)
210 		return -1;
211 	bzero((char *)ifs->ifs_ips_stats.iss_bucketlen,
212 	      ifs->ifs_fr_statesize * sizeof(u_long));
213 
214 	if (ifs->ifs_fr_state_maxbucket == 0) {
215 		for (i = ifs->ifs_fr_statesize; i > 0; i >>= 1)
216 			ifs->ifs_fr_state_maxbucket++;
217 		ifs->ifs_fr_state_maxbucket *= 2;
218 	}
219 
220 	fr_sttab_init(ifs->ifs_ips_tqtqb, ifs);
221 	ifs->ifs_ips_tqtqb[IPF_TCP_NSTATES - 1].ifq_next = &ifs->ifs_ips_udptq;
222 	ifs->ifs_ips_udptq.ifq_ttl = (u_long)ifs->ifs_fr_udptimeout;
223 	ifs->ifs_ips_udptq.ifq_ref = 1;
224 	ifs->ifs_ips_udptq.ifq_head = NULL;
225 	ifs->ifs_ips_udptq.ifq_tail = &ifs->ifs_ips_udptq.ifq_head;
226 	MUTEX_INIT(&ifs->ifs_ips_udptq.ifq_lock, "ipftq udp tab");
227 	ifs->ifs_ips_udptq.ifq_next = &ifs->ifs_ips_udpacktq;
228 	ifs->ifs_ips_udpacktq.ifq_ttl = (u_long)ifs->ifs_fr_udpacktimeout;
229 	ifs->ifs_ips_udpacktq.ifq_ref = 1;
230 	ifs->ifs_ips_udpacktq.ifq_head = NULL;
231 	ifs->ifs_ips_udpacktq.ifq_tail = &ifs->ifs_ips_udpacktq.ifq_head;
232 	MUTEX_INIT(&ifs->ifs_ips_udpacktq.ifq_lock, "ipftq udpack tab");
233 	ifs->ifs_ips_udpacktq.ifq_next = &ifs->ifs_ips_icmptq;
234 	ifs->ifs_ips_icmptq.ifq_ttl = (u_long)ifs->ifs_fr_icmptimeout;
235 	ifs->ifs_ips_icmptq.ifq_ref = 1;
236 	ifs->ifs_ips_icmptq.ifq_head = NULL;
237 	ifs->ifs_ips_icmptq.ifq_tail = &ifs->ifs_ips_icmptq.ifq_head;
238 	MUTEX_INIT(&ifs->ifs_ips_icmptq.ifq_lock, "ipftq icmp tab");
239 	ifs->ifs_ips_icmptq.ifq_next = &ifs->ifs_ips_icmpacktq;
240 	ifs->ifs_ips_icmpacktq.ifq_ttl = (u_long)ifs->ifs_fr_icmpacktimeout;
241 	ifs->ifs_ips_icmpacktq.ifq_ref = 1;
242 	ifs->ifs_ips_icmpacktq.ifq_head = NULL;
243 	ifs->ifs_ips_icmpacktq.ifq_tail = &ifs->ifs_ips_icmpacktq.ifq_head;
244 	MUTEX_INIT(&ifs->ifs_ips_icmpacktq.ifq_lock, "ipftq icmpack tab");
245 	ifs->ifs_ips_icmpacktq.ifq_next = &ifs->ifs_ips_iptq;
246 	ifs->ifs_ips_iptq.ifq_ttl = (u_long)ifs->ifs_fr_iptimeout;
247 	ifs->ifs_ips_iptq.ifq_ref = 1;
248 	ifs->ifs_ips_iptq.ifq_head = NULL;
249 	ifs->ifs_ips_iptq.ifq_tail = &ifs->ifs_ips_iptq.ifq_head;
250 	MUTEX_INIT(&ifs->ifs_ips_iptq.ifq_lock, "ipftq ip tab");
251 	ifs->ifs_ips_iptq.ifq_next = &ifs->ifs_ips_deletetq;
252 	/* entry's ttl in deletetq is just 1 tick */
253 	ifs->ifs_ips_deletetq.ifq_ttl = (u_long) 1;
254 	ifs->ifs_ips_deletetq.ifq_ref = 1;
255 	ifs->ifs_ips_deletetq.ifq_head = NULL;
256 	ifs->ifs_ips_deletetq.ifq_tail = &ifs->ifs_ips_deletetq.ifq_head;
257 	MUTEX_INIT(&ifs->ifs_ips_deletetq.ifq_lock, "state delete queue");
258 	ifs->ifs_ips_deletetq.ifq_next = NULL;
259 
260 	RWLOCK_INIT(&ifs->ifs_ipf_state, "ipf IP state rwlock");
261 	MUTEX_INIT(&ifs->ifs_ipf_stinsert, "ipf state insert mutex");
262 	ifs->ifs_fr_state_init = 1;
263 
264 	ifs->ifs_ips_last_force_flush = ifs->ifs_fr_ticks;
265 	return 0;
266 }
267 
268 
269 /* ------------------------------------------------------------------------ */
270 /* Function:    fr_stateunload                                              */
271 /* Returns:     Nil                                                         */
272 /* Parameters:  Nil                                                         */
273 /*                                                                          */
274 /* Release and destroy any resources acquired or initialised so that        */
275 /* IPFilter can be unloaded or re-initialised.                              */
276 /* ------------------------------------------------------------------------ */
277 void fr_stateunload(ifs)
278 ipf_stack_t *ifs;
279 {
280 	ipftq_t *ifq, *ifqnext;
281 	ipstate_t *is;
282 
283 	while ((is = ifs->ifs_ips_list) != NULL)
284 	    fr_delstate(is, 0, ifs);
285 
286 	/*
287 	 * Proxy timeout queues are not cleaned here because although they
288 	 * exist on the state list, appr_unload is called after fr_stateunload
289 	 * and the proxies actually are responsible for them being created.
290 	 * Should the proxy timeouts have their own list?  There's no real
291 	 * justification as this is the only complicationA
292 	 */
293 	for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
294 		ifqnext = ifq->ifq_next;
295 		if (((ifq->ifq_flags & IFQF_PROXY) == 0) &&
296 		    (fr_deletetimeoutqueue(ifq) == 0))
297 			fr_freetimeoutqueue(ifq, ifs);
298 	}
299 
300 	ifs->ifs_ips_stats.iss_inuse = 0;
301 	ifs->ifs_ips_num = 0;
302 
303 	if (ifs->ifs_fr_state_init == 1) {
304 		fr_sttab_destroy(ifs->ifs_ips_tqtqb);
305 		MUTEX_DESTROY(&ifs->ifs_ips_udptq.ifq_lock);
306 		MUTEX_DESTROY(&ifs->ifs_ips_icmptq.ifq_lock);
307 		MUTEX_DESTROY(&ifs->ifs_ips_udpacktq.ifq_lock);
308 		MUTEX_DESTROY(&ifs->ifs_ips_icmpacktq.ifq_lock);
309 		MUTEX_DESTROY(&ifs->ifs_ips_iptq.ifq_lock);
310 		MUTEX_DESTROY(&ifs->ifs_ips_deletetq.ifq_lock);
311 	}
312 
313 	if (ifs->ifs_ips_table != NULL) {
314 		KFREES(ifs->ifs_ips_table,
315 		       ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_table));
316 		ifs->ifs_ips_table = NULL;
317 	}
318 
319 	if (ifs->ifs_ips_seed != NULL) {
320 		KFREES(ifs->ifs_ips_seed,
321 		       ifs->ifs_fr_statesize * sizeof(*ifs->ifs_ips_seed));
322 		ifs->ifs_ips_seed = NULL;
323 	}
324 
325 	if (ifs->ifs_ips_stats.iss_bucketlen != NULL) {
326 		KFREES(ifs->ifs_ips_stats.iss_bucketlen,
327 		       ifs->ifs_fr_statesize * sizeof(u_long));
328 		ifs->ifs_ips_stats.iss_bucketlen = NULL;
329 	}
330 
331 	if (ifs->ifs_fr_state_maxbucket_reset == 1)
332 		ifs->ifs_fr_state_maxbucket = 0;
333 
334 	if (ifs->ifs_fr_state_init == 1) {
335 		ifs->ifs_fr_state_init = 0;
336 		RW_DESTROY(&ifs->ifs_ipf_state);
337 		MUTEX_DESTROY(&ifs->ifs_ipf_stinsert);
338 	}
339 }
340 
341 
342 /* ------------------------------------------------------------------------ */
343 /* Function:    fr_statetstats                                              */
344 /* Returns:     ips_state_t* - pointer to state stats structure             */
345 /* Parameters:  Nil                                                         */
346 /*                                                                          */
347 /* Put all the current numbers and pointers into a single struct and return */
348 /* a pointer to it.                                                         */
349 /* ------------------------------------------------------------------------ */
350 static ips_stat_t *fr_statetstats(ifs)
351 ipf_stack_t *ifs;
352 {
353 	ifs->ifs_ips_stats.iss_active = ifs->ifs_ips_num;
354 	ifs->ifs_ips_stats.iss_statesize = ifs->ifs_fr_statesize;
355 	ifs->ifs_ips_stats.iss_statemax = ifs->ifs_fr_statemax;
356 	ifs->ifs_ips_stats.iss_table = ifs->ifs_ips_table;
357 	ifs->ifs_ips_stats.iss_list = ifs->ifs_ips_list;
358 	ifs->ifs_ips_stats.iss_ticks = ifs->ifs_fr_ticks;
359 	return &ifs->ifs_ips_stats;
360 }
361 
362 /* ------------------------------------------------------------------------ */
363 /* Function:    fr_state_remove                                             */
364 /* Returns:     int - 0 == success, != 0 == failure                         */
365 /* Parameters:  data(I) - pointer to state structure to delete from table   */
366 /*                                                                          */
367 /* Search for a state structure that matches the one passed, according to   */
368 /* the IP addresses and other protocol specific information.                */
369 /* ------------------------------------------------------------------------ */
370 static int fr_state_remove(data, ifs)
371 caddr_t data;
372 ipf_stack_t *ifs;
373 {
374 	ipstate_t *sp, st;
375 	int error;
376 
377 	sp = &st;
378 	error = fr_inobj(data, &st, IPFOBJ_IPSTATE);
379 	if (error)
380 		return EFAULT;
381 
382 	WRITE_ENTER(&ifs->ifs_ipf_state);
383 	for (sp = ifs->ifs_ips_list; sp; sp = sp->is_next)
384 		if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) &&
385 		    !bcmp((caddr_t)&sp->is_src, (caddr_t)&st.is_src,
386 			  sizeof(st.is_src)) &&
387 		    !bcmp((caddr_t)&sp->is_dst, (caddr_t)&st.is_src,
388 			  sizeof(st.is_dst)) &&
389 		    !bcmp((caddr_t)&sp->is_ps, (caddr_t)&st.is_ps,
390 			  sizeof(st.is_ps))) {
391 			fr_delstate(sp, ISL_REMOVE, ifs);
392 			RWLOCK_EXIT(&ifs->ifs_ipf_state);
393 			return 0;
394 		}
395 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
396 	return ESRCH;
397 }
398 
399 
400 /* ------------------------------------------------------------------------ */
401 /* Function:    fr_state_ioctl                                              */
402 /* Returns:     int - 0 == success, != 0 == failure                         */
403 /* Parameters:  data(I) - pointer to ioctl data                             */
404 /*              cmd(I)  - ioctl command integer                             */
405 /*              mode(I) - file mode bits used with open                     */
406 /*                                                                          */
407 /* Processes an ioctl call made to operate on the IP Filter state device.   */
408 /* ------------------------------------------------------------------------ */
409 int fr_state_ioctl(data, cmd, mode, uid, ctx, ifs)
410 caddr_t data;
411 ioctlcmd_t cmd;
412 int mode, uid;
413 void *ctx;
414 ipf_stack_t *ifs;
415 {
416 	int arg, ret, error = 0;
417 
418 	switch (cmd)
419 	{
420 	/*
421 	 * Delete an entry from the state table.
422 	 */
423 	case SIOCDELST :
424 	    error = fr_state_remove(data, ifs);
425 		break;
426 	/*
427 	 * Flush the state table
428 	 */
429 	case SIOCIPFFL :
430 		BCOPYIN(data, (char *)&arg, sizeof(arg));
431 		if (arg == 0 || arg == 1) {
432 			WRITE_ENTER(&ifs->ifs_ipf_state);
433 			ret = fr_state_flush(arg, 4, ifs);
434 			RWLOCK_EXIT(&ifs->ifs_ipf_state);
435 			BCOPYOUT((char *)&ret, data, sizeof(ret));
436 		} else
437 			error = EINVAL;
438 		break;
439 #ifdef	USE_INET6
440 	case SIOCIPFL6 :
441 		BCOPYIN(data, (char *)&arg, sizeof(arg));
442 		if (arg == 0 || arg == 1) {
443 			WRITE_ENTER(&ifs->ifs_ipf_state);
444 			ret = fr_state_flush(arg, 6, ifs);
445 			RWLOCK_EXIT(&ifs->ifs_ipf_state);
446 			BCOPYOUT((char *)&ret, data, sizeof(ret));
447 		} else
448 			error = EINVAL;
449 		break;
450 #endif
451 #ifdef	IPFILTER_LOG
452 	/*
453 	 * Flush the state log.
454 	 */
455 	case SIOCIPFFB :
456 		if (!(mode & FWRITE))
457 			error = EPERM;
458 		else {
459 			int tmp;
460 
461 			tmp = ipflog_clear(IPL_LOGSTATE, ifs);
462 			BCOPYOUT((char *)&tmp, data, sizeof(tmp));
463 		}
464 		break;
465 	/*
466 	 * Turn logging of state information on/off.
467 	 */
468 	case SIOCSETLG :
469 		if (!(mode & FWRITE))
470 			error = EPERM;
471 		else {
472 			BCOPYIN((char *)data,
473 				       (char *)&ifs->ifs_ipstate_logging,
474 				       sizeof(ifs->ifs_ipstate_logging));
475 		}
476 		break;
477 	/*
478 	 * Return the current state of logging.
479 	 */
480 	case SIOCGETLG :
481 		BCOPYOUT((char *)&ifs->ifs_ipstate_logging, (char *)data,
482 			sizeof(ifs->ifs_ipstate_logging));
483 		break;
484 	/*
485 	 * Return the number of bytes currently waiting to be read.
486 	 */
487 	case FIONREAD :
488 		arg = ifs->ifs_iplused[IPL_LOGSTATE]; /* returned in an int */
489 		BCOPYOUT((char *)&arg, data, sizeof(arg));
490 		break;
491 #endif
492 	/*
493 	 * Get the current state statistics.
494 	 */
495 	case SIOCGETFS :
496 		error = fr_outobj(data, fr_statetstats(ifs), IPFOBJ_STATESTAT);
497 		break;
498 	/*
499 	 * Lock/Unlock the state table.  (Locking prevents any changes, which
500 	 * means no packets match).
501 	 */
502 	case SIOCSTLCK :
503 		if (!(mode & FWRITE)) {
504 			error = EPERM;
505 		} else {
506 			fr_lock(data, &ifs->ifs_fr_state_lock);
507 		}
508 		break;
509 	/*
510 	 * Add an entry to the current state table.
511 	 */
512 	case SIOCSTPUT :
513 		if (!ifs->ifs_fr_state_lock || !(mode &FWRITE)) {
514 			error = EACCES;
515 			break;
516 		}
517 		error = fr_stputent(data, ifs);
518 		break;
519 	/*
520 	 * Get a state table entry.
521 	 */
522 	case SIOCSTGET :
523 		if (!ifs->ifs_fr_state_lock) {
524 			error = EACCES;
525 			break;
526 		}
527 		error = fr_stgetent(data, ifs);
528 		break;
529 
530 	case SIOCGENITER :
531 	    {
532 		ipftoken_t *token;
533 		ipfgeniter_t iter;
534 
535 		error = fr_inobj(data, &iter, IPFOBJ_GENITER);
536 		if (error != 0)
537 			break;
538 
539 		token = ipf_findtoken(IPFGENITER_STATE, uid, ctx, ifs);
540 		if (token != NULL)
541 			error = fr_stateiter(token, &iter, ifs);
542 		else
543 			error = ESRCH;
544 		RWLOCK_EXIT(&ifs->ifs_ipf_tokens);
545 		break;
546 	    }
547 
548 	case SIOCIPFDELTOK :
549 		(void) BCOPYIN(data, (char *)&arg, sizeof(arg));
550 		error = ipf_deltoken(arg, uid, ctx, ifs);
551 		break;
552 
553 	default :
554 		error = EINVAL;
555 		break;
556 	}
557 	return error;
558 }
559 
560 
561 /* ------------------------------------------------------------------------ */
562 /* Function:    fr_stgetent                                                 */
563 /* Returns:     int - 0 == success, != 0 == failure                         */
564 /* Parameters:  data(I) - pointer to state structure to retrieve from table */
565 /*                                                                          */
566 /* Copy out state information from the kernel to a user space process.  If  */
567 /* there is a filter rule associated with the state entry, copy that out    */
568 /* as well.  The entry to copy out is taken from the value of "ips_next" in */
569 /* the struct passed in and if not null and not found in the list of current*/
570 /* state entries, the retrieval fails.                                      */
571 /* ------------------------------------------------------------------------ */
572 int fr_stgetent(data, ifs)
573 caddr_t data;
574 ipf_stack_t *ifs;
575 {
576 	ipstate_t *is, *isn;
577 	ipstate_save_t ips;
578 	int error;
579 
580 	error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
581 	if (error)
582 		return EFAULT;
583 
584 	isn = ips.ips_next;
585 	if (isn == NULL) {
586 		isn = ifs->ifs_ips_list;
587 		if (isn == NULL) {
588 			if (ips.ips_next == NULL)
589 				return ENOENT;
590 			return 0;
591 		}
592 	} else {
593 		/*
594 		 * Make sure the pointer we're copying from exists in the
595 		 * current list of entries.  Security precaution to prevent
596 		 * copying of random kernel data.
597 		 */
598 		for (is = ifs->ifs_ips_list; is; is = is->is_next)
599 			if (is == isn)
600 				break;
601 		if (!is)
602 			return ESRCH;
603 	}
604 	ips.ips_next = isn->is_next;
605 	bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is));
606 	ips.ips_rule = isn->is_rule;
607 	if (isn->is_rule != NULL)
608 		bcopy((char *)isn->is_rule, (char *)&ips.ips_fr,
609 		      sizeof(ips.ips_fr));
610 	error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
611 	if (error)
612 		return EFAULT;
613 	return 0;
614 }
615 
616 
617 /* ------------------------------------------------------------------------ */
618 /* Function:    fr_stputent                                                 */
619 /* Returns:     int - 0 == success, != 0 == failure                         */
620 /* Parameters:  data(I) - pointer to state information struct               */
621 /*                                                                          */
622 /* This function implements the SIOCSTPUT ioctl: insert a state entry into  */
623 /* the state table.  If the state info. includes a pointer to a filter rule */
624 /* then also add in an orphaned rule (will not show up in any "ipfstat -io" */
625 /* output.                                                                  */
626 /* ------------------------------------------------------------------------ */
627 int fr_stputent(data, ifs)
628 caddr_t data;
629 ipf_stack_t *ifs;
630 {
631 	ipstate_t *is, *isn;
632 	ipstate_save_t ips;
633 	int error, i;
634 	frentry_t *fr;
635 	char *name;
636 
637 	error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
638 	if (error)
639 		return EFAULT;
640 
641 	KMALLOC(isn, ipstate_t *);
642 	if (isn == NULL)
643 		return ENOMEM;
644 
645 	bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn));
646 	bzero((char *)isn, offsetof(struct ipstate, is_pkts));
647 	isn->is_sti.tqe_pnext = NULL;
648 	isn->is_sti.tqe_next = NULL;
649 	isn->is_sti.tqe_ifq = NULL;
650 	isn->is_sti.tqe_parent = isn;
651 	isn->is_ifp[0] = NULL;
652 	isn->is_ifp[1] = NULL;
653 	isn->is_ifp[2] = NULL;
654 	isn->is_ifp[3] = NULL;
655 	isn->is_sync = NULL;
656 	fr = ips.ips_rule;
657 
658 	if (fr == NULL) {
659 		READ_ENTER(&ifs->ifs_ipf_state);
660 		fr_stinsert(isn, 0, ifs);
661 		MUTEX_EXIT(&isn->is_lock);
662 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
663 		return 0;
664 	}
665 
666 	if (isn->is_flags & SI_NEWFR) {
667 		KMALLOC(fr, frentry_t *);
668 		if (fr == NULL) {
669 			KFREE(isn);
670 			return ENOMEM;
671 		}
672 		bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
673 		isn->is_rule = fr;
674 		ips.ips_is.is_rule = fr;
675 		MUTEX_NUKE(&fr->fr_lock);
676 		MUTEX_INIT(&fr->fr_lock, "state filter rule lock");
677 
678 		/*
679 		 * Look up all the interface names in the rule.
680 		 */
681 		for (i = 0; i < 4; i++) {
682 			name = fr->fr_ifnames[i];
683 			fr->fr_ifas[i] = fr_resolvenic(name, fr->fr_v, ifs);
684 			name = isn->is_ifname[i];
685 			isn->is_ifp[i] = fr_resolvenic(name, isn->is_v, ifs);
686 		}
687 
688 		fr->fr_ref = 0;
689 		fr->fr_dsize = 0;
690 		fr->fr_data = NULL;
691 
692 		fr_resolvedest(&fr->fr_tif, fr->fr_v, ifs);
693 		fr_resolvedest(&fr->fr_dif, fr->fr_v, ifs);
694 		fr_resolvedest(&fr->fr_rif, fr->fr_v, ifs);
695 
696 		/*
697 		 * send a copy back to userland of what we ended up
698 		 * to allow for verification.
699 		 */
700 		error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
701 		if (error) {
702 			KFREE(isn);
703 			MUTEX_DESTROY(&fr->fr_lock);
704 			KFREE(fr);
705 			return EFAULT;
706 		}
707 		READ_ENTER(&ifs->ifs_ipf_state);
708 		fr_stinsert(isn, 0, ifs);
709 		MUTEX_EXIT(&isn->is_lock);
710 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
711 
712 	} else {
713 		READ_ENTER(&ifs->ifs_ipf_state);
714 		for (is = ifs->ifs_ips_list; is; is = is->is_next)
715 			if (is->is_rule == fr) {
716 				fr_stinsert(isn, 0, ifs);
717 				MUTEX_EXIT(&isn->is_lock);
718 				break;
719 			}
720 
721 		if (is == NULL) {
722 			KFREE(isn);
723 			isn = NULL;
724 		}
725 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
726 
727 		return (isn == NULL) ? ESRCH : 0;
728 	}
729 
730 	return 0;
731 }
732 
733 
734 /* ------------------------------------------------------------------------ */
735 /* Function:   fr_stinsert                                                  */
736 /* Returns:    Nil                                                          */
737 /* Parameters: is(I)  - pointer to state structure                          */
738 /*             rev(I) - flag indicating forward/reverse direction of packet */
739 /*                                                                          */
740 /* Inserts a state structure into the hash table (for lookups) and the list */
741 /* of state entries (for enumeration).  Resolves all of the interface names */
742 /* to pointers and adjusts running stats for the hash table as appropriate. */
743 /*                                                                          */
744 /* Locking: it is assumed that some kind of lock on ipf_state is held.      */
745 /*          Exits with is_lock initialised and held.                        */
746 /* ------------------------------------------------------------------------ */
747 void fr_stinsert(is, rev, ifs)
748 ipstate_t *is;
749 int rev;
750 ipf_stack_t *ifs;
751 {
752 	frentry_t *fr;
753 	u_int hv;
754 	int i;
755 
756 	MUTEX_INIT(&is->is_lock, "ipf state entry");
757 
758 	fr = is->is_rule;
759 	if (fr != NULL) {
760 		MUTEX_ENTER(&fr->fr_lock);
761 		fr->fr_ref++;
762 		fr->fr_statecnt++;
763 		MUTEX_EXIT(&fr->fr_lock);
764 	}
765 
766 	/*
767 	 * Look up all the interface names in the state entry.
768 	 */
769 	for (i = 0; i < 4; i++) {
770 		if (is->is_ifp[i] != NULL)
771 			continue;
772 		is->is_ifp[i] = fr_resolvenic(is->is_ifname[i], is->is_v, ifs);
773 	}
774 
775 	/*
776 	 * If we could trust is_hv, then the modulous would not be needed, but
777 	 * when running with IPFILTER_SYNC, this stops bad values.
778 	 */
779 	hv = is->is_hv % ifs->ifs_fr_statesize;
780 	is->is_hv = hv;
781 
782 	/*
783 	 * We need to get both of these locks...the first because it is
784 	 * possible that once the insert is complete another packet might
785 	 * come along, match the entry and want to update it.
786 	 */
787 	MUTEX_ENTER(&is->is_lock);
788 	MUTEX_ENTER(&ifs->ifs_ipf_stinsert);
789 
790 	/*
791 	 * add into list table.
792 	 */
793 	if (ifs->ifs_ips_list != NULL)
794 		ifs->ifs_ips_list->is_pnext = &is->is_next;
795 	is->is_pnext = &ifs->ifs_ips_list;
796 	is->is_next = ifs->ifs_ips_list;
797 	ifs->ifs_ips_list = is;
798 
799 	if (ifs->ifs_ips_table[hv] != NULL)
800 		ifs->ifs_ips_table[hv]->is_phnext = &is->is_hnext;
801 	else
802 		ifs->ifs_ips_stats.iss_inuse++;
803 	is->is_phnext = ifs->ifs_ips_table + hv;
804 	is->is_hnext = ifs->ifs_ips_table[hv];
805 	ifs->ifs_ips_table[hv] = is;
806 	ifs->ifs_ips_stats.iss_bucketlen[hv]++;
807 	ifs->ifs_ips_num++;
808 	MUTEX_EXIT(&ifs->ifs_ipf_stinsert);
809 
810 	fr_setstatequeue(is, rev, ifs);
811 }
812 
813 
814 /* ------------------------------------------------------------------------ */
815 /* Function:    fr_addstate                                                 */
816 /* Returns:     ipstate_t* - NULL == failure, else pointer to new state     */
817 /* Parameters:  fin(I)    - pointer to packet information                   */
818 /*              stsave(O) - pointer to place to save pointer to created     */
819 /*                          state structure.                                */
820 /*              flags(I)  - flags to use when creating the structure        */
821 /*                                                                          */
822 /* Creates a new IP state structure from the packet information collected.  */
823 /* Inserts it into the state table and appends to the bottom of the active  */
824 /* list.  If the capacity of the table has reached the maximum allowed then */
825 /* the call will fail and a flush is scheduled for the next timeout call.   */
826 /* ------------------------------------------------------------------------ */
827 ipstate_t *fr_addstate(fin, stsave, flags)
828 fr_info_t *fin;
829 ipstate_t **stsave;
830 u_int flags;
831 {
832 	ipstate_t *is, ips;
833 	struct icmp *ic;
834 	u_int pass, hv;
835 	frentry_t *fr;
836 	tcphdr_t *tcp;
837 	grehdr_t *gre;
838 	void *ifp;
839 	int out;
840 	ipf_stack_t *ifs = fin->fin_ifs;
841 
842 	if (ifs->ifs_fr_state_lock ||
843 	    (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
844 		return NULL;
845 
846 	if ((fin->fin_flx & FI_OOW) && !(fin->fin_tcpf & TH_SYN))
847 		return NULL;
848 
849 	/*
850 	 * If a "keep state" rule has reached the maximum number of references
851 	 * to it, then schedule an automatic flush in case we can clear out
852 	 * some "dead old wood".  Note that because the lock isn't held on
853 	 * fr it is possible that we could overflow.  The cost of overflowing
854 	 * is being ignored here as the number by which it can overflow is
855 	 * a product of the number of simultaneous threads that could be
856 	 * executing in here, so a limit of 100 won't result in 200, but could
857 	 * result in 101 or 102.
858 	 */
859 	fr = fin->fin_fr;
860 	if (fr != NULL) {
861 		if ((ifs->ifs_ips_num == ifs->ifs_fr_statemax) && (fr->fr_statemax == 0)) {
862 			ATOMIC_INCL(ifs->ifs_ips_stats.iss_max);
863 			ifs->ifs_fr_state_doflush = 1;
864 			return NULL;
865 		}
866 		if ((fr->fr_statemax != 0) &&
867 		    (fr->fr_statecnt >= fr->fr_statemax)) {
868 			ATOMIC_INCL(ifs->ifs_ips_stats.iss_maxref);
869 			ifs->ifs_fr_state_doflush = 1;
870 			return NULL;
871 		}
872 	}
873 
874 	pass = (fr == NULL) ? 0 : fr->fr_flags;
875 
876 	ic = NULL;
877 	tcp = NULL;
878 	out = fin->fin_out;
879 	is = &ips;
880 	bzero((char *)is, sizeof(*is));
881 	is->is_die = 1 + ifs->ifs_fr_ticks;
882 
883 	/*
884 	 * Copy and calculate...
885 	 */
886 	hv = (is->is_p = fin->fin_fi.fi_p);
887 	is->is_src = fin->fin_fi.fi_src;
888 	hv += is->is_saddr;
889 	is->is_dst = fin->fin_fi.fi_dst;
890 	hv += is->is_daddr;
891 #ifdef	USE_INET6
892 	if (fin->fin_v == 6) {
893 		/*
894 		 * For ICMPv6, we check to see if the destination address is
895 		 * a multicast address.  If it is, do not include it in the
896 		 * calculation of the hash because the correct reply will come
897 		 * back from a real address, not a multicast address.
898 		 */
899 		if ((is->is_p == IPPROTO_ICMPV6) &&
900 		    IN6_IS_ADDR_MULTICAST(&is->is_dst.in6)) {
901 			/*
902 			 * So you can do keep state with neighbour discovery.
903 			 *
904 			 * Here we could use the address from the neighbour
905 			 * solicit message to put in the state structure and
906 			 * we could use that without a wildcard flag too...
907 			 */
908 			is->is_flags |= SI_W_DADDR;
909 			hv -= is->is_daddr;
910 		} else {
911 			hv += is->is_dst.i6[1];
912 			hv += is->is_dst.i6[2];
913 			hv += is->is_dst.i6[3];
914 		}
915 		hv += is->is_src.i6[1];
916 		hv += is->is_src.i6[2];
917 		hv += is->is_src.i6[3];
918 	}
919 #endif
920 
921 	switch (is->is_p)
922 	{
923 #ifdef	USE_INET6
924 	case IPPROTO_ICMPV6 :
925 		ic = fin->fin_dp;
926 
927 		switch (ic->icmp_type)
928 		{
929 		case ICMP6_ECHO_REQUEST :
930 			is->is_icmp.ici_type = ic->icmp_type;
931 			hv += (is->is_icmp.ici_id = ic->icmp_id);
932 			break;
933 		case ICMP6_MEMBERSHIP_QUERY :
934 		case ND_ROUTER_SOLICIT :
935 		case ND_NEIGHBOR_SOLICIT :
936 		case ICMP6_NI_QUERY :
937 			is->is_icmp.ici_type = ic->icmp_type;
938 			break;
939 		default :
940 			return NULL;
941 		}
942 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_icmp);
943 		break;
944 #endif
945 	case IPPROTO_ICMP :
946 		ic = fin->fin_dp;
947 
948 		switch (ic->icmp_type)
949 		{
950 		case ICMP_ECHO :
951 		case ICMP_TSTAMP :
952 		case ICMP_IREQ :
953 		case ICMP_MASKREQ :
954 			is->is_icmp.ici_type = ic->icmp_type;
955 			hv += (is->is_icmp.ici_id = ic->icmp_id);
956 			break;
957 		default :
958 			return NULL;
959 		}
960 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_icmp);
961 		break;
962 
963 	case IPPROTO_GRE :
964 		gre = fin->fin_dp;
965 
966 		is->is_gre.gs_flags = gre->gr_flags;
967 		is->is_gre.gs_ptype = gre->gr_ptype;
968 		if (GRE_REV(is->is_gre.gs_flags) == 1) {
969 			is->is_call[0] = fin->fin_data[0];
970 			is->is_call[1] = fin->fin_data[1];
971 		}
972 		break;
973 
974 	case IPPROTO_TCP :
975 		tcp = fin->fin_dp;
976 
977 		if (tcp->th_flags & TH_RST)
978 			return NULL;
979 		/*
980 		 * The endian of the ports doesn't matter, but the ack and
981 		 * sequence numbers do as we do mathematics on them later.
982 		 */
983 		is->is_sport = htons(fin->fin_data[0]);
984 		is->is_dport = htons(fin->fin_data[1]);
985 		if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
986 			hv += is->is_sport;
987 			hv += is->is_dport;
988 		}
989 
990 		/*
991 		 * If this is a real packet then initialise fields in the
992 		 * state information structure from the TCP header information.
993 		 */
994 
995 		is->is_maxdwin = 1;
996 		is->is_maxswin = ntohs(tcp->th_win);
997 		if (is->is_maxswin == 0)
998 			is->is_maxswin = 1;
999 
1000 		if ((fin->fin_flx & FI_IGNORE) == 0) {
1001 			is->is_send = ntohl(tcp->th_seq) + fin->fin_dlen -
1002 				      (TCP_OFF(tcp) << 2) +
1003 				      ((tcp->th_flags & TH_SYN) ? 1 : 0) +
1004 				      ((tcp->th_flags & TH_FIN) ? 1 : 0);
1005 			is->is_maxsend = is->is_send;
1006 
1007 			/*
1008 			 * Window scale option is only present in
1009 			 * SYN/SYN-ACK packet.
1010 			 */
1011 			if ((tcp->th_flags & ~(TH_FIN|TH_ACK|TH_ECNALL)) ==
1012 			    TH_SYN &&
1013 			    (TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2))) {
1014 				if (fr_tcpoptions(fin, tcp,
1015 					      &is->is_tcp.ts_data[0]))
1016 					is->is_swinflags = TCP_WSCALE_SEEN|
1017 							   TCP_WSCALE_FIRST;
1018 			}
1019 
1020 			if ((fin->fin_out != 0) && (pass & FR_NEWISN) != 0) {
1021 				fr_checknewisn(fin, is);
1022 				fr_fixoutisn(fin, is);
1023 			}
1024 
1025 			if ((tcp->th_flags & TH_OPENING) == TH_SYN)
1026 				flags |= IS_TCPFSM;
1027 			else {
1028 				is->is_maxdwin = is->is_maxswin * 2;
1029 				is->is_dend = ntohl(tcp->th_ack);
1030 				is->is_maxdend = ntohl(tcp->th_ack);
1031 				is->is_maxdwin *= 2;
1032 			}
1033 		}
1034 
1035 		/*
1036 		 * If we're creating state for a starting connection, start the
1037 		 * timer on it as we'll never see an error if it fails to
1038 		 * connect.
1039 		 */
1040 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_tcp);
1041 		break;
1042 
1043 	case IPPROTO_UDP :
1044 		tcp = fin->fin_dp;
1045 
1046 		is->is_sport = htons(fin->fin_data[0]);
1047 		is->is_dport = htons(fin->fin_data[1]);
1048 		if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
1049 			hv += tcp->th_dport;
1050 			hv += tcp->th_sport;
1051 		}
1052 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_udp);
1053 		break;
1054 
1055 	default :
1056 		break;
1057 	}
1058 	hv = DOUBLE_HASH(hv, ifs);
1059 	is->is_hv = hv;
1060 	is->is_rule = fr;
1061 	is->is_flags = flags & IS_INHERITED;
1062 
1063 	/*
1064 	 * Look for identical state.
1065 	 */
1066 	for (is = ifs->ifs_ips_table[is->is_hv % ifs->ifs_fr_statesize];
1067 	     is != NULL;
1068 	     is = is->is_hnext) {
1069 		if (bcmp(&ips.is_src, &is->is_src,
1070 			 offsetof(struct ipstate, is_ps) -
1071 			 offsetof(struct ipstate, is_src)) == 0)
1072 			break;
1073 	}
1074 	if (is != NULL)
1075 		return NULL;
1076 
1077 	if (ifs->ifs_ips_stats.iss_bucketlen[hv] >= ifs->ifs_fr_state_maxbucket) {
1078 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_bucketfull);
1079 		return NULL;
1080 	}
1081 	KMALLOC(is, ipstate_t *);
1082 	if (is == NULL) {
1083 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_nomem);
1084 		return NULL;
1085 	}
1086 	bcopy((char *)&ips, (char *)is, sizeof(*is));
1087 	/*
1088 	 * Do not do the modulous here, it is done in fr_stinsert().
1089 	 */
1090 	if (fr != NULL) {
1091 		(void) strncpy(is->is_group, fr->fr_group, FR_GROUPLEN);
1092 		if (fr->fr_age[0] != 0) {
1093 			is->is_tqehead[0] =
1094 			    fr_addtimeoutqueue(&ifs->ifs_ips_utqe,
1095 					       fr->fr_age[0], ifs);
1096 			is->is_sti.tqe_flags |= TQE_RULEBASED;
1097 		}
1098 		if (fr->fr_age[1] != 0) {
1099 			is->is_tqehead[1] =
1100 			    fr_addtimeoutqueue(&ifs->ifs_ips_utqe,
1101 					       fr->fr_age[1], ifs);
1102 			is->is_sti.tqe_flags |= TQE_RULEBASED;
1103 		}
1104 		is->is_tag = fr->fr_logtag;
1105 
1106 		is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1];
1107 		is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2];
1108 		is->is_ifp[((1 - out) << 1) + 1] = fr->fr_ifas[3];
1109 
1110 		if (((ifp = fr->fr_ifas[1]) != NULL) &&
1111 		    (ifp != (void *)-1)) {
1112 			COPYIFNAME(ifp, is->is_ifname[(out << 1) + 1], fr->fr_v);
1113 		}
1114 		if (((ifp = fr->fr_ifas[2]) != NULL) &&
1115 		    (ifp != (void *)-1)) {
1116 			COPYIFNAME(ifp, is->is_ifname[(1 - out) << 1], fr->fr_v);
1117 		}
1118 		if (((ifp = fr->fr_ifas[3]) != NULL) &&
1119 		    (ifp != (void *)-1)) {
1120 			COPYIFNAME(ifp, is->is_ifname[((1 - out) << 1) + 1], fr->fr_v);
1121 		}
1122 	} else {
1123 		pass = ifs->ifs_fr_flags;
1124 		is->is_tag = FR_NOLOGTAG;
1125 	}
1126 
1127 	is->is_ifp[out << 1] = fin->fin_ifp;
1128 	if (fin->fin_ifp != NULL) {
1129 		COPYIFNAME(fin->fin_ifp, is->is_ifname[out << 1], fr->fr_v);
1130 	}
1131 
1132 	/*
1133 	 * It may seem strange to set is_ref to 2, but fr_check() will call
1134 	 * fr_statederef() after calling fr_addstate() and the idea is to
1135 	 * have it exist at the end of fr_check() with is_ref == 1.
1136 	 */
1137 	is->is_ref = 2;
1138 	is->is_pass = pass;
1139 	is->is_pkts[0] = 0, is->is_bytes[0] = 0;
1140 	is->is_pkts[1] = 0, is->is_bytes[1] = 0;
1141 	is->is_pkts[2] = 0, is->is_bytes[2] = 0;
1142 	is->is_pkts[3] = 0, is->is_bytes[3] = 0;
1143 	if ((fin->fin_flx & FI_IGNORE) == 0) {
1144 		is->is_pkts[out] = 1;
1145 		is->is_bytes[out] = fin->fin_plen;
1146 		is->is_flx[out][0] = fin->fin_flx & FI_CMP;
1147 		is->is_flx[out][0] &= ~FI_OOW;
1148 	}
1149 
1150 	if (pass & FR_STSTRICT)
1151 		is->is_flags |= IS_STRICT;
1152 
1153 	if (pass & FR_STATESYNC)
1154 		is->is_flags |= IS_STATESYNC;
1155 
1156 	/*
1157 	 * We want to check everything that is a property of this packet,
1158 	 * but we don't (automatically) care about it's fragment status as
1159 	 * this may change.
1160 	 */
1161 	is->is_v = fin->fin_v;
1162 	is->is_opt[0] = fin->fin_optmsk;
1163 	is->is_optmsk[0] = 0xffffffff;
1164 	is->is_optmsk[1] = 0xffffffff;
1165 	if (is->is_v == 6) {
1166 		is->is_opt[0] &= ~0x8;
1167 		is->is_optmsk[0] &= ~0x8;
1168 		is->is_optmsk[1] &= ~0x8;
1169 	}
1170 	is->is_sec = fin->fin_secmsk;
1171 	is->is_secmsk = 0xffff;
1172 	is->is_auth = fin->fin_auth;
1173 	is->is_authmsk = 0xffff;
1174 	if (flags & (SI_WILDP|SI_WILDA)) {
1175 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_wild);
1176 	}
1177 	is->is_rulen = fin->fin_rule;
1178 
1179 
1180 	if (pass & FR_LOGFIRST)
1181 		is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
1182 
1183 	READ_ENTER(&ifs->ifs_ipf_state);
1184 	is->is_me = stsave;
1185 
1186 	fr_stinsert(is, fin->fin_rev, ifs);
1187 
1188 	if (fin->fin_p == IPPROTO_TCP) {
1189 		/*
1190 		* If we're creating state for a starting connection, start the
1191 		* timer on it as we'll never see an error if it fails to
1192 		* connect.
1193 		*/
1194 		(void) fr_tcp_age(&is->is_sti, fin, ifs->ifs_ips_tqtqb,
1195 				  is->is_flags);
1196 		MUTEX_EXIT(&is->is_lock);
1197 #ifdef	IPFILTER_SCAN
1198 		if ((is->is_flags & SI_CLONE) == 0)
1199 			(void) ipsc_attachis(is);
1200 #endif
1201 	} else {
1202 		MUTEX_EXIT(&is->is_lock);
1203 	}
1204 #ifdef	IPFILTER_SYNC
1205 	if ((is->is_flags & IS_STATESYNC) && ((is->is_flags & SI_CLONE) == 0))
1206 		is->is_sync = ipfsync_new(SMC_STATE, fin, is);
1207 #endif
1208 	if (ifs->ifs_ipstate_logging)
1209 		ipstate_log(is, ISL_NEW, ifs);
1210 
1211 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
1212 	fin->fin_state = is;
1213 	fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr);
1214 	fin->fin_flx |= FI_STATE;
1215 	if (fin->fin_flx & FI_FRAG)
1216 		(void) fr_newfrag(fin, pass ^ FR_KEEPSTATE);
1217 
1218 	return is;
1219 }
1220 
1221 
1222 /* ------------------------------------------------------------------------ */
1223 /* Function:    fr_tcpoptions                                               */
1224 /* Returns:     int - 1 == packet matches state entry, 0 == it does not     */
1225 /* Parameters:  fin(I) - pointer to packet information                      */
1226 /*              tcp(I) - pointer to TCP packet header                       */
1227 /*              td(I)  - pointer to TCP data held as part of the state      */
1228 /*                                                                          */
1229 /* Look after the TCP header for any options and deal with those that are   */
1230 /* present.  Record details about those that we recogise.                   */
1231 /* ------------------------------------------------------------------------ */
1232 static int fr_tcpoptions(fin, tcp, td)
1233 fr_info_t *fin;
1234 tcphdr_t *tcp;
1235 tcpdata_t *td;
1236 {
1237 	int off, mlen, ol, i, len, retval;
1238 	char buf[64], *s, opt;
1239 	mb_t *m = NULL;
1240 
1241 	len = (TCP_OFF(tcp) << 2);
1242 	if (fin->fin_dlen < len)
1243 		return 0;
1244 	len -= sizeof(*tcp);
1245 
1246 	off = fin->fin_plen - fin->fin_dlen + sizeof(*tcp) + fin->fin_ipoff;
1247 
1248 	m = fin->fin_m;
1249 	mlen = MSGDSIZE(m) - off;
1250 	if (len > mlen) {
1251 		len = mlen;
1252 		retval = 0;
1253 	} else {
1254 		retval = 1;
1255 	}
1256 
1257 	COPYDATA(m, off, len, buf);
1258 
1259 	for (s = buf; len > 0; ) {
1260 		opt = *s;
1261 		if (opt == TCPOPT_EOL)
1262 			break;
1263 		else if (opt == TCPOPT_NOP)
1264 			ol = 1;
1265 		else {
1266 			if (len < 2)
1267 				break;
1268 			ol = (int)*(s + 1);
1269 			if (ol < 2 || ol > len)
1270 				break;
1271 
1272 			/*
1273 			 * Extract the TCP options we are interested in out of
1274 			 * the header and store them in the the tcpdata struct.
1275 			 */
1276 			switch (opt)
1277 			{
1278 			case TCPOPT_WINDOW :
1279 				if (ol == TCPOLEN_WINDOW) {
1280 					i = (int)*(s + 2);
1281 					if (i > TCP_WSCALE_MAX)
1282 						i = TCP_WSCALE_MAX;
1283 					else if (i < 0)
1284 						i = 0;
1285 					td->td_winscale = i;
1286 				}
1287 				break;
1288 			case TCPOPT_MAXSEG :
1289 				/*
1290 				 * So, if we wanted to set the TCP MAXSEG,
1291 				 * it should be done here...
1292 				 */
1293 				if (ol == TCPOLEN_MAXSEG) {
1294 					i = (int)*(s + 2);
1295 					i <<= 8;
1296 					i += (int)*(s + 3);
1297 					td->td_maxseg = i;
1298 				}
1299 				break;
1300 			}
1301 		}
1302 		len -= ol;
1303 		s += ol;
1304 	}
1305 	return retval;
1306 }
1307 
1308 
1309 /* ------------------------------------------------------------------------ */
1310 /* Function:    fr_tcpstate                                                 */
1311 /* Returns:     int - 1 == packet matches state entry, 0 == it does not     */
1312 /* Parameters:  fin(I)   - pointer to packet information                    */
1313 /*              tcp(I)   - pointer to TCP packet header                     */
1314 /*              is(I)  - pointer to master state structure                  */
1315 /*                                                                          */
1316 /* Check to see if a packet with TCP headers fits within the TCP window.    */
1317 /* Change timeout depending on whether new packet is a SYN-ACK returning    */
1318 /* for a SYN or a RST or FIN which indicate time to close up shop.          */
1319 /* ------------------------------------------------------------------------ */
1320 static int fr_tcpstate(fin, tcp, is)
1321 fr_info_t *fin;
1322 tcphdr_t *tcp;
1323 ipstate_t *is;
1324 {
1325 	int source, ret = 0, flags;
1326 	tcpdata_t  *fdata, *tdata;
1327 	ipf_stack_t *ifs = fin->fin_ifs;
1328 
1329 	source = !fin->fin_rev;
1330 	if (((is->is_flags & IS_TCPFSM) != 0) && (source == 1) &&
1331 	    (ntohs(is->is_sport) != fin->fin_data[0]))
1332 		source = 0;
1333 	fdata = &is->is_tcp.ts_data[!source];
1334 	tdata = &is->is_tcp.ts_data[source];
1335 
1336 	MUTEX_ENTER(&is->is_lock);
1337 
1338 	/*
1339 	 * If a SYN packet is received for a connection that is in a half
1340 	 * closed state, then move its state entry to deletetq. In such case
1341 	 * the SYN packet will be consequently dropped. This allows new state
1342 	 * entry to be created with a retransmited SYN packet.
1343 	 */
1344 	if ((tcp->th_flags & TH_OPENING) == TH_SYN) {
1345 		if (((is->is_state[source] > IPF_TCPS_ESTABLISHED) ||
1346 		    (is->is_state[source] == IPF_TCPS_CLOSED)) &&
1347 		    ((is->is_state[!source] > IPF_TCPS_ESTABLISHED) ||
1348 		    (is->is_state[!source] == IPF_TCPS_CLOSED))) {
1349 			/*
1350 			 * Do not update is->is_sti.tqe_die in case state entry
1351 			 * is already present in deletetq. It prevents state
1352 			 * entry ttl update by retransmitted SYN packets, which
1353 			 * may arrive before timer tick kicks off. The SYN
1354 			 * packet will be dropped again.
1355 			 */
1356 			if (is->is_sti.tqe_ifq != &ifs->ifs_ips_deletetq)
1357 				fr_movequeue(&is->is_sti, is->is_sti.tqe_ifq,
1358 					&fin->fin_ifs->ifs_ips_deletetq,
1359 					fin->fin_ifs);
1360 
1361 			MUTEX_EXIT(&is->is_lock);
1362 			return 0;
1363 		}
1364 	}
1365 
1366 	if (fr_tcpinwindow(fin, fdata, tdata, tcp, is->is_flags)) {
1367 #ifdef	IPFILTER_SCAN
1368 		if (is->is_flags & (IS_SC_CLIENT|IS_SC_SERVER)) {
1369 			ipsc_packet(fin, is);
1370 			if (FR_ISBLOCK(is->is_pass)) {
1371 				MUTEX_EXIT(&is->is_lock);
1372 				return 1;
1373 			}
1374 		}
1375 #endif
1376 
1377 		/*
1378 		 * Nearing end of connection, start timeout.
1379 		 */
1380 		ret = fr_tcp_age(&is->is_sti, fin, ifs->ifs_ips_tqtqb,
1381 				 is->is_flags);
1382 		if (ret == 0) {
1383 			MUTEX_EXIT(&is->is_lock);
1384 			return 0;
1385 		}
1386 
1387 		/*
1388 		 * set s0's as appropriate.  Use syn-ack packet as it
1389 		 * contains both pieces of required information.
1390 		 */
1391 		/*
1392 		 * Window scale option is only present in SYN/SYN-ACK packet.
1393 		 * Compare with ~TH_FIN to mask out T/TCP setups.
1394 		 */
1395 		flags = tcp->th_flags & ~(TH_FIN|TH_ECNALL);
1396 		if (flags == (TH_SYN|TH_ACK)) {
1397 			is->is_s0[source] = ntohl(tcp->th_ack);
1398 			is->is_s0[!source] = ntohl(tcp->th_seq) + 1;
1399 			if ((TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2)) &&
1400 			    tdata->td_winscale) {
1401 				if (fr_tcpoptions(fin, tcp, fdata)) {
1402 					fdata->td_winflags = TCP_WSCALE_SEEN|
1403 							     TCP_WSCALE_FIRST;
1404 				} else {
1405 					if (!fdata->td_winscale)
1406 						tdata->td_winscale = 0;
1407 				}
1408 			}
1409 			if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1410 				fr_checknewisn(fin, is);
1411 		} else if (flags == TH_SYN) {
1412 			is->is_s0[source] = ntohl(tcp->th_seq) + 1;
1413 			if ((TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2)))
1414 				if (fr_tcpoptions(fin, tcp, tdata)) {
1415 					tdata->td_winflags = TCP_WSCALE_SEEN|
1416 							     TCP_WSCALE_FIRST;
1417 				}
1418 
1419 			if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1420 				fr_checknewisn(fin, is);
1421 
1422 		}
1423 		ret = 1;
1424 	} else
1425 		fin->fin_flx |= FI_OOW;
1426 	MUTEX_EXIT(&is->is_lock);
1427 	return ret;
1428 }
1429 
1430 
1431 /* ------------------------------------------------------------------------ */
1432 /* Function:    fr_checknewisn                                              */
1433 /* Returns:     Nil                                                         */
1434 /* Parameters:  fin(I)   - pointer to packet information                    */
1435 /*              is(I)  - pointer to master state structure                  */
1436 /*                                                                          */
1437 /* Check to see if this TCP connection is expecting and needs a new         */
1438 /* sequence number for a particular direction of the connection.            */
1439 /*                                                                          */
1440 /* NOTE: This does not actually change the sequence numbers, only gets new  */
1441 /* one ready.                                                               */
1442 /* ------------------------------------------------------------------------ */
1443 static void fr_checknewisn(fin, is)
1444 fr_info_t *fin;
1445 ipstate_t *is;
1446 {
1447 	u_32_t sumd, old, new;
1448 	tcphdr_t *tcp;
1449 	int i;
1450 
1451 	i = fin->fin_rev;
1452 	tcp = fin->fin_dp;
1453 
1454 	if (((i == 0) && !(is->is_flags & IS_ISNSYN)) ||
1455 	    ((i == 1) && !(is->is_flags & IS_ISNACK))) {
1456 		old = ntohl(tcp->th_seq);
1457 		new = fr_newisn(fin);
1458 		is->is_isninc[i] = new - old;
1459 		CALC_SUMD(old, new, sumd);
1460 		is->is_sumd[i] = (sumd & 0xffff) + (sumd >> 16);
1461 
1462 		is->is_flags |= ((i == 0) ? IS_ISNSYN : IS_ISNACK);
1463 	}
1464 }
1465 
1466 
1467 /* ------------------------------------------------------------------------ */
1468 /* Function:    fr_tcpinwindow                                              */
1469 /* Returns:     int - 1 == packet inside TCP "window", 0 == not inside.     */
1470 /* Parameters:  fin(I)   - pointer to packet information                    */
1471 /*              fdata(I) - pointer to tcp state informatio (forward)        */
1472 /*              tdata(I) - pointer to tcp state informatio (reverse)        */
1473 /*              tcp(I)   - pointer to TCP packet header                     */
1474 /*                                                                          */
1475 /* Given a packet has matched addresses and ports, check to see if it is    */
1476 /* within the TCP data window.  In a show of generosity, allow packets that */
1477 /* are within the window space behind the current sequence # as well.       */
1478 /* ------------------------------------------------------------------------ */
1479 int fr_tcpinwindow(fin, fdata, tdata, tcp, flags)
1480 fr_info_t *fin;
1481 tcpdata_t  *fdata, *tdata;
1482 tcphdr_t *tcp;
1483 int flags;
1484 {
1485 	tcp_seq seq, ack, end;
1486 	int ackskew, tcpflags;
1487 	u_32_t win, maxwin;
1488 
1489 	/*
1490 	 * Find difference between last checked packet and this packet.
1491 	 */
1492 	tcpflags = tcp->th_flags;
1493 	seq = ntohl(tcp->th_seq);
1494 	ack = ntohl(tcp->th_ack);
1495 	if (tcpflags & TH_SYN)
1496 		win = ntohs(tcp->th_win);
1497 	else
1498 		win = ntohs(tcp->th_win) << fdata->td_winscale;
1499 	if (win == 0)
1500 		win = 1;
1501 
1502 	/*
1503 	 * if window scaling is present, the scaling is only allowed
1504 	 * for windows not in the first SYN packet. In that packet the
1505 	 * window is 65535 to specify the largest window possible
1506 	 * for receivers not implementing the window scale option.
1507 	 * Currently, we do not assume TTCP here. That means that
1508 	 * if we see a second packet from a host (after the initial
1509 	 * SYN), we can assume that the receiver of the SYN did
1510 	 * already send back the SYN/ACK (and thus that we know if
1511 	 * the receiver also does window scaling)
1512 	 */
1513 	if (!(tcpflags & TH_SYN) && (fdata->td_winflags & TCP_WSCALE_FIRST)) {
1514 		if (tdata->td_winflags & TCP_WSCALE_SEEN) {
1515 			fdata->td_winflags &= ~TCP_WSCALE_FIRST;
1516 			fdata->td_maxwin = win;
1517 		} else {
1518 			fdata->td_winscale = 0;
1519 			fdata->td_winflags = 0;
1520 			tdata->td_winscale = 0;
1521 			tdata->td_winflags = 0;
1522 		  }
1523 	}
1524 
1525 	end = seq + fin->fin_dlen - (TCP_OFF(tcp) << 2) +
1526 	      ((tcpflags & TH_SYN) ? 1 : 0) + ((tcpflags & TH_FIN) ? 1 : 0);
1527 
1528 	if ((fdata->td_end == 0) &&
1529 	    (!(flags & IS_TCPFSM) ||
1530 	     ((tcpflags & TH_OPENING) == TH_OPENING))) {
1531 		/*
1532 		 * Must be a (outgoing) SYN-ACK in reply to a SYN.
1533 		 */
1534 		fdata->td_end = end;
1535 		fdata->td_maxwin = 1;
1536 		fdata->td_maxend = end + win;
1537 	}
1538 
1539 	if (!(tcpflags & TH_ACK)) {  /* Pretend an ack was sent */
1540 		ack = tdata->td_end;
1541 	} else if (((tcpflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
1542 		   (ack == 0)) {
1543 		/* gross hack to get around certain broken tcp stacks */
1544 		ack = tdata->td_end;
1545 	}
1546 
1547 	if (seq == end)
1548 		seq = end = fdata->td_end;
1549 
1550 	maxwin = tdata->td_maxwin;
1551 	ackskew = tdata->td_end - ack;
1552 
1553 	/*
1554 	 * Strict sequencing only allows in-order delivery.
1555 	 */
1556 	if ((flags & IS_STRICT) != 0) {
1557 		if (seq != fdata->td_end) {
1558 			return 0;
1559 		}
1560 	}
1561 
1562 #define	SEQ_GE(a,b)	((int)((a) - (b)) >= 0)
1563 #define	SEQ_GT(a,b)	((int)((a) - (b)) > 0)
1564 	if (
1565 #if defined(_KERNEL)
1566 	    (SEQ_GE(fdata->td_maxend, end)) &&
1567 	    (SEQ_GE(seq, fdata->td_end - maxwin)) &&
1568 #endif
1569 /* XXX what about big packets */
1570 #define MAXACKWINDOW 66000
1571 	    (-ackskew <= (MAXACKWINDOW << fdata->td_winscale)) &&
1572 	    ( ackskew <= (MAXACKWINDOW << fdata->td_winscale))) {
1573 
1574 		/* if ackskew < 0 then this should be due to fragmented
1575 		 * packets. There is no way to know the length of the
1576 		 * total packet in advance.
1577 		 * We do know the total length from the fragment cache though.
1578 		 * Note however that there might be more sessions with
1579 		 * exactly the same source and destination parameters in the
1580 		 * state cache (and source and destination is the only stuff
1581 		 * that is saved in the fragment cache). Note further that
1582 		 * some TCP connections in the state cache are hashed with
1583 		 * sport and dport as well which makes it not worthwhile to
1584 		 * look for them.
1585 		 * Thus, when ackskew is negative but still seems to belong
1586 		 * to this session, we bump up the destinations end value.
1587 		 */
1588 		if (ackskew < 0)
1589 			tdata->td_end = ack;
1590 
1591 		/* update max window seen */
1592 		if (fdata->td_maxwin < win)
1593 			fdata->td_maxwin = win;
1594 		if (SEQ_GT(end, fdata->td_end))
1595 			fdata->td_end = end;
1596 		if (SEQ_GE(ack + win, tdata->td_maxend))
1597 			tdata->td_maxend = ack + win;
1598 		return 1;
1599 	}
1600 	return 0;
1601 }
1602 
1603 
1604 /* ------------------------------------------------------------------------ */
1605 /* Function:    fr_stclone                                                  */
1606 /* Returns:     ipstate_t* - NULL == cloning failed,                        */
1607 /*                           else pointer to new state structure            */
1608 /* Parameters:  fin(I) - pointer to packet information                      */
1609 /*              tcp(I) - pointer to TCP/UDP header                          */
1610 /*              is(I)  - pointer to master state structure                  */
1611 /*                                                                          */
1612 /* Create a "duplcate" state table entry from the master.                   */
1613 /* ------------------------------------------------------------------------ */
1614 static ipstate_t *fr_stclone(fin, tcp, is)
1615 fr_info_t *fin;
1616 tcphdr_t *tcp;
1617 ipstate_t *is;
1618 {
1619 	ipstate_t *clone;
1620 	u_32_t send;
1621 	ipf_stack_t *ifs = fin->fin_ifs;
1622 
1623 	if (ifs->ifs_ips_num == ifs->ifs_fr_statemax) {
1624 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_max);
1625 		ifs->ifs_fr_state_doflush = 1;
1626 		return NULL;
1627 	}
1628 	KMALLOC(clone, ipstate_t *);
1629 	if (clone == NULL)
1630 		return NULL;
1631 	bcopy((char *)is, (char *)clone, sizeof(*clone));
1632 
1633 	MUTEX_NUKE(&clone->is_lock);
1634 
1635 	clone->is_die = ONE_DAY + ifs->ifs_fr_ticks;
1636 	clone->is_state[0] = 0;
1637 	clone->is_state[1] = 0;
1638 	send = ntohl(tcp->th_seq) + fin->fin_dlen - (TCP_OFF(tcp) << 2) +
1639 		((tcp->th_flags & TH_SYN) ? 1 : 0) +
1640 		((tcp->th_flags & TH_FIN) ? 1 : 0);
1641 
1642 	if (fin->fin_rev == 1) {
1643 		clone->is_dend = send;
1644 		clone->is_maxdend = send;
1645 		clone->is_send = 0;
1646 		clone->is_maxswin = 1;
1647 		clone->is_maxdwin = ntohs(tcp->th_win);
1648 		if (clone->is_maxdwin == 0)
1649 			clone->is_maxdwin = 1;
1650 	} else {
1651 		clone->is_send = send;
1652 		clone->is_maxsend = send;
1653 		clone->is_dend = 0;
1654 		clone->is_maxdwin = 1;
1655 		clone->is_maxswin = ntohs(tcp->th_win);
1656 		if (clone->is_maxswin == 0)
1657 			clone->is_maxswin = 1;
1658 	}
1659 
1660 	clone->is_flags &= ~SI_CLONE;
1661 	clone->is_flags |= SI_CLONED;
1662 	fr_stinsert(clone, fin->fin_rev, ifs);
1663 	clone->is_ref = 2;
1664 	if (clone->is_p == IPPROTO_TCP) {
1665 		(void) fr_tcp_age(&clone->is_sti, fin, ifs->ifs_ips_tqtqb,
1666 				  clone->is_flags);
1667 	}
1668 	MUTEX_EXIT(&clone->is_lock);
1669 #ifdef	IPFILTER_SCAN
1670 	(void) ipsc_attachis(is);
1671 #endif
1672 #ifdef	IPFILTER_SYNC
1673 	if (is->is_flags & IS_STATESYNC)
1674 		clone->is_sync = ipfsync_new(SMC_STATE, fin, clone);
1675 #endif
1676 	return clone;
1677 }
1678 
1679 
1680 /* ------------------------------------------------------------------------ */
1681 /* Function:    fr_matchsrcdst                                              */
1682 /* Returns:     Nil                                                         */
1683 /* Parameters:  fin(I) - pointer to packet information                      */
1684 /*              is(I)  - pointer to state structure                         */
1685 /*              src(I) - pointer to source address                          */
1686 /*              dst(I) - pointer to destination address                     */
1687 /*              tcp(I) - pointer to TCP/UDP header                          */
1688 /*                                                                          */
1689 /* Match a state table entry against an IP packet.  The logic below is that */
1690 /* ret gets set to one if the match succeeds, else remains 0.  If it is     */
1691 /* still 0 after the test. no match.                                        */
1692 /* ------------------------------------------------------------------------ */
1693 static ipstate_t *fr_matchsrcdst(fin, is, src, dst, tcp, cmask)
1694 fr_info_t *fin;
1695 ipstate_t *is;
1696 i6addr_t *src, *dst;
1697 tcphdr_t *tcp;
1698 u_32_t cmask;
1699 {
1700 	int ret = 0, rev, out, flags, flx = 0, idx;
1701 	u_short sp, dp;
1702 	u_32_t cflx;
1703 	void *ifp;
1704 	ipf_stack_t *ifs = fin->fin_ifs;
1705 
1706 	rev = IP6_NEQ(&is->is_dst, dst);
1707 	ifp = fin->fin_ifp;
1708 	out = fin->fin_out;
1709 	flags = is->is_flags;
1710 	sp = 0;
1711 	dp = 0;
1712 
1713 	if (tcp != NULL) {
1714 		sp = htons(fin->fin_sport);
1715 		dp = ntohs(fin->fin_dport);
1716 	}
1717 	if (!rev) {
1718 		if (tcp != NULL) {
1719 			if (!(flags & SI_W_SPORT) && (sp != is->is_sport))
1720 				rev = 1;
1721 			else if (!(flags & SI_W_DPORT) && (dp != is->is_dport))
1722 				rev = 1;
1723 		}
1724 	}
1725 
1726 	idx = (out << 1) + rev;
1727 
1728 	/*
1729 	 * If the interface for this 'direction' is set, make sure it matches.
1730 	 * An interface name that is not set matches any, as does a name of *.
1731 	 */
1732 	if ((is->is_ifp[idx] == NULL &&
1733 	    (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '*')) ||
1734 	    is->is_ifp[idx] == ifp)
1735 		ret = 1;
1736 
1737 	if (ret == 0)
1738 		return NULL;
1739 	ret = 0;
1740 
1741 	/*
1742 	 * Match addresses and ports.
1743 	 */
1744 	if (rev == 0) {
1745 		if ((IP6_EQ(&is->is_dst, dst) || (flags & SI_W_DADDR)) &&
1746 		    (IP6_EQ(&is->is_src, src) || (flags & SI_W_SADDR))) {
1747 			if (tcp) {
1748 				if ((sp == is->is_sport || flags & SI_W_SPORT)&&
1749 				    (dp == is->is_dport || flags & SI_W_DPORT))
1750 					ret = 1;
1751 			} else {
1752 				ret = 1;
1753 			}
1754 		}
1755 	} else {
1756 		if ((IP6_EQ(&is->is_dst, src) || (flags & SI_W_DADDR)) &&
1757 		    (IP6_EQ(&is->is_src, dst) || (flags & SI_W_SADDR))) {
1758 			if (tcp) {
1759 				if ((dp == is->is_sport || flags & SI_W_SPORT)&&
1760 				    (sp == is->is_dport || flags & SI_W_DPORT))
1761 					ret = 1;
1762 			} else {
1763 				ret = 1;
1764 			}
1765 		}
1766 	}
1767 
1768 	if (ret == 0)
1769 		return NULL;
1770 
1771 	/*
1772 	 * Whether or not this should be here, is questionable, but the aim
1773 	 * is to get this out of the main line.
1774 	 */
1775 	if (tcp == NULL)
1776 		flags = is->is_flags & ~(SI_WILDP|SI_NEWFR|SI_CLONE|SI_CLONED);
1777 
1778 	/*
1779 	 * Only one of the source or destination address can be flaged as a
1780 	 * wildcard.  Fill in the missing address, if set.
1781 	 * For IPv6, if the address being copied in is multicast, then
1782 	 * don't reset the wild flag - multicast causes it to be set in the
1783 	 * first place!
1784 	 */
1785 	if ((flags & (SI_W_SADDR|SI_W_DADDR))) {
1786 		fr_ip_t *fi = &fin->fin_fi;
1787 
1788 		if ((flags & SI_W_SADDR) != 0) {
1789 			if (rev == 0) {
1790 #ifdef USE_INET6
1791 				if (is->is_v == 6 &&
1792 				    IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
1793 					/*EMPTY*/;
1794 				else
1795 #endif
1796 				{
1797 					is->is_src = fi->fi_src;
1798 					is->is_flags &= ~SI_W_SADDR;
1799 				}
1800 			} else {
1801 #ifdef USE_INET6
1802 				if (is->is_v == 6 &&
1803 				    IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
1804 					/*EMPTY*/;
1805 				else
1806 #endif
1807 				{
1808 					is->is_src = fi->fi_dst;
1809 					is->is_flags &= ~SI_W_SADDR;
1810 				}
1811 			}
1812 		} else if ((flags & SI_W_DADDR) != 0) {
1813 			if (rev == 0) {
1814 #ifdef USE_INET6
1815 				if (is->is_v == 6 &&
1816 				    IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
1817 					/*EMPTY*/;
1818 				else
1819 #endif
1820 				{
1821 					is->is_dst = fi->fi_dst;
1822 					is->is_flags &= ~SI_W_DADDR;
1823 				}
1824 			} else {
1825 #ifdef USE_INET6
1826 				if (is->is_v == 6 &&
1827 				    IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
1828 					/*EMPTY*/;
1829 				else
1830 #endif
1831 				{
1832 					is->is_dst = fi->fi_src;
1833 					is->is_flags &= ~SI_W_DADDR;
1834 				}
1835 			}
1836 		}
1837 		if ((is->is_flags & (SI_WILDA|SI_WILDP)) == 0) {
1838 			ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
1839 		}
1840 	}
1841 
1842 	flx = fin->fin_flx & cmask;
1843 	cflx = is->is_flx[out][rev];
1844 
1845 	/*
1846 	 * Match up any flags set from IP options.
1847 	 */
1848 	if ((cflx && (flx != (cflx & cmask))) ||
1849 	    ((fin->fin_optmsk & is->is_optmsk[rev]) != is->is_opt[rev]) ||
1850 	    ((fin->fin_secmsk & is->is_secmsk) != is->is_sec) ||
1851 	    ((fin->fin_auth & is->is_authmsk) != is->is_auth))
1852 		return NULL;
1853 
1854 	/*
1855 	 * Only one of the source or destination port can be flagged as a
1856 	 * wildcard.  When filling it in, fill in a copy of the matched entry
1857 	 * if it has the cloning flag set.
1858 	 */
1859 	if ((fin->fin_flx & FI_IGNORE) != 0) {
1860 		fin->fin_rev = rev;
1861 		return is;
1862 	}
1863 
1864 	if ((flags & (SI_W_SPORT|SI_W_DPORT))) {
1865 		if ((flags & SI_CLONE) != 0) {
1866 			ipstate_t *clone;
1867 
1868 			clone = fr_stclone(fin, tcp, is);
1869 			if (clone == NULL)
1870 				return NULL;
1871 			is = clone;
1872 		} else {
1873 			ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
1874 		}
1875 
1876 		if ((flags & SI_W_SPORT) != 0) {
1877 			if (rev == 0) {
1878 				is->is_sport = sp;
1879 				is->is_send = ntohl(tcp->th_seq);
1880 			} else {
1881 				is->is_sport = dp;
1882 				is->is_send = ntohl(tcp->th_ack);
1883 			}
1884 			is->is_maxsend = is->is_send + 1;
1885 		} else if ((flags & SI_W_DPORT) != 0) {
1886 			if (rev == 0) {
1887 				is->is_dport = dp;
1888 				is->is_dend = ntohl(tcp->th_ack);
1889 			} else {
1890 				is->is_dport = sp;
1891 				is->is_dend = ntohl(tcp->th_seq);
1892 			}
1893 			is->is_maxdend = is->is_dend + 1;
1894 		}
1895 		is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT);
1896 		if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging)
1897 			ipstate_log(is, ISL_CLONE, ifs);
1898 	}
1899 
1900 	ret = -1;
1901 
1902 	if (is->is_flx[out][rev] == 0) {
1903 		is->is_flx[out][rev] = flx;
1904 		is->is_opt[rev] = fin->fin_optmsk;
1905 		if (is->is_v == 6) {
1906 			is->is_opt[rev] &= ~0x8;
1907 			is->is_optmsk[rev] &= ~0x8;
1908 		}
1909 	}
1910 
1911 	/*
1912 	 * Check if the interface name for this "direction" is set and if not,
1913 	 * fill it in.
1914 	 */
1915 	if (is->is_ifp[idx] == NULL &&
1916 	    (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '*')) {
1917 		is->is_ifp[idx] = ifp;
1918 		COPYIFNAME(ifp, is->is_ifname[idx], fin->fin_v);
1919 	}
1920 	fin->fin_rev = rev;
1921 	return is;
1922 }
1923 
1924 
1925 /* ------------------------------------------------------------------------ */
1926 /* Function:    fr_checkicmpmatchingstate                                   */
1927 /* Returns:     Nil                                                         */
1928 /* Parameters:  fin(I) - pointer to packet information                      */
1929 /*                                                                          */
1930 /* If we've got an ICMP error message, using the information stored in the  */
1931 /* ICMP packet, look for a matching state table entry.                      */
1932 /*                                                                          */
1933 /* If we return NULL then no lock on ipf_state is held.                     */
1934 /* If we return non-null then a read-lock on ipf_state is held.             */
1935 /* ------------------------------------------------------------------------ */
1936 static ipstate_t *fr_checkicmpmatchingstate(fin)
1937 fr_info_t *fin;
1938 {
1939 	ipstate_t *is, **isp;
1940 	u_short sport, dport;
1941 	u_char	pr;
1942 	int backward, i, oi;
1943 	i6addr_t dst, src;
1944 	struct icmp *ic;
1945 	u_short savelen;
1946 	icmphdr_t *icmp;
1947 	fr_info_t ofin;
1948 	tcphdr_t *tcp;
1949 	int len;
1950 	ip_t *oip;
1951 	u_int hv;
1952 	ipf_stack_t *ifs = fin->fin_ifs;
1953 
1954 	/*
1955 	 * Does it at least have the return (basic) IP header ?
1956 	 * Is it an actual recognised ICMP error type?
1957 	 * Only a basic IP header (no options) should be with
1958 	 * an ICMP error header.
1959 	 */
1960 	if ((fin->fin_v != 4) || (fin->fin_hlen != sizeof(ip_t)) ||
1961 	    (fin->fin_plen < ICMPERR_MINPKTLEN) ||
1962 	    !(fin->fin_flx & FI_ICMPERR))
1963 		return NULL;
1964 	ic = fin->fin_dp;
1965 
1966 	oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN);
1967 	/*
1968 	 * Check if the at least the old IP header (with options) and
1969 	 * 8 bytes of payload is present.
1970 	 */
1971 	if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((IP_HL(oip) - 5) << 2))
1972 		return NULL;
1973 
1974 	/*
1975 	 * Sanity Checks.
1976 	 */
1977 	len = fin->fin_dlen - ICMPERR_ICMPHLEN;
1978 	if ((len <= 0) || ((IP_HL(oip) << 2) > len))
1979 		return NULL;
1980 
1981 	/*
1982 	 * Is the buffer big enough for all of it ?  It's the size of the IP
1983 	 * header claimed in the encapsulated part which is of concern.  It
1984 	 * may be too big to be in this buffer but not so big that it's
1985 	 * outside the ICMP packet, leading to TCP deref's causing problems.
1986 	 * This is possible because we don't know how big oip_hl is when we
1987 	 * do the pullup early in fr_check() and thus can't guarantee it is
1988 	 * all here now.
1989 	 */
1990 #ifdef  _KERNEL
1991 	{
1992 	mb_t *m;
1993 
1994 	m = fin->fin_m;
1995 # if defined(MENTAT)
1996 	if ((char *)oip + len > (char *)m->b_wptr)
1997 		return NULL;
1998 # else
1999 	if ((char *)oip + len > (char *)fin->fin_ip + m->m_len)
2000 		return NULL;
2001 # endif
2002 	}
2003 #endif
2004 	bcopy((char *)fin, (char *)&ofin, sizeof(*fin));
2005 
2006 	/*
2007 	 * in the IPv4 case we must zero the i6addr union otherwise
2008 	 * the IP6_EQ and IP6_NEQ macros produce the wrong results because
2009 	 * of the 'junk' in the unused part of the union
2010 	 */
2011 	bzero((char *)&src, sizeof(src));
2012 	bzero((char *)&dst, sizeof(dst));
2013 
2014 	/*
2015 	 * we make an fin entry to be able to feed it to
2016 	 * matchsrcdst note that not all fields are encessary
2017 	 * but this is the cleanest way. Note further we fill
2018 	 * in fin_mp such that if someone uses it we'll get
2019 	 * a kernel panic. fr_matchsrcdst does not use this.
2020 	 *
2021 	 * watch out here, as ip is in host order and oip in network
2022 	 * order. Any change we make must be undone afterwards, like
2023 	 * oip->ip_off - it is still in network byte order so fix it.
2024 	 */
2025 	savelen = oip->ip_len;
2026 	oip->ip_len = len;
2027 	oip->ip_off = ntohs(oip->ip_off);
2028 
2029 	ofin.fin_flx = FI_NOCKSUM;
2030 	ofin.fin_v = 4;
2031 	ofin.fin_ip = oip;
2032 	ofin.fin_m = NULL;	/* if dereferenced, panic XXX */
2033 	ofin.fin_mp = NULL;	/* if dereferenced, panic XXX */
2034 	ofin.fin_plen = fin->fin_dlen - ICMPERR_ICMPHLEN;
2035 	(void) fr_makefrip(IP_HL(oip) << 2, oip, &ofin);
2036 	ofin.fin_ifp = fin->fin_ifp;
2037 	ofin.fin_out = !fin->fin_out;
2038 	/*
2039 	 * Reset the short and bad flag here because in fr_matchsrcdst()
2040 	 * the flags for the current packet (fin_flx) are compared against
2041 	 * those for the existing session.
2042 	 */
2043 	ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
2044 
2045 	/*
2046 	 * Put old values of ip_len and ip_off back as we don't know
2047 	 * if we have to forward the packet (or process it again.
2048 	 */
2049 	oip->ip_len = savelen;
2050 	oip->ip_off = htons(oip->ip_off);
2051 
2052 	switch (oip->ip_p)
2053 	{
2054 	case IPPROTO_ICMP :
2055 		/*
2056 		 * an ICMP error can only be generated as a result of an
2057 		 * ICMP query, not as the response on an ICMP error
2058 		 *
2059 		 * XXX theoretically ICMP_ECHOREP and the other reply's are
2060 		 * ICMP query's as well, but adding them here seems strange XXX
2061 		 */
2062 		if ((ofin.fin_flx & FI_ICMPERR) != 0)
2063 		    	return NULL;
2064 
2065 		/*
2066 		 * perform a lookup of the ICMP packet in the state table
2067 		 */
2068 		icmp = (icmphdr_t *)((char *)oip + (IP_HL(oip) << 2));
2069 		hv = (pr = oip->ip_p);
2070 		src.in4 = oip->ip_src;
2071 		hv += src.in4.s_addr;
2072 		dst.in4 = oip->ip_dst;
2073 		hv += dst.in4.s_addr;
2074 		hv += icmp->icmp_id;
2075 		hv = DOUBLE_HASH(hv, ifs);
2076 
2077 		READ_ENTER(&ifs->ifs_ipf_state);
2078 		for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2079 			isp = &is->is_hnext;
2080 			if ((is->is_p != pr) || (is->is_v != 4))
2081 				continue;
2082 			if (is->is_pass & FR_NOICMPERR)
2083 				continue;
2084 			is = fr_matchsrcdst(&ofin, is, &src, &dst,
2085 					    NULL, FI_ICMPCMP);
2086 			if (is != NULL) {
2087 				if ((is->is_pass & FR_NOICMPERR) != 0) {
2088 					RWLOCK_EXIT(&ifs->ifs_ipf_state);
2089 					return NULL;
2090 				}
2091 				/*
2092 				 * i  : the index of this packet (the icmp
2093 				 *      unreachable)
2094 				 * oi : the index of the original packet found
2095 				 *      in the icmp header (i.e. the packet
2096 				 *      causing this icmp)
2097 				 * backward : original packet was backward
2098 				 *      compared to the state
2099 				 */
2100 				backward = IP6_NEQ(&is->is_src, &src);
2101 				fin->fin_rev = !backward;
2102 				i = (!backward << 1) + fin->fin_out;
2103 				oi = (backward << 1) + ofin.fin_out;
2104 				if (is->is_icmppkts[i] > is->is_pkts[oi])
2105 					continue;
2106 				ifs->ifs_ips_stats.iss_hits++;
2107 				is->is_icmppkts[i]++;
2108 				return is;
2109 			}
2110 		}
2111 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
2112 		return NULL;
2113 	case IPPROTO_TCP :
2114 	case IPPROTO_UDP :
2115 		break;
2116 	default :
2117 		return NULL;
2118 	}
2119 
2120 	tcp = (tcphdr_t *)((char *)oip + (IP_HL(oip) << 2));
2121 	dport = tcp->th_dport;
2122 	sport = tcp->th_sport;
2123 
2124 	hv = (pr = oip->ip_p);
2125 	src.in4 = oip->ip_src;
2126 	hv += src.in4.s_addr;
2127 	dst.in4 = oip->ip_dst;
2128 	hv += dst.in4.s_addr;
2129 	hv += dport;
2130 	hv += sport;
2131 	hv = DOUBLE_HASH(hv, ifs);
2132 
2133 	READ_ENTER(&ifs->ifs_ipf_state);
2134 	for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2135 		isp = &is->is_hnext;
2136 		/*
2137 		 * Only allow this icmp though if the
2138 		 * encapsulated packet was allowed through the
2139 		 * other way around. Note that the minimal amount
2140 		 * of info present does not allow for checking against
2141 		 * tcp internals such as seq and ack numbers.   Only the
2142 		 * ports are known to be present and can be even if the
2143 		 * short flag is set.
2144 		 */
2145 		if ((is->is_p == pr) && (is->is_v == 4) &&
2146 		    (is = fr_matchsrcdst(&ofin, is, &src, &dst,
2147 					 tcp, FI_ICMPCMP))) {
2148 			/*
2149 			 * i  : the index of this packet (the icmp unreachable)
2150 			 * oi : the index of the original packet found in the
2151 			 *      icmp header (i.e. the packet causing this icmp)
2152 			 * backward : original packet was backward compared to
2153 			 *            the state
2154 			 */
2155 			backward = IP6_NEQ(&is->is_src, &src);
2156 			fin->fin_rev = !backward;
2157 			i = (!backward << 1) + fin->fin_out;
2158 			oi = (backward << 1) + ofin.fin_out;
2159 
2160 			if (((is->is_pass & FR_NOICMPERR) != 0) ||
2161 			    (is->is_icmppkts[i] > is->is_pkts[oi]))
2162 				break;
2163 			ifs->ifs_ips_stats.iss_hits++;
2164 			is->is_icmppkts[i]++;
2165 			/*
2166 			 * we deliberately do not touch the timeouts
2167 			 * for the accompanying state table entry.
2168 			 * It remains to be seen if that is correct. XXX
2169 			 */
2170 			return is;
2171 		}
2172 	}
2173 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
2174 	return NULL;
2175 }
2176 
2177 
2178 /* ------------------------------------------------------------------------ */
2179 /* Function:    fr_ipsmove                                                  */
2180 /* Returns:     Nil                                                         */
2181 /* Parameters:  is(I) - pointer to state table entry                        */
2182 /*              hv(I) - new hash value for state table entry                */
2183 /* Write Locks: ipf_state                                                   */
2184 /*                                                                          */
2185 /* Move a state entry from one position in the hash table to another.       */
2186 /* ------------------------------------------------------------------------ */
2187 static void fr_ipsmove(is, hv, ifs)
2188 ipstate_t *is;
2189 u_int hv;
2190 ipf_stack_t *ifs;
2191 {
2192 	ipstate_t **isp;
2193 	u_int hvm;
2194 
2195 	ASSERT(rw_read_locked(&ifs->ifs_ipf_state.ipf_lk) == 0);
2196 
2197 	hvm = is->is_hv;
2198 	/*
2199 	 * Remove the hash from the old location...
2200 	 */
2201 	isp = is->is_phnext;
2202 	if (is->is_hnext)
2203 		is->is_hnext->is_phnext = isp;
2204 	*isp = is->is_hnext;
2205 	if (ifs->ifs_ips_table[hvm] == NULL)
2206 		ifs->ifs_ips_stats.iss_inuse--;
2207 	ifs->ifs_ips_stats.iss_bucketlen[hvm]--;
2208 
2209 	/*
2210 	 * ...and put the hash in the new one.
2211 	 */
2212 	hvm = DOUBLE_HASH(hv, ifs);
2213 	is->is_hv = hvm;
2214 	isp = &ifs->ifs_ips_table[hvm];
2215 	if (*isp)
2216 		(*isp)->is_phnext = &is->is_hnext;
2217 	else
2218 		ifs->ifs_ips_stats.iss_inuse++;
2219 	ifs->ifs_ips_stats.iss_bucketlen[hvm]++;
2220 	is->is_phnext = isp;
2221 	is->is_hnext = *isp;
2222 	*isp = is;
2223 }
2224 
2225 
2226 /* ------------------------------------------------------------------------ */
2227 /* Function:    fr_stlookup                                                 */
2228 /* Returns:     ipstate_t* - NULL == no matching state found,               */
2229 /*                           else pointer to state information is returned  */
2230 /* Parameters:  fin(I) - pointer to packet information                      */
2231 /*              tcp(I) - pointer to TCP/UDP header.                         */
2232 /*                                                                          */
2233 /* Search the state table for a matching entry to the packet described by   */
2234 /* the contents of *fin.                                                    */
2235 /*                                                                          */
2236 /* If we return NULL then no lock on ipf_state is held.                     */
2237 /* If we return non-null then a read-lock on ipf_state is held.             */
2238 /* ------------------------------------------------------------------------ */
2239 ipstate_t *fr_stlookup(fin, tcp, ifqp)
2240 fr_info_t *fin;
2241 tcphdr_t *tcp;
2242 ipftq_t **ifqp;
2243 {
2244 	u_int hv, hvm, pr, v, tryagain;
2245 	ipstate_t *is, **isp;
2246 	u_short dport, sport;
2247 	i6addr_t src, dst;
2248 	struct icmp *ic;
2249 	ipftq_t *ifq;
2250 	int oow;
2251 	ipf_stack_t *ifs = fin->fin_ifs;
2252 
2253 	is = NULL;
2254 	ifq = NULL;
2255 	tcp = fin->fin_dp;
2256 	ic = (struct icmp *)tcp;
2257 	hv = (pr = fin->fin_fi.fi_p);
2258 	src = fin->fin_fi.fi_src;
2259 	dst = fin->fin_fi.fi_dst;
2260 	hv += src.in4.s_addr;
2261 	hv += dst.in4.s_addr;
2262 
2263 	v = fin->fin_fi.fi_v;
2264 #ifdef	USE_INET6
2265 	if (v == 6) {
2266 		hv  += fin->fin_fi.fi_src.i6[1];
2267 		hv  += fin->fin_fi.fi_src.i6[2];
2268 		hv  += fin->fin_fi.fi_src.i6[3];
2269 
2270 		if ((fin->fin_p == IPPROTO_ICMPV6) &&
2271 		    IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_dst.in6)) {
2272 			hv -= dst.in4.s_addr;
2273 		} else {
2274 			hv += fin->fin_fi.fi_dst.i6[1];
2275 			hv += fin->fin_fi.fi_dst.i6[2];
2276 			hv += fin->fin_fi.fi_dst.i6[3];
2277 		}
2278 	}
2279 #endif
2280 
2281 	/*
2282 	 * Search the hash table for matching packet header info.
2283 	 */
2284 	switch (pr)
2285 	{
2286 #ifdef	USE_INET6
2287 	case IPPROTO_ICMPV6 :
2288 		tryagain = 0;
2289 		if (v == 6) {
2290 			if ((ic->icmp_type == ICMP6_ECHO_REQUEST) ||
2291 			    (ic->icmp_type == ICMP6_ECHO_REPLY)) {
2292 				hv += ic->icmp_id;
2293 			}
2294 		}
2295 		READ_ENTER(&ifs->ifs_ipf_state);
2296 icmp6again:
2297 		hvm = DOUBLE_HASH(hv, ifs);
2298 		for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2299 			isp = &is->is_hnext;
2300 			if ((is->is_p != pr) || (is->is_v != v))
2301 				continue;
2302 			is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2303 			if (is != NULL &&
2304 			    fr_matchicmpqueryreply(v, &is->is_icmp,
2305 						   ic, fin->fin_rev)) {
2306 				if (fin->fin_rev)
2307 					ifq = &ifs->ifs_ips_icmpacktq;
2308 				else
2309 					ifq = &ifs->ifs_ips_icmptq;
2310 				break;
2311 			}
2312 		}
2313 
2314 		if (is != NULL) {
2315 			if ((tryagain != 0) && !(is->is_flags & SI_W_DADDR)) {
2316 				hv += fin->fin_fi.fi_src.i6[0];
2317 				hv += fin->fin_fi.fi_src.i6[1];
2318 				hv += fin->fin_fi.fi_src.i6[2];
2319 				hv += fin->fin_fi.fi_src.i6[3];
2320 				fr_ipsmove(is, hv, ifs);
2321 				MUTEX_DOWNGRADE(&ifs->ifs_ipf_state);
2322 			}
2323 			break;
2324 		}
2325 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
2326 
2327 		/*
2328 		 * No matching icmp state entry. Perhaps this is a
2329 		 * response to another state entry.
2330 		 *
2331 		 * XXX With some ICMP6 packets, the "other" address is already
2332 		 * in the packet, after the ICMP6 header, and this could be
2333 		 * used in place of the multicast address.  However, taking
2334 		 * advantage of this requires some significant code changes
2335 		 * to handle the specific types where that is the case.
2336 		 */
2337 		if ((ifs->ifs_ips_stats.iss_wild != 0) && (v == 6) && (tryagain == 0) &&
2338 		    !IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_src.in6)) {
2339 			hv -= fin->fin_fi.fi_src.i6[0];
2340 			hv -= fin->fin_fi.fi_src.i6[1];
2341 			hv -= fin->fin_fi.fi_src.i6[2];
2342 			hv -= fin->fin_fi.fi_src.i6[3];
2343 			tryagain = 1;
2344 			WRITE_ENTER(&ifs->ifs_ipf_state);
2345 			goto icmp6again;
2346 		}
2347 
2348 		is = fr_checkicmp6matchingstate(fin);
2349 		if (is != NULL)
2350 			return is;
2351 		break;
2352 #endif
2353 
2354 	case IPPROTO_ICMP :
2355 		if (v == 4) {
2356 			hv += ic->icmp_id;
2357 		}
2358 		hv = DOUBLE_HASH(hv, ifs);
2359 		READ_ENTER(&ifs->ifs_ipf_state);
2360 		for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
2361 			isp = &is->is_hnext;
2362 			if ((is->is_p != pr) || (is->is_v != v))
2363 				continue;
2364 			is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2365 			if (is != NULL &&
2366 			    fr_matchicmpqueryreply(v, &is->is_icmp,
2367 						   ic, fin->fin_rev)) {
2368 				if (fin->fin_rev)
2369 					ifq = &ifs->ifs_ips_icmpacktq;
2370 				else
2371 					ifq = &ifs->ifs_ips_icmptq;
2372 				break;
2373 			}
2374 		}
2375 		if (is == NULL) {
2376 			RWLOCK_EXIT(&ifs->ifs_ipf_state);
2377 		}
2378 		break;
2379 
2380 	case IPPROTO_TCP :
2381 	case IPPROTO_UDP :
2382 		ifqp = NULL;
2383 		sport = htons(fin->fin_data[0]);
2384 		hv += sport;
2385 		dport = htons(fin->fin_data[1]);
2386 		hv += dport;
2387 		oow = 0;
2388 		tryagain = 0;
2389 		READ_ENTER(&ifs->ifs_ipf_state);
2390 retry_tcpudp:
2391 		hvm = DOUBLE_HASH(hv, ifs);
2392 		for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2393 			isp = &is->is_hnext;
2394 			if ((is->is_p != pr) || (is->is_v != v))
2395 				continue;
2396 			fin->fin_flx &= ~FI_OOW;
2397 			is = fr_matchsrcdst(fin, is, &src, &dst, tcp, FI_CMP);
2398 			if (is != NULL) {
2399 				if (pr == IPPROTO_TCP) {
2400 					if (!fr_tcpstate(fin, tcp, is)) {
2401 						oow |= fin->fin_flx & FI_OOW;
2402 						continue;
2403 					}
2404 				}
2405 				break;
2406 			}
2407 		}
2408 		if (is != NULL) {
2409 			if (tryagain &&
2410 			    !(is->is_flags & (SI_CLONE|SI_WILDP|SI_WILDA))) {
2411 				hv += dport;
2412 				hv += sport;
2413 				fr_ipsmove(is, hv, ifs);
2414 				MUTEX_DOWNGRADE(&ifs->ifs_ipf_state);
2415 			}
2416 			break;
2417 		}
2418 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
2419 
2420 		if (!tryagain && ifs->ifs_ips_stats.iss_wild) {
2421 			hv -= dport;
2422 			hv -= sport;
2423 			tryagain = 1;
2424 			WRITE_ENTER(&ifs->ifs_ipf_state);
2425 			goto retry_tcpudp;
2426 		}
2427 		fin->fin_flx |= oow;
2428 		break;
2429 
2430 #if 0
2431 	case IPPROTO_GRE :
2432 		gre = fin->fin_dp;
2433 		if (GRE_REV(gre->gr_flags) == 1) {
2434 			hv += gre->gr_call;
2435 		}
2436 		/* FALLTHROUGH */
2437 #endif
2438 	default :
2439 		ifqp = NULL;
2440 		hvm = DOUBLE_HASH(hv, ifs);
2441 		READ_ENTER(&ifs->ifs_ipf_state);
2442 		for (isp = &ifs->ifs_ips_table[hvm]; ((is = *isp) != NULL); ) {
2443 			isp = &is->is_hnext;
2444 			if ((is->is_p != pr) || (is->is_v != v))
2445 				continue;
2446 			is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2447 			if (is != NULL) {
2448 				ifq = &ifs->ifs_ips_iptq;
2449 				break;
2450 			}
2451 		}
2452 		if (is == NULL) {
2453 			RWLOCK_EXIT(&ifs->ifs_ipf_state);
2454 		}
2455 		break;
2456 	}
2457 
2458 	if ((is != NULL) && ((is->is_sti.tqe_flags & TQE_RULEBASED) != 0) &&
2459 	    (is->is_tqehead[fin->fin_rev] != NULL))
2460 		ifq = is->is_tqehead[fin->fin_rev];
2461 	if (ifq != NULL && ifqp != NULL)
2462 		*ifqp = ifq;
2463 	return is;
2464 }
2465 
2466 
2467 /* ------------------------------------------------------------------------ */
2468 /* Function:    fr_updatestate                                              */
2469 /* Returns:     Nil                                                         */
2470 /* Parameters:  fin(I) - pointer to packet information                      */
2471 /*              is(I)  - pointer to state table entry                       */
2472 /* Read Locks:  ipf_state                                                   */
2473 /*                                                                          */
2474 /* Updates packet and byte counters for a newly received packet.  Seeds the */
2475 /* fragment cache with a new entry as required.                             */
2476 /* ------------------------------------------------------------------------ */
2477 void fr_updatestate(fin, is, ifq)
2478 fr_info_t *fin;
2479 ipstate_t *is;
2480 ipftq_t *ifq;
2481 {
2482 	ipftqent_t *tqe;
2483 	int i, pass;
2484 	ipf_stack_t *ifs = fin->fin_ifs;
2485 
2486 	i = (fin->fin_rev << 1) + fin->fin_out;
2487 
2488 	/*
2489 	 * For TCP packets, ifq == NULL.  For all others, check if this new
2490 	 * queue is different to the last one it was on and move it if so.
2491 	 */
2492 	tqe = &is->is_sti;
2493 	MUTEX_ENTER(&is->is_lock);
2494 	if ((tqe->tqe_flags & TQE_RULEBASED) != 0)
2495 		ifq = is->is_tqehead[fin->fin_rev];
2496 
2497 	if (ifq != NULL)
2498 		fr_movequeue(tqe, tqe->tqe_ifq, ifq, ifs);
2499 
2500 	is->is_pkts[i]++;
2501 	is->is_bytes[i] += fin->fin_plen;
2502 	MUTEX_EXIT(&is->is_lock);
2503 
2504 #ifdef	IPFILTER_SYNC
2505 	if (is->is_flags & IS_STATESYNC)
2506 		ipfsync_update(SMC_STATE, fin, is->is_sync);
2507 #endif
2508 
2509 	ATOMIC_INCL(ifs->ifs_ips_stats.iss_hits);
2510 
2511 	fin->fin_fr = is->is_rule;
2512 
2513 	/*
2514 	 * If this packet is a fragment and the rule says to track fragments,
2515 	 * then create a new fragment cache entry.
2516 	 */
2517 	pass = is->is_pass;
2518 	if ((fin->fin_flx & FI_FRAG) && FR_ISPASS(pass))
2519 		(void) fr_newfrag(fin, pass ^ FR_KEEPSTATE);
2520 }
2521 
2522 
2523 /* ------------------------------------------------------------------------ */
2524 /* Function:    fr_checkstate                                               */
2525 /* Returns:     frentry_t* - NULL == search failed,                         */
2526 /*                           else pointer to rule for matching state        */
2527 /* Parameters:  ifp(I)   - pointer to interface                             */
2528 /*              passp(I) - pointer to filtering result flags                */
2529 /*                                                                          */
2530 /* Check if a packet is associated with an entry in the state table.        */
2531 /* ------------------------------------------------------------------------ */
2532 frentry_t *fr_checkstate(fin, passp)
2533 fr_info_t *fin;
2534 u_32_t *passp;
2535 {
2536 	ipstate_t *is;
2537 	frentry_t *fr;
2538 	tcphdr_t *tcp;
2539 	ipftq_t *ifq;
2540 	u_int pass;
2541 	ipf_stack_t *ifs = fin->fin_ifs;
2542 
2543 	if (ifs->ifs_fr_state_lock || (ifs->ifs_ips_list == NULL) ||
2544 	    (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
2545 		return NULL;
2546 
2547 	is = NULL;
2548 	if ((fin->fin_flx & FI_TCPUDP) ||
2549 	    (fin->fin_fi.fi_p == IPPROTO_ICMP)
2550 #ifdef	USE_INET6
2551 	    || (fin->fin_fi.fi_p == IPPROTO_ICMPV6)
2552 #endif
2553 	    )
2554 		tcp = fin->fin_dp;
2555 	else
2556 		tcp = NULL;
2557 
2558 	/*
2559 	 * Search the hash table for matching packet header info.
2560 	 */
2561 	ifq = NULL;
2562 	is = fin->fin_state;
2563 	if (is == NULL)
2564 		is = fr_stlookup(fin, tcp, &ifq);
2565 	switch (fin->fin_p)
2566 	{
2567 #ifdef	USE_INET6
2568 	case IPPROTO_ICMPV6 :
2569 		if (is != NULL)
2570 			break;
2571 		if (fin->fin_v == 6) {
2572 			is = fr_checkicmp6matchingstate(fin);
2573 			if (is != NULL)
2574 				goto matched;
2575 		}
2576 		break;
2577 #endif
2578 	case IPPROTO_ICMP :
2579 		if (is != NULL)
2580 			break;
2581 		/*
2582 		 * No matching icmp state entry. Perhaps this is a
2583 		 * response to another state entry.
2584 		 */
2585 		is = fr_checkicmpmatchingstate(fin);
2586 		if (is != NULL)
2587 			goto matched;
2588 		break;
2589 	case IPPROTO_TCP :
2590 		if (is == NULL)
2591 			break;
2592 
2593 		if (is->is_pass & FR_NEWISN) {
2594 			if (fin->fin_out == 0)
2595 				fr_fixinisn(fin, is);
2596 			else if (fin->fin_out == 1)
2597 				fr_fixoutisn(fin, is);
2598 		}
2599 		break;
2600 	default :
2601 		if (fin->fin_rev)
2602 			ifq = &ifs->ifs_ips_udpacktq;
2603 		else
2604 			ifq = &ifs->ifs_ips_udptq;
2605 		break;
2606 	}
2607 	if (is == NULL) {
2608 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_miss);
2609 		return NULL;
2610 	}
2611 
2612 matched:
2613 	fr = is->is_rule;
2614 	if (fr != NULL) {
2615 		if ((fin->fin_out == 0) && (fr->fr_nattag.ipt_num[0] != 0)) {
2616 			if (fin->fin_nattag == NULL)
2617 				return NULL;
2618 			if (fr_matchtag(&fr->fr_nattag, fin->fin_nattag) != 0)
2619 				return NULL;
2620 		}
2621 		(void) strncpy(fin->fin_group, fr->fr_group, FR_GROUPLEN);
2622 		fin->fin_icode = fr->fr_icode;
2623 	}
2624 
2625 	fin->fin_rule = is->is_rulen;
2626 	pass = is->is_pass;
2627 	fr_updatestate(fin, is, ifq);
2628 	if (fin->fin_out == 1)
2629 		fin->fin_nat = is->is_nat[fin->fin_rev];
2630 
2631 	fin->fin_state = is;
2632 	is->is_touched = ifs->ifs_fr_ticks;
2633 	MUTEX_ENTER(&is->is_lock);
2634 	is->is_ref++;
2635 	MUTEX_EXIT(&is->is_lock);
2636 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
2637 	fin->fin_flx |= FI_STATE;
2638 	if ((pass & FR_LOGFIRST) != 0)
2639 		pass &= ~(FR_LOGFIRST|FR_LOG);
2640 	*passp = pass;
2641 	return fr;
2642 }
2643 
2644 
2645 /* ------------------------------------------------------------------------ */
2646 /* Function:    fr_fixoutisn                                                */
2647 /* Returns:     Nil                                                         */
2648 /* Parameters:  fin(I)   - pointer to packet information                    */
2649 /*              is(I)  - pointer to master state structure                  */
2650 /*                                                                          */
2651 /* Called only for outbound packets, adjusts the sequence number and the    */
2652 /* TCP checksum to match that change.                                       */
2653 /* ------------------------------------------------------------------------ */
2654 static void fr_fixoutisn(fin, is)
2655 fr_info_t *fin;
2656 ipstate_t *is;
2657 {
2658 	tcphdr_t *tcp;
2659 	int rev;
2660 	u_32_t seq;
2661 
2662 	tcp = fin->fin_dp;
2663 	rev = fin->fin_rev;
2664 	if ((is->is_flags & IS_ISNSYN) != 0) {
2665 		if (rev == 0) {
2666 			seq = ntohl(tcp->th_seq);
2667 			seq += is->is_isninc[0];
2668 			tcp->th_seq = htonl(seq);
2669 			fix_outcksum(&tcp->th_sum, is->is_sumd[0]);
2670 		}
2671 	}
2672 	if ((is->is_flags & IS_ISNACK) != 0) {
2673 		if (rev == 1) {
2674 			seq = ntohl(tcp->th_seq);
2675 			seq += is->is_isninc[1];
2676 			tcp->th_seq = htonl(seq);
2677 			fix_outcksum(&tcp->th_sum, is->is_sumd[1]);
2678 		}
2679 	}
2680 }
2681 
2682 
2683 /* ------------------------------------------------------------------------ */
2684 /* Function:    fr_fixinisn                                                 */
2685 /* Returns:     Nil                                                         */
2686 /* Parameters:  fin(I)   - pointer to packet information                    */
2687 /*              is(I)  - pointer to master state structure                  */
2688 /*                                                                          */
2689 /* Called only for inbound packets, adjusts the acknowledge number and the  */
2690 /* TCP checksum to match that change.                                       */
2691 /* ------------------------------------------------------------------------ */
2692 static void fr_fixinisn(fin, is)
2693 fr_info_t *fin;
2694 ipstate_t *is;
2695 {
2696 	tcphdr_t *tcp;
2697 	int rev;
2698 	u_32_t ack;
2699 
2700 	tcp = fin->fin_dp;
2701 	rev = fin->fin_rev;
2702 	if ((is->is_flags & IS_ISNSYN) != 0) {
2703 		if (rev == 1) {
2704 			ack = ntohl(tcp->th_ack);
2705 			ack -= is->is_isninc[0];
2706 			tcp->th_ack = htonl(ack);
2707 			fix_incksum(&tcp->th_sum, is->is_sumd[0]);
2708 		}
2709 	}
2710 	if ((is->is_flags & IS_ISNACK) != 0) {
2711 		if (rev == 0) {
2712 			ack = ntohl(tcp->th_ack);
2713 			ack -= is->is_isninc[1];
2714 			tcp->th_ack = htonl(ack);
2715 			fix_incksum(&tcp->th_sum, is->is_sumd[1]);
2716 		}
2717 	}
2718 }
2719 
2720 
2721 /* ------------------------------------------------------------------------ */
2722 /* Function:    fr_statesync                                                */
2723 /* Returns:     Nil                                                         */
2724 /* Parameters:  action(I) - type of synchronisation to do                   */
2725 /*              v(I)      - IP version being sync'd (v4 or v6)              */
2726 /*              ifp(I)    - interface identifier associated with action     */
2727 /*              name(I)   - name associated with ifp parameter              */
2728 /*                                                                          */
2729 /* Walk through all state entries and if an interface pointer match is      */
2730 /* found then look it up again, based on its name in case the pointer has   */
2731 /* changed since last time.                                                 */
2732 /*                                                                          */
2733 /* If ifp is passed in as being non-null then we are only doing updates for */
2734 /* existing, matching, uses of it.                                          */
2735 /* ------------------------------------------------------------------------ */
2736 void fr_statesync(action, v, ifp, name, ifs)
2737 int action, v;
2738 void *ifp;
2739 char *name;
2740 ipf_stack_t *ifs;
2741 {
2742 	ipstate_t *is;
2743 	int i;
2744 
2745 	if (ifs->ifs_fr_running <= 0)
2746 		return;
2747 
2748 	WRITE_ENTER(&ifs->ifs_ipf_state);
2749 
2750 	if (ifs->ifs_fr_running <= 0) {
2751 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
2752 		return;
2753 	}
2754 
2755 	switch (action)
2756 	{
2757 	case IPFSYNC_RESYNC :
2758 		for (is = ifs->ifs_ips_list; is; is = is->is_next) {
2759 			if (v != 0 && is->is_v != v)
2760 				continue;
2761 			/*
2762 			 * Look up all the interface names in the state entry.
2763 			 */
2764 			for (i = 0; i < 4; i++) {
2765 				is->is_ifp[i] = fr_resolvenic(is->is_ifname[i],
2766 							      is->is_v, ifs);
2767 			}
2768 		}
2769 		break;
2770 	case IPFSYNC_NEWIFP :
2771 		for (is = ifs->ifs_ips_list; is; is = is->is_next) {
2772 			if (v != 0 && is->is_v != v)
2773 				continue;
2774 			/*
2775 			 * Look up all the interface names in the state entry.
2776 			 */
2777 			for (i = 0; i < 4; i++) {
2778 				if (!strncmp(is->is_ifname[i], name,
2779 					     sizeof(is->is_ifname[i])))
2780 					is->is_ifp[i] = ifp;
2781 			}
2782 		}
2783 		break;
2784 	case IPFSYNC_OLDIFP :
2785 		for (is = ifs->ifs_ips_list; is; is = is->is_next) {
2786 			if (v != 0 && is->is_v != v)
2787 				continue;
2788 			/*
2789 			 * Look up all the interface names in the state entry.
2790 			 */
2791 			for (i = 0; i < 4; i++) {
2792 				if (is->is_ifp[i] == ifp)
2793 					is->is_ifp[i] = (void *)-1;
2794 			}
2795 		}
2796 		break;
2797 	}
2798 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
2799 }
2800 
2801 
2802 /* ------------------------------------------------------------------------ */
2803 /* Function:    fr_delstate                                                 */
2804 /* Returns:     Nil                                                         */
2805 /* Parameters:  is(I)  - pointer to state structure to delete               */
2806 /*              why(I) - if not 0, log reason why it was deleted            */
2807 /* Write Locks: ipf_state/ipf_global                                        */
2808 /*                                                                          */
2809 /* Deletes a state entry from the enumerated list as well as the hash table */
2810 /* and timeout queue lists.  Make adjustments to hash table statistics and  */
2811 /* global counters as required.                                             */
2812 /* ------------------------------------------------------------------------ */
2813 static void fr_delstate(is, why, ifs)
2814 ipstate_t *is;
2815 int why;
2816 ipf_stack_t *ifs;
2817 {
2818 
2819 	ASSERT(rw_write_held(&ifs->ifs_ipf_global.ipf_lk) == 0 ||
2820 		rw_write_held(&ifs->ifs_ipf_state.ipf_lk) == 0);
2821 
2822 	/*
2823 	 * Since we want to delete this, remove it from the state table,
2824 	 * where it can be found & used, first.
2825 	 */
2826 	if (is->is_pnext != NULL) {
2827 		*is->is_pnext = is->is_next;
2828 
2829 		if (is->is_next != NULL)
2830 			is->is_next->is_pnext = is->is_pnext;
2831 
2832 		is->is_pnext = NULL;
2833 		is->is_next = NULL;
2834 	}
2835 
2836 	if (is->is_phnext != NULL) {
2837 		*is->is_phnext = is->is_hnext;
2838 		if (is->is_hnext != NULL)
2839 			is->is_hnext->is_phnext = is->is_phnext;
2840 		if (ifs->ifs_ips_table[is->is_hv] == NULL)
2841 			ifs->ifs_ips_stats.iss_inuse--;
2842 		ifs->ifs_ips_stats.iss_bucketlen[is->is_hv]--;
2843 
2844 		is->is_phnext = NULL;
2845 		is->is_hnext = NULL;
2846 	}
2847 
2848 	/*
2849 	 * Because ifs->ifs_ips_stats.iss_wild is a count of entries in the state
2850 	 * table that have wildcard flags set, only decerement it once
2851 	 * and do it here.
2852 	 */
2853 	if (is->is_flags & (SI_WILDP|SI_WILDA)) {
2854 		if (!(is->is_flags & SI_CLONED)) {
2855 			ATOMIC_DECL(ifs->ifs_ips_stats.iss_wild);
2856 		}
2857 		is->is_flags &= ~(SI_WILDP|SI_WILDA);
2858 	}
2859 
2860 	/*
2861 	 * Next, remove it from the timeout queue it is in.
2862 	 */
2863 	fr_deletequeueentry(&is->is_sti);
2864 
2865 	is->is_me = NULL;
2866 
2867 	/*
2868 	 * If it is still in use by something else, do not go any further,
2869 	 * but note that at this point it is now an orphan.
2870 	 */
2871 	is->is_ref--;
2872 	if (is->is_ref > 0)
2873 		return;
2874 
2875 	if (is->is_tqehead[0] != NULL) {
2876 		if (fr_deletetimeoutqueue(is->is_tqehead[0]) == 0)
2877 			fr_freetimeoutqueue(is->is_tqehead[0], ifs);
2878 	}
2879 	if (is->is_tqehead[1] != NULL) {
2880 		if (fr_deletetimeoutqueue(is->is_tqehead[1]) == 0)
2881 			fr_freetimeoutqueue(is->is_tqehead[1], ifs);
2882 	}
2883 
2884 #ifdef	IPFILTER_SYNC
2885 	if (is->is_sync)
2886 		ipfsync_del(is->is_sync);
2887 #endif
2888 #ifdef	IPFILTER_SCAN
2889 	(void) ipsc_detachis(is);
2890 #endif
2891 
2892 	if (ifs->ifs_ipstate_logging != 0 && why != 0)
2893 		ipstate_log(is, why, ifs);
2894 
2895 	if (is->is_rule != NULL) {
2896 		is->is_rule->fr_statecnt--;
2897 		(void)fr_derefrule(&is->is_rule, ifs);
2898 	}
2899 
2900 	MUTEX_DESTROY(&is->is_lock);
2901 	KFREE(is);
2902 	ifs->ifs_ips_num--;
2903 }
2904 
2905 
2906 /* ------------------------------------------------------------------------ */
2907 /* Function:    fr_timeoutstate                                             */
2908 /* Returns:     Nil                                                         */
2909 /* Parameters:  Nil                                                         */
2910 /*                                                                          */
2911 /* Slowly expire held state for things like UDP and ICMP.  The algorithm    */
2912 /* used here is to keep the queue sorted with the oldest things at the top  */
2913 /* and the youngest at the bottom.  So if the top one doesn't need to be    */
2914 /* expired then neither will any under it.                                  */
2915 /* ------------------------------------------------------------------------ */
2916 void fr_timeoutstate(ifs)
2917 ipf_stack_t *ifs;
2918 {
2919 	ipftq_t *ifq, *ifqnext;
2920 	ipftqent_t *tqe, *tqn;
2921 	ipstate_t *is;
2922 	SPL_INT(s);
2923 
2924 	SPL_NET(s);
2925 	WRITE_ENTER(&ifs->ifs_ipf_state);
2926 	for (ifq = ifs->ifs_ips_tqtqb; ifq != NULL; ifq = ifq->ifq_next)
2927 		for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
2928 			if (tqe->tqe_die > ifs->ifs_fr_ticks)
2929 				break;
2930 			tqn = tqe->tqe_next;
2931 			is = tqe->tqe_parent;
2932 			fr_delstate(is, ISL_EXPIRE, ifs);
2933 		}
2934 
2935 	for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
2936 		ifqnext = ifq->ifq_next;
2937 
2938 		for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
2939 			if (tqe->tqe_die > ifs->ifs_fr_ticks)
2940 				break;
2941 			tqn = tqe->tqe_next;
2942 			is = tqe->tqe_parent;
2943 			fr_delstate(is, ISL_EXPIRE, ifs);
2944 		}
2945 	}
2946 
2947 	for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
2948 		ifqnext = ifq->ifq_next;
2949 
2950 		if (((ifq->ifq_flags & IFQF_DELETE) != 0) &&
2951 		    (ifq->ifq_ref == 0)) {
2952 			fr_freetimeoutqueue(ifq, ifs);
2953 		}
2954 	}
2955 
2956 	if (ifs->ifs_fr_state_doflush) {
2957 		(void) fr_state_flush(2, 0, ifs);
2958 		ifs->ifs_fr_state_doflush = 0;
2959 	}
2960 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
2961 	SPL_X(s);
2962 }
2963 
2964 
2965 /* ------------------------------------------------------------------------ */
2966 /* Function:    fr_state_flush                                              */
2967 /* Returns:     int - number of state entries removed by this call          */
2968 /* Parameters:  which(I) - selects the flush policy, see below              */
/*              proto(I) - if not 0, only entries whose is_v equals this    */
/*                         value are considered in the first pass           */
/*              ifs(I)   - pointer to the ipf stack instance to flush       */
2969 /* Write Locks: ipf_state                                                   */
2970 /*                                                                          */
2971 /* Flush state tables.  Three actions currently defined:                    */
2972 /* which == 0 : flush all state table entries                               */
2973 /* which == 1 : flush TCP connections which have started to close but are   */
2974 /*	      stuck for some reason.                                        */
2975 /* which == 2 : flush TCP connections which have been idle for a long time, */
2976 /*	      starting at > 4 days idle and working back in successive half-*/
2977 /*	      days to at most 12 hours old.  If this fails to free enough   */
2978 /*            slots then work backwards in half hour slots to 30 minutes.   */
2979 /*            If that too fails, then work backwards in 30 second intervals */
2980 /*            for the last 30 minutes to at worst 30 seconds idle.          */
/*                                                                          */
/* In the code below the first pass treats which == 1 and which == 2 the    */
/* same way (remove TCP entries where either side is not ESTABLISHED); only */
/* which == 2 continues on to the timeout queue based pass.                 */
2981 /* ------------------------------------------------------------------------ */
2982 static int fr_state_flush(which, proto, ifs)
2983 int which, proto;
2984 ipf_stack_t *ifs;
2985 {
2986 	ipftq_t *ifq, *ifqnext;
2987 	ipftqent_t *tqe, *tqn;
2988 	ipstate_t *is, **isp;
2989 	int delete, removed;
2990 	long try, maxtick;
2991 	u_long interval;
2992 	SPL_INT(s);
2993 
2994 	removed = 0;
2995 
2996 	SPL_NET(s);
	/*
	 * First pass: walk the enumerated list through a pointer to the
	 * link.  isp is only advanced when the current entry is kept; after
	 * a delete the same link is re-read to pick up the next entry.
	 */
2997 	for (isp = &ifs->ifs_ips_list; ((is = *isp) != NULL); ) {
2998 		delete = 0;
2999 
3000 		if ((proto != 0) && (is->is_v != proto)) {
3001 			isp = &is->is_next;
3002 			continue;
3003 		}
3004 
3005 		switch (which)
3006 		{
3007 		case 0 :
3008 			delete = 1;
3009 			break;
3010 		case 1 :
3011 		case 2 :
			/* Only TCP entries not ESTABLISHED on both sides. */
3012 			if (is->is_p != IPPROTO_TCP)
3013 				break;
3014 			if ((is->is_state[0] != IPF_TCPS_ESTABLISHED) ||
3015 			    (is->is_state[1] != IPF_TCPS_ESTABLISHED))
3016 				delete = 1;
3017 			break;
3018 		}
3019 
3020 		if (delete) {
			/* Account the removal as a "fin" (TCP) or "expire". */
3021 			if (is->is_p == IPPROTO_TCP)
3022 				ifs->ifs_ips_stats.iss_fin++;
3023 			else
3024 				ifs->ifs_ips_stats.iss_expire++;
3025 			fr_delstate(is, ISL_FLUSH, ifs);
3026 			removed++;
3027 		} else
3028 			isp = &is->is_next;
3029 	}
3030 
3031 	if (which != 2) {
3032 		SPL_X(s);
3033 		return removed;
3034 	}
3035 
3036 	/*
3037 	 * Asked to remove inactive entries because the table is full, try
3038 	 * again, 3 times, if first attempt failed with a different criteria
3039 	 * each time.  The order tried in must be in decreasing age.
3040 	 * Another alternative is to implement random drop and drop N entries
3041 	 * at random until N have been freed up.
3042 	 */
	/* Rate limit: skip if the last forced flush was under IPF_TTLVAL(5) ago. */
3043 	if (ifs->ifs_fr_ticks - ifs->ifs_ips_last_force_flush < IPF_TTLVAL(5))
3044 		goto force_flush_skipped;
3045 	ifs->ifs_ips_last_force_flush = ifs->ifs_fr_ticks;
3046 
	/* Start with the largest interval that the current tick count exceeds. */
3047 	if (ifs->ifs_fr_ticks > IPF_TTLVAL(43200))
3048 		interval = IPF_TTLVAL(43200);
3049 	else if (ifs->ifs_fr_ticks > IPF_TTLVAL(1800))
3050 		interval = IPF_TTLVAL(1800);
3051 	else if (ifs->ifs_fr_ticks > IPF_TTLVAL(30))
3052 		interval = IPF_TTLVAL(30);
3053 	else
3054 		interval = IPF_TTLVAL(10);
	/*
	 * NOTE(review): algebraically this expression is just "interval";
	 * confirm that this is the intended starting value for try.
	 */
3055 	try = ifs->ifs_fr_ticks - (ifs->ifs_fr_ticks - interval);
3056 	if (try < 0)
3057 		goto force_flush_skipped;
3058 
3059 	while (removed == 0) {
3060 		maxtick = ifs->ifs_fr_ticks - interval;
3061 		if (maxtick < 0)
3062 			break;
3063 
		/*
		 * Remove queue entries whose tqe_die is not later than try,
		 * stepping try forward by interval until maxtick is reached.
		 * Note that removed is only tested by the outer loop.
		 */
3064 		while (try < maxtick) {
3065 			for (ifq = ifs->ifs_ips_tqtqb; ifq != NULL;
3066 			     ifq = ifq->ifq_next) {
3067 				for (tqn = ifq->ifq_head;
3068 				     ((tqe = tqn) != NULL); ) {
3069 					if (tqe->tqe_die > try)
3070 						break;
3071 					tqn = tqe->tqe_next;
3072 					is = tqe->tqe_parent;
3073 					fr_delstate(is, ISL_EXPIRE, ifs);
3074 					removed++;
3075 				}
3076 			}
3077 
3078 			for (ifq = ifs->ifs_ips_utqe; ifq != NULL; ifq = ifqnext) {
3079 				ifqnext = ifq->ifq_next;
3080 
3081 				for (tqn = ifq->ifq_head;
3082 				     ((tqe = tqn) != NULL); ) {
3083 					if (tqe->tqe_die > try)
3084 						break;
3085 					tqn = tqe->tqe_next;
3086 					is = tqe->tqe_parent;
3087 					fr_delstate(is, ISL_EXPIRE, ifs);
3088 					removed++;
3089 				}
3090 			}
3091 			if (try + interval > maxtick)
3092 				break;
3093 			try += interval;
3094 		}
3095 
		/* Nothing freed: fall back to the next smaller interval, if any. */
3096 		if (removed == 0) {
3097 			if (interval == IPF_TTLVAL(43200)) {
3098 				interval = IPF_TTLVAL(1800);
3099 			} else if (interval == IPF_TTLVAL(1800)) {
3100 				interval = IPF_TTLVAL(30);
3101 			} else if (interval == IPF_TTLVAL(30)) {
3102 				interval = IPF_TTLVAL(10);
3103 			} else {
3104 				break;
3105 			}
3106 		}
3107 	}
3108 force_flush_skipped:
3109 	SPL_X(s);
3110 	return removed;
3111 }
3112 
3113 
3114 
3115 /* ------------------------------------------------------------------------ */
3116 /* Function:    fr_tcp_age                                                  */
3117 /* Returns:     int - 1 == packet accepted by the FSM (a state transition   */
/*                    may have been made), 0 == no change (rejected)        */
3118 /* Parameters:  tqe(I)   - pointer to timeout queue entry information       */
3119 /*              fin(I)   - pointer to packet information                    */
3120 /*              tqtab(I) - TCP timeout queue table this is in               */
3121 /*              flags(I) - flags from state/NAT entry                       */
3122 /*                                                                          */
3123 /* Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29:          */
3124 /*                                                                          */
3125 /* - (try to) base state transitions on real evidence only,                 */
3126 /*   i.e. packets that are sent and have been received by ipfilter;         */
3127 /*   diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used.       */
3128 /*                                                                          */
3129 /* - deal with half-closed connections correctly;                           */
3130 /*                                                                          */
3131 /* - store the state of the source in state[0] such that ipfstat            */
3132 /*   displays the state as source/dest instead of dest/source; the calls    */
3133 /*   to fr_tcp_age have been changed accordingly.                           */
3134 /*                                                                          */
3135 /* Internal Parameters:                                                     */
3136 /*                                                                          */
3137 /*    state[0] = state of source (host that initiated connection)           */
3138 /*    state[1] = state of dest   (host that accepted the connection)        */
3139 /*                                                                          */
3140 /*    dir == 0 : a packet from source to dest                               */
3141 /*    dir == 1 : a packet from dest to source                               */
3142 /*                                                                          */
3143 /* Locking: it is assumed that the parent of the tqe structure is locked.   */
3144 /* ------------------------------------------------------------------------ */
3145 int fr_tcp_age(tqe, fin, tqtab, flags)
3146 ipftqent_t *tqe;
3147 fr_info_t *fin;
3148 ipftq_t *tqtab;
3149 int flags;
3150 {
3151 	int dlen, ostate, nstate, rval, dir;
3152 	u_char tcpflags;
3153 	tcphdr_t *tcp;
3154 	ipf_stack_t *ifs = fin->fin_ifs;
3155 
3156 	tcp = fin->fin_dp;
3157 
3158 	rval = 0;
3159 	dir = fin->fin_rev;
3160 	tcpflags = tcp->th_flags;
	/* Payload length: data length less the TCP header (offset * 4). */
3161 	dlen = fin->fin_dlen - (TCP_OFF(tcp) << 2);
3162 
	/*
	 * A RST with neither PUSH nor payload moves this side straight to
	 * CLOSED; any other RST moves it to CLOSE_WAIT.  ostate is not set
	 * on this path and is not used by it.
	 */
3163 	if (tcpflags & TH_RST) {
3164 		if (!(tcpflags & TH_PUSH) && !dlen)
3165 			nstate = IPF_TCPS_CLOSED;
3166 		else
3167 			nstate = IPF_TCPS_CLOSE_WAIT;
3168 		rval = 1;
3169 	} else {
		/*
		 * ostate is the state recorded for the opposite direction,
		 * nstate starts out as the state recorded for the sender of
		 * this packet and is updated by the switch below.
		 */
3170 		ostate = tqe->tqe_state[1 - dir];
3171 		nstate = tqe->tqe_state[dir];
3172 
3173 		switch (nstate)
3174 		{
3175 		case IPF_TCPS_CLOSED: /* 0 */
3176 			if ((tcpflags & TH_OPENING) == TH_OPENING) {
3177 				/*
3178 				 * 'dir' received an S and sends SA in
3179 				 * response, CLOSED -> SYN_RECEIVED
3180 				 */
3181 				nstate = IPF_TCPS_SYN_RECEIVED;
3182 				rval = 1;
3183 			} else if ((tcpflags & TH_OPENING) == TH_SYN) {
3184 				/* 'dir' sent S, CLOSED -> SYN_SENT */
3185 				nstate = IPF_TCPS_SYN_SENT;
3186 				rval = 1;
3187 			}
3188 			/*
3189 			 * the next piece of code makes it possible to get
3190 			 * already established connections into the state table
3191 			 * after a restart or reload of the filter rules; this
3192 			 * does not work when a strict 'flags S keep state' is
3193 			 * used for tcp connections of course
3194 			 */
3195 			if (((flags & IS_TCPFSM) == 0) &&
3196 			    ((tcpflags & TH_ACKMASK) == TH_ACK)) {
3197 				/*
3198 				 * we saw an A, guess 'dir' is in ESTABLISHED
3199 				 * mode
3200 				 */
3201 				switch (ostate)
3202 				{
3203 				case IPF_TCPS_CLOSED :
3204 				case IPF_TCPS_SYN_RECEIVED :
3205 					nstate = IPF_TCPS_HALF_ESTAB;
3206 					rval = 1;
3207 					break;
3208 				case IPF_TCPS_HALF_ESTAB :
3209 				case IPF_TCPS_ESTABLISHED :
3210 					nstate = IPF_TCPS_ESTABLISHED;
3211 					rval = 1;
3212 					break;
3213 				default :
3214 					break;
3215 				}
3216 			}
3217 			/*
3218 			 * TODO: besides regular ACK packets we can have other
3219 			 * packets as well; it is yet to be determined how we
3220 			 * should initialize the states in those cases
3221 			 */
3222 			break;
3223 
3224 		case IPF_TCPS_LISTEN: /* 1 */
3225 			/* NOT USED */
3226 			break;
3227 
3228 		case IPF_TCPS_SYN_SENT: /* 2 */
3229 			if ((tcpflags & ~(TH_ECN|TH_CWR)) == TH_SYN) {
3230 				/*
3231 				 * A retransmitted SYN packet.  We do not reset
3232 				 * the timeout here to fr_tcptimeout because a
3233 				 * connection connect timeout does not renew
3234 				 * after every packet that is sent.  We need to
3235 				 * set rval so as to indicate the packet has
3236 				 * passed the check for its flags being valid
3237 				 * in the TCP FSM.  Setting rval to 2 has the
3238 				 * result of not resetting the timeout.
3239 				 */
3240 				rval = 2;
3241 			} else if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) ==
3242 				   TH_ACK) {
3243 				/*
3244 				 * we see an A from 'dir' which is in SYN_SENT
3245 				 * state: 'dir' sent an A in response to an SA
3246 				 * which it received, SYN_SENT -> ESTABLISHED
3247 				 */
3248 				nstate = IPF_TCPS_ESTABLISHED;
3249 				rval = 1;
3250 			} else if (tcpflags & TH_FIN) {
3251 				/*
3252 				 * we see an F from 'dir' which is in SYN_SENT
3253 				 * state and wants to close its side of the
3254 				 * connection; SYN_SENT -> FIN_WAIT_1
3255 				 */
3256 				nstate = IPF_TCPS_FIN_WAIT_1;
3257 				rval = 1;
3258 			} else if ((tcpflags & TH_OPENING) == TH_OPENING) {
3259 				/*
3260 				 * we see an SA from 'dir' which is already in
3261 				 * SYN_SENT state, this means we have a
3262 				 * simultaneous open; SYN_SENT -> SYN_RECEIVED
3263 				 */
3264 				nstate = IPF_TCPS_SYN_RECEIVED;
3265 				rval = 1;
3266 			}
3267 			break;
3268 
3269 		case IPF_TCPS_SYN_RECEIVED: /* 3 */
3270 			if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
3271 				/*
3272 				 * we see an A from 'dir' which was in
3273 				 * SYN_RECEIVED state so it must now be in
3274 				 * established state, SYN_RECEIVED ->
3275 				 * ESTABLISHED
3276 				 */
3277 				nstate = IPF_TCPS_ESTABLISHED;
3278 				rval = 1;
3279 			} else if ((tcpflags & ~(TH_ECN|TH_CWR)) ==
3280 				   TH_OPENING) {
3281 				/*
3282 				 * We see an SA from 'dir' which is already in
3283 				 * SYN_RECEIVED state.
3284 				 */
3285 				rval = 2;
3286 			} else if (tcpflags & TH_FIN) {
3287 				/*
3288 				 * we see an F from 'dir' which is in
3289 				 * SYN_RECEIVED state and wants to close its
3290 				 * side of the connection; SYN_RECEIVED ->
3291 				 * FIN_WAIT_1
3292 				 */
3293 				nstate = IPF_TCPS_FIN_WAIT_1;
3294 				rval = 1;
3295 			}
3296 			break;
3297 
3298 		case IPF_TCPS_HALF_ESTAB: /* 4 */
			/*
			 * Promote to ESTABLISHED on a plain ACK, but only
			 * once the other side is at HALF_ESTAB or beyond.
			 */
3299 			if (ostate >= IPF_TCPS_HALF_ESTAB) {
3300 				if ((tcpflags & TH_ACKMASK) == TH_ACK) {
3301 					nstate = IPF_TCPS_ESTABLISHED;
3302 					rval = 1;
3303 				}
3304 			}
3305 
3306 			break;
3307 
3308 		case IPF_TCPS_ESTABLISHED: /* 5 */
3309 			rval = 1;
3310 			if (tcpflags & TH_FIN) {
3311 				/*
3312 				 * 'dir' closed its side of the connection;
3313 				 * this gives us a half-closed connection;
3314 				 * ESTABLISHED -> FIN_WAIT_1
3315 				 */
3316 				nstate = IPF_TCPS_FIN_WAIT_1;
3317 			} else if (tcpflags & TH_ACK) {
3318 				/*
3319 				 * an ACK, should we exclude other flags here?
3320 				 */
3321 				if (ostate == IPF_TCPS_FIN_WAIT_1) {
3322 					/*
3323 					 * We know the other side did an active
3324 					 * close, so we are ACKing the recvd
3325 					 * FIN packet (does the window matching
3326 					 * code guarantee this?) and go into
3327 					 * CLOSE_WAIT state; this gives us a
3328 					 * half-closed connection
3329 					 */
3330 					nstate = IPF_TCPS_CLOSE_WAIT;
3331 				} else if (ostate < IPF_TCPS_CLOSE_WAIT) {
3332 					/*
3333 					 * still a fully established
3334 					 * connection reset timeout
3335 					 */
3336 					nstate = IPF_TCPS_ESTABLISHED;
3337 				}
3338 			}
3339 			break;
3340 
3341 		case IPF_TCPS_CLOSE_WAIT: /* 6 */
3342 			rval = 1;
3343 			if (tcpflags & TH_FIN) {
3344 				/*
3345 				 * application closed and 'dir' sent a FIN,
3346 				 * we're now going into LAST_ACK state
3347 				 */
3348 				nstate = IPF_TCPS_LAST_ACK;
3349 			} else {
3350 				/*
3351 				 * we remain in CLOSE_WAIT because the other
3352 				 * side has closed already and we did not
3353 				 * close our side yet; reset timeout
3354 				 */
3355 				nstate = IPF_TCPS_CLOSE_WAIT;
3356 			}
3357 			break;
3358 
3359 		case IPF_TCPS_FIN_WAIT_1: /* 7 */
3360 			rval = 1;
3361 			if ((tcpflags & TH_ACK) &&
3362 			    ostate > IPF_TCPS_CLOSE_WAIT) {
3363 				/*
3364 				 * if the other side is not active anymore
3365 				 * it has sent us a FIN packet that we are
3366 				 * ack'ing now with an ACK; this means both
3367 				 * sides have now closed the connection and
3368 				 * we go into TIME_WAIT
3369 				 */
3370 				/*
3371 				 * XXX: how do we know we really are ACKing
3372 				 * the FIN packet here? does the window code
3373 				 * guarantee that?
3374 				 */
3375 				nstate = IPF_TCPS_TIME_WAIT;
3376 			} else {
3377 				/*
3378 				 * we closed our side of the connection
3379 				 * already but the other side is still active
3380 				 * (ESTABLISHED/CLOSE_WAIT); continue with
3381 				 * this half-closed connection
3382 				 */
3383 				nstate = IPF_TCPS_FIN_WAIT_1;
3384 			}
3385 			break;
3386 
3387 		case IPF_TCPS_CLOSING: /* 8 */
3388 			/* NOT USED */
3389 			break;
3390 
3391 		case IPF_TCPS_LAST_ACK: /* 9 */
3392 			if (tcpflags & TH_ACK) {
3393 				if ((tcpflags & TH_PUSH) || dlen)
3394 					/*
3395 					 * there is still data to be delivered,
3396 					 * reset timeout
3397 					 */
3398 					rval = 1;
3399 				else
3400 					rval = 2;
3401 			}
3402 			/*
3403 			 * we cannot detect when we go out of LAST_ACK state to
3404 			 * CLOSED because that is based on the reception of ACK
3405 			 * packets; ipfilter can only detect that a packet
3406 			 * has been sent by a host
3407 			 */
3408 			break;
3409 
3410 		case IPF_TCPS_FIN_WAIT_2: /* 10 */
			/*
			 * A SYN seen in this state restarts the FSM for this
			 * side: SA -> SYN_RECEIVED, S alone -> SYN_SENT.
			 */
3411 			rval = 1;
3412 			if ((tcpflags & TH_OPENING) == TH_OPENING)
3413 				nstate = IPF_TCPS_SYN_RECEIVED;
3414 			else if (tcpflags & TH_SYN)
3415 				nstate = IPF_TCPS_SYN_SENT;
3416 			break;
3417 
3418 		case IPF_TCPS_TIME_WAIT: /* 11 */
3419 			/* we're in 2MSL timeout now */
3420 			rval = 1;
3421 			break;
3422 
3423 		default :
			/*
			 * Unknown state value: report it in the kernel, abort
			 * in a user-space build.  rval stays 0.
			 */
3424 #if defined(_KERNEL)
3425 # if SOLARIS
3426 			cmn_err(CE_NOTE,
3427 				"tcp %lx flags %x si %lx nstate %d ostate %d\n",
3428 				(u_long)tcp, tcpflags, (u_long)tqe,
3429 				nstate, ostate);
3430 # else
3431 			printf("tcp %lx flags %x si %lx nstate %d ostate %d\n",
3432 				(u_long)tcp, tcpflags, (u_long)tqe,
3433 				nstate, ostate);
3434 # endif
3435 #else
3436 			abort();
3437 #endif
3438 			break;
3439 		}
3440 	}
3441 
3442 	/*
3443 	 * If rval == 2 then do not update the queue position, but treat the
3444 	 * packet as being ok.
3445 	 */
3446 	if (rval == 2)
3447 		rval = 1;
3448 	else if (rval == 1) {
		/*
		 * Record the new state for this direction and, unless the
		 * entry is flagged TQE_RULEBASED, move it to the timeout
		 * queue in tqtab that is indexed by the new state.
		 */
3449 		tqe->tqe_state[dir] = nstate;
3450 		if ((tqe->tqe_flags & TQE_RULEBASED) == 0)
3451 			fr_movequeue(tqe, tqe->tqe_ifq, tqtab + nstate, ifs);
3452 	}
3453 
3454 	return rval;
3455 }
3456 
3457 
3458 /* ------------------------------------------------------------------------ */
3459 /* Function:    ipstate_log                                                 */
3460 /* Returns:     Nil                                                         */
3461 /* Parameters:  is(I)   - pointer to state structure                        */
3462 /*              type(I) - type of log entry to create                       */
3463 /*                                                                          */
3464 /* Creates a state table log entry using the state structure and type info. */
3465 /* passed in.  Log packet/byte counts, source/destination address and other */
3466 /* protocol specific information.                                           */
3467 /* ------------------------------------------------------------------------ */
3468 void ipstate_log(is, type, ifs)
3469 struct ipstate *is;
3470 u_int type;
3471 ipf_stack_t *ifs;
3472 {
3473 #ifdef	IPFILTER_LOG
3474 	struct	ipslog	ipsl;
3475 	size_t sizes[1];
3476 	void *items[1];
3477 	int types[1];
3478 
3479 	/*
3480 	 * Copy information out of the ipstate_t structure and into the
3481 	 * structure used for logging.
3482 	 */
3483 	ipsl.isl_type = type;
3484 	ipsl.isl_pkts[0] = is->is_pkts[0] + is->is_icmppkts[0];
3485 	ipsl.isl_bytes[0] = is->is_bytes[0];
3486 	ipsl.isl_pkts[1] = is->is_pkts[1] + is->is_icmppkts[1];
3487 	ipsl.isl_bytes[1] = is->is_bytes[1];
3488 	ipsl.isl_pkts[2] = is->is_pkts[2] + is->is_icmppkts[2];
3489 	ipsl.isl_bytes[2] = is->is_bytes[2];
3490 	ipsl.isl_pkts[3] = is->is_pkts[3] + is->is_icmppkts[3];
3491 	ipsl.isl_bytes[3] = is->is_bytes[3];
3492 	ipsl.isl_src = is->is_src;
3493 	ipsl.isl_dst = is->is_dst;
3494 	ipsl.isl_p = is->is_p;
3495 	ipsl.isl_v = is->is_v;
3496 	ipsl.isl_flags = is->is_flags;
3497 	ipsl.isl_tag = is->is_tag;
3498 	ipsl.isl_rulen = is->is_rulen;
3499 	(void) strncpy(ipsl.isl_group, is->is_group, FR_GROUPLEN);
3500 
3501 	if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
3502 		ipsl.isl_sport = is->is_sport;
3503 		ipsl.isl_dport = is->is_dport;
3504 		if (ipsl.isl_p == IPPROTO_TCP) {
3505 			ipsl.isl_state[0] = is->is_state[0];
3506 			ipsl.isl_state[1] = is->is_state[1];
3507 		}
3508 	} else if (ipsl.isl_p == IPPROTO_ICMP) {
3509 		ipsl.isl_itype = is->is_icmp.ici_type;
3510 	} else if (ipsl.isl_p == IPPROTO_ICMPV6) {
3511 		ipsl.isl_itype = is->is_icmp.ici_type;
3512 	} else {
3513 		ipsl.isl_ps.isl_filler[0] = 0;
3514 		ipsl.isl_ps.isl_filler[1] = 0;
3515 	}
3516 
3517 	items[0] = &ipsl;
3518 	sizes[0] = sizeof(ipsl);
3519 	types[0] = 0;
3520 
3521 	if (ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1, ifs)) {
3522 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_logged);
3523 	} else {
3524 		ATOMIC_INCL(ifs->ifs_ips_stats.iss_logfail);
3525 	}
3526 #endif
3527 }
3528 
3529 
3530 #ifdef	USE_INET6
3531 /* ------------------------------------------------------------------------ */
3532 /* Function:    fr_checkicmp6matchingstate                                  */
3533 /* Returns:     ipstate_t* - NULL == no match found,                        */
3534 /*                           else  pointer to matching state entry          */
3535 /* Parameters:  fin(I) - pointer to packet information                      */
3536 /* Locks:       NULL == no locks, else Read Lock on ipf_state               */
3537 /*                                                                          */
3538 /* If we've got an ICMPv6 error message, using the information stored in    */
3539 /* the ICMPv6 packet, look for a matching state table entry.                */
/*                                                                          */
/* On a match fin_rev is set and the matching entry's is_icmppkts counter   */
/* is incremented.  The read lock on ipf_state taken for the lookup is NOT  */
/* released when a match is returned.                                       */
3540 /* ------------------------------------------------------------------------ */
3541 static ipstate_t *fr_checkicmp6matchingstate(fin)
3542 fr_info_t *fin;
3543 {
3544 	struct icmp6_hdr *ic6, *oic;
3545 	int backward, i;
3546 	ipstate_t *is, **isp;
3547 	u_short sport, dport;
3548 	i6addr_t dst, src;
3549 	u_short savelen;
3550 	icmpinfo_t *ic;
3551 	fr_info_t ofin;
3552 	tcphdr_t *tcp;
3553 	ip6_t *oip6;
3554 	u_char	pr;
3555 	u_int hv;
3556 	ipf_stack_t *ifs = fin->fin_ifs;
3557 
3558 	/*
3559 	 * Does it at least have the return (basic) IP header ?
3560 	 * Is it an actual recognised ICMP error type?
3561 	 * Only a basic IP header (no options) should be with
3562 	 * an ICMP error header.
3563 	 */
3564 	if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN) ||
3565 	    !(fin->fin_flx & FI_ICMPERR))
3566 		return NULL;
3567 
3568 	ic6 = fin->fin_dp;
3569 
	/* The embedded (offending) IPv6 header follows the ICMP header. */
3570 	oip6 = (ip6_t *)((char *)ic6 + ICMPERR_ICMPHLEN);
3571 	if (fin->fin_plen < sizeof(*oip6))
3572 		return NULL;
3573 
	/*
	 * Start ofin as a copy of fin, then flip the direction and clear
	 * the mbuf pointers.
	 */
3574 	bcopy((char *)fin, (char *)&ofin, sizeof(*fin));
3575 	ofin.fin_v = 6;
3576 	ofin.fin_ifp = fin->fin_ifp;
3577 	ofin.fin_out = !fin->fin_out;
3578 	ofin.fin_m = NULL;	/* if dereferenced, panic XXX */
3579 	ofin.fin_mp = NULL;	/* if dereferenced, panic XXX */
3580 
3581 	/*
3582 	 * We make a fin entry to be able to feed it to
3583 	 * matchsrcdst. Note that not all fields are necessary
3584 	 * but this is the cleanest way. Note further we fill
3585 	 * in fin_mp such that if someone uses it we'll get
3586 	 * a kernel panic. fr_matchsrcdst does not use this.
3587 	 *
3588 	 * watch out here, as ip is in host order and oip6 in network
3589 	 * order. Any change we make must be undone afterwards.
3590 	 */
	/*
	 * ip6_plen of the embedded header is overwritten only for the call
	 * to fr_makefrip() and restored from savelen right after it.
	 */
3591 	savelen = oip6->ip6_plen;
3592 	oip6->ip6_plen = fin->fin_dlen - ICMPERR_ICMPHLEN;
3593 	ofin.fin_flx = FI_NOCKSUM;
3594 	ofin.fin_ip = (ip_t *)oip6;
3595 	ofin.fin_plen = oip6->ip6_plen;
3596 	(void) fr_makefrip(sizeof(*oip6), (ip_t *)oip6, &ofin);
3597 	ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
3598 	oip6->ip6_plen = savelen;
3599 
3600 	if (oip6->ip6_nxt == IPPROTO_ICMPV6) {
3601 		oic = (struct icmp6_hdr *)(oip6 + 1);
3602 		/*
3603 		 * an ICMP error can only be generated as a result of an
3604 		 * ICMP query, not as the response on an ICMP error
3605 		 *
3606 		 * XXX theoretically ICMP_ECHOREP and the other reply's are
3607 		 * ICMP query's as well, but adding them here seems strange XXX
3608 		 */
3609 		 if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK))
3610 		    	return NULL;
3611 
3612 		/*
3613 		 * perform a lookup of the ICMP packet in the state table
3614 		 */
		/*
		 * NOTE(review): only the in4 member of each address is added
		 * to the hash here, unlike the i6[0..3] sum used further
		 * down; presumably this mirrors how ICMPv6 query state is
		 * hashed when it is created - confirm against the insert
		 * path.
		 */
3615 		hv = (pr = oip6->ip6_nxt);
3616 		src.in6 = oip6->ip6_src;
3617 		hv += src.in4.s_addr;
3618 		dst.in6 = oip6->ip6_dst;
3619 		hv += dst.in4.s_addr;
3620 		hv += oic->icmp6_id;
3621 		hv += oic->icmp6_seq;
3622 		hv = DOUBLE_HASH(hv, ifs);
3623 
3624 		READ_ENTER(&ifs->ifs_ipf_state);
		/*
		 * isp is advanced before the tests because the condition
		 * below overwrites is with fr_matchsrcdst()'s result, which
		 * is NULL when the entry does not match.
		 */
3625 		for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
3626 			ic = &is->is_icmp;
3627 			isp = &is->is_hnext;
3628 			if ((is->is_p == pr) &&
3629 			    !(is->is_pass & FR_NOICMPERR) &&
3630 			    (oic->icmp6_id == ic->ici_id) &&
3631 			    (oic->icmp6_seq == ic->ici_seq) &&
3632 			    (is = fr_matchsrcdst(&ofin, is, &src,
3633 						 &dst, NULL, FI_ICMPCMP))) {
3634 			    	/*
3635 			    	 * in the state table ICMP query's are stored
3636 			    	 * with the type of the corresponding ICMP
3637 			    	 * response. Correct here
3638 			    	 */
3639 				if (((ic->ici_type == ICMP6_ECHO_REPLY) &&
3640 				     (oic->icmp6_type == ICMP6_ECHO_REQUEST)) ||
3641 				     (ic->ici_type - 1 == oic->icmp6_type )) {
3642 				    	ifs->ifs_ips_stats.iss_hits++;
3643 					backward = IP6_NEQ(&is->is_dst, &src);
3644 					fin->fin_rev = !backward;
3645 					i = (backward << 1) + fin->fin_out;
3646     					is->is_icmppkts[i]++;
					/* Read lock is still held here. */
3647 					return is;
3648 				}
3649 			}
3650 		}
3651 		RWLOCK_EXIT(&ifs->ifs_ipf_state);
3652 		return NULL;
3653 	}
3654 
	/*
	 * Embedded packet is not ICMPv6: hash on the next header value and
	 * all four 32-bit words of both addresses, plus the ports when the
	 * embedded packet is TCP or UDP.
	 */
3655 	hv = (pr = oip6->ip6_nxt);
3656 	src.in6 = oip6->ip6_src;
3657 	hv += src.i6[0];
3658 	hv += src.i6[1];
3659 	hv += src.i6[2];
3660 	hv += src.i6[3];
3661 	dst.in6 = oip6->ip6_dst;
3662 	hv += dst.i6[0];
3663 	hv += dst.i6[1];
3664 	hv += dst.i6[2];
3665 	hv += dst.i6[3];
3666 
3667 	if ((oip6->ip6_nxt == IPPROTO_TCP) || (oip6->ip6_nxt == IPPROTO_UDP)) {
		/* Ports are read through tcphdr_t for both TCP and UDP. */
3668 		tcp = (tcphdr_t *)(oip6 + 1);
3669 		dport = tcp->th_dport;
3670 		sport = tcp->th_sport;
3671 		hv += dport;
3672 		hv += sport;
3673 	} else
3674 		tcp = NULL;
3675 	hv = DOUBLE_HASH(hv, ifs);
3676 
3677 	READ_ENTER(&ifs->ifs_ipf_state);
3678 	for (isp = &ifs->ifs_ips_table[hv]; ((is = *isp) != NULL); ) {
3679 		isp = &is->is_hnext;
3680 		/*
3681 		 * Only allow this icmp though if the
3682 		 * encapsulated packet was allowed through the
3683 		 * other way around. Note that the minimal amount
3684 		 * of info present does not allow for checking against
3685 		 * tcp internals such as seq and ack numbers.
3686 		 */
3687 		if ((is->is_p != pr) || (is->is_v != 6) ||
3688 		    (is->is_pass & FR_NOICMPERR))
3689 			continue;
3690 		is = fr_matchsrcdst(&ofin, is, &src, &dst, tcp, FI_ICMPCMP);
3691 		if (is != NULL) {
3692 			ifs->ifs_ips_stats.iss_hits++;
3693 			backward = IP6_NEQ(&is->is_dst, &src);
3694 			fin->fin_rev = !backward;
3695 			i = (backward << 1) + fin->fin_out;
3696 			is->is_icmppkts[i]++;
3697 			/*
3698 			 * we deliberately do not touch the timeouts
3699 			 * for the accompanying state table entry.
3700 			 * It remains to be seen if that is correct. XXX
3701 			 */
			/* Read lock is still held here. */
3702 			return is;
3703 		}
3704 	}
3705 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
3706 	return NULL;
3707 }
3708 #endif
3709 
3710 
3711 /* ------------------------------------------------------------------------ */
3712 /* Function:    fr_sttab_init                                               */
3713 /* Returns:     Nil                                                         */
3714 /* Parameters:  tqp(I) - pointer to an array of timeout queues for TCP      */
3715 /*                                                                          */
3716 /* Initialise the array of timeout queues for TCP.                          */
3717 /* ------------------------------------------------------------------------ */
3718 void fr_sttab_init(tqp, ifs)
3719 ipftq_t *tqp;
3720 ipf_stack_t *ifs;
3721 {
3722 	int i;
3723 
3724 	for (i = IPF_TCP_NSTATES - 1; i >= 0; i--) {
3725 		tqp[i].ifq_ttl = 0;
3726 		tqp[i].ifq_ref = 1;
3727 		tqp[i].ifq_head = NULL;
3728 		tqp[i].ifq_tail = &tqp[i].ifq_head;
3729 		tqp[i].ifq_next = tqp + i + 1;
3730 		MUTEX_INIT(&tqp[i].ifq_lock, "ipftq tcp tab");
3731 	}
3732 	tqp[IPF_TCP_NSTATES - 1].ifq_next = NULL;
3733 	tqp[IPF_TCPS_CLOSED].ifq_ttl = ifs->ifs_fr_tcpclosed;
3734 	tqp[IPF_TCPS_LISTEN].ifq_ttl = ifs->ifs_fr_tcptimeout;
3735 	tqp[IPF_TCPS_SYN_SENT].ifq_ttl = ifs->ifs_fr_tcptimeout;
3736 	tqp[IPF_TCPS_SYN_RECEIVED].ifq_ttl = ifs->ifs_fr_tcptimeout;
3737 	tqp[IPF_TCPS_ESTABLISHED].ifq_ttl = ifs->ifs_fr_tcpidletimeout;
3738 	tqp[IPF_TCPS_CLOSE_WAIT].ifq_ttl = ifs->ifs_fr_tcphalfclosed;
3739 	tqp[IPF_TCPS_FIN_WAIT_1].ifq_ttl = ifs->ifs_fr_tcphalfclosed;
3740 	tqp[IPF_TCPS_CLOSING].ifq_ttl = ifs->ifs_fr_tcptimeout;
3741 	tqp[IPF_TCPS_LAST_ACK].ifq_ttl = ifs->ifs_fr_tcplastack;
3742 	tqp[IPF_TCPS_FIN_WAIT_2].ifq_ttl = ifs->ifs_fr_tcpclosewait;
3743 	tqp[IPF_TCPS_TIME_WAIT].ifq_ttl = ifs->ifs_fr_tcptimeout;
3744 	tqp[IPF_TCPS_HALF_ESTAB].ifq_ttl = ifs->ifs_fr_tcptimeout;
3745 }
3746 
3747 
3748 /* ------------------------------------------------------------------------ */
3749 /* Function:    fr_sttab_destroy                                            */
3750 /* Returns:     Nil                                                         */
3751 /* Parameters:  tqp(I) - pointer to an array of timeout queues for TCP      */
3752 /*                                                                          */
3753 /* Do whatever is necessary to "destroy" each of the entries in the array   */
3754 /* of timeout queues for TCP.                                               */
3755 /* ------------------------------------------------------------------------ */
3756 void fr_sttab_destroy(tqp)
3757 ipftq_t *tqp;
3758 {
3759 	int i;
3760 
3761 	for (i = IPF_TCP_NSTATES - 1; i >= 0; i--)
3762 		MUTEX_DESTROY(&tqp[i].ifq_lock);
3763 }
3764 
3765 
3766 /* ------------------------------------------------------------------------ */
3767 /* Function:    fr_statederef                                               */
3768 /* Returns:     Nil                                                         */
3769 /* Parameters:  isp(I) - pointer to pointer to state table entry            */
3770 /*                                                                          */
3771 /* Decrement the reference counter for this state table entry and free it   */
3772 /* if there are no more things using it.                                    */
3773 /*                                                                          */
3774 /* When operating in userland (ipftest), we have no timers to clear a state */
3775 /* entry.  Therefore, we make a few simple tests before deleting an entry   */
3776 /* outright.  We compare states on each side looking for a combination of   */
3777 /* TIME_WAIT (should really be FIN_WAIT_2?) and LAST_ACK.  Then we factor   */
3778 /* in packet direction with the interface list to make sure we don't        */
3779 /* prematurely delete an entry on a final inbound packet that's we're also  */
3780 /* supposed to route elsewhere.                                             */
3781 /*                                                                          */
3782 /* Internal parameters:                                                     */
3783 /*    state[0] = state of source (host that initiated connection)           */
3784 /*    state[1] = state of dest   (host that accepted the connection)        */
3785 /*                                                                          */
3786 /*    dir == 0 : a packet from source to dest                               */
3787 /*    dir == 1 : a packet from dest to source                               */
3788 /* ------------------------------------------------------------------------ */
3789 void fr_statederef(fin, isp, ifs)
3790 fr_info_t *fin;
3791 ipstate_t **isp;
3792 ipf_stack_t *ifs;
3793 {
3794 	ipstate_t *is = *isp;
3795 #if 0
3796 	int nstate, ostate, dir, eol;
3797 
3798 	eol = 0; /* End-of-the-line flag. */
3799 	dir = fin->fin_rev;
3800 	ostate = is->is_state[1 - dir];
3801 	nstate = is->is_state[dir];
3802 	/*
3803 	 * Determine whether this packet is local or routed.  State entries
3804 	 * with us as the destination will have an interface list of
3805 	 * int1,-,-,int1.  Entries with us as the origin run as -,int1,int1,-.
3806 	 */
3807 	if ((fin->fin_p == IPPROTO_TCP) && (fin->fin_out == 0)) {
3808 		if ((strcmp(is->is_ifname[0], is->is_ifname[3]) == 0) &&
3809 		    (strcmp(is->is_ifname[1], is->is_ifname[2]) == 0)) {
3810 			if ((dir == 0) &&
3811 			    (strcmp(is->is_ifname[1], "-") == 0) &&
3812 			    (strcmp(is->is_ifname[0], "-") != 0)) {
3813 				eol = 1;
3814 			} else if ((dir == 1) &&
3815 				   (strcmp(is->is_ifname[0], "-") == 0) &&
3816 				   (strcmp(is->is_ifname[1], "-") != 0)) {
3817 				eol = 1;
3818 			}
3819 		}
3820 	}
3821 #endif
3822 
3823 	fin = fin;	/* LINT */
3824 	is = *isp;
3825 	*isp = NULL;
3826 	WRITE_ENTER(&ifs->ifs_ipf_state);
3827 	is->is_ref--;
3828 	if (is->is_ref == 0) {
3829 		is->is_ref++;		/* To counter ref-- in fr_delstate() */
3830 		fr_delstate(is, ISL_EXPIRE, ifs);
3831 #ifndef	_KERNEL
3832 #if 0
3833 	} else if (((fin->fin_out == 1) || (eol == 1)) &&
3834 		   ((ostate == IPF_TCPS_LAST_ACK) &&
3835 		   (nstate == IPF_TCPS_TIME_WAIT))) {
3836 		;
3837 #else
3838 	} else if ((is->is_sti.tqe_state[0] > IPF_TCPS_ESTABLISHED) ||
3839 		   (is->is_sti.tqe_state[1] > IPF_TCPS_ESTABLISHED)) {
3840 #endif
3841 		fr_delstate(is, ISL_ORPHAN, ifs);
3842 #endif
3843 	}
3844 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
3845 }
3846 
3847 
3848 /* ------------------------------------------------------------------------ */
3849 /* Function:    fr_setstatequeue                                            */
3850 /* Returns:     Nil                                                         */
3851 /* Parameters:  is(I) - pointer to state structure                          */
3852 /*              rev(I) - forward(0) or reverse(1) direction                 */
3853 /* Locks:       ipf_state (read or write)                                   */
3854 /*                                                                          */
3855 /* Put the state entry on its default queue entry, using rev as a helped in */
3856 /* determining which queue it should be placed on.                          */
3857 /* ------------------------------------------------------------------------ */
3858 void fr_setstatequeue(is, rev, ifs)
3859 ipstate_t *is;
3860 int rev;
3861 ipf_stack_t *ifs;
3862 {
3863 	ipftq_t *oifq, *nifq;
3864 
3865 
3866 	if ((is->is_sti.tqe_flags & TQE_RULEBASED) != 0)
3867 		nifq = is->is_tqehead[rev];
3868 	else
3869 		nifq = NULL;
3870 
3871 	if (nifq == NULL) {
3872 		switch (is->is_p)
3873 		{
3874 #ifdef USE_INET6
3875 		case IPPROTO_ICMPV6 :
3876 			if (rev == 1)
3877 				nifq = &ifs->ifs_ips_icmpacktq;
3878 			else
3879 				nifq = &ifs->ifs_ips_icmptq;
3880 			break;
3881 #endif
3882 		case IPPROTO_ICMP :
3883 			if (rev == 1)
3884 				nifq = &ifs->ifs_ips_icmpacktq;
3885 			else
3886 				nifq = &ifs->ifs_ips_icmptq;
3887 			break;
3888 		case IPPROTO_TCP :
3889 			nifq = ifs->ifs_ips_tqtqb + is->is_state[rev];
3890 			break;
3891 
3892 		case IPPROTO_UDP :
3893 			if (rev == 1)
3894 				nifq = &ifs->ifs_ips_udpacktq;
3895 			else
3896 				nifq = &ifs->ifs_ips_udptq;
3897 			break;
3898 
3899 		default :
3900 			nifq = &ifs->ifs_ips_iptq;
3901 			break;
3902 		}
3903 	}
3904 
3905 	oifq = is->is_sti.tqe_ifq;
3906 	/*
3907 	 * If it's currently on a timeout queue, move it from one queue to
3908 	 * another, else put it on the end of the newly determined queue.
3909 	 */
3910 	if (oifq != NULL)
3911 		fr_movequeue(&is->is_sti, oifq, nifq, ifs);
3912 	else
3913 		fr_queueappend(&is->is_sti, nifq, is, ifs);
3914 	return;
3915 }
3916 
3917 
3918 /* ------------------------------------------------------------------------ */
3919 /* Function:    fr_stateiter                                                */
3920 /* Returns:     int - 0 == success, else error                              */
3921 /* Parameters:  token(I) - pointer to ipftoken structure                    */
3922 /*              itp(I)   - pointer to ipfgeniter structure                  */
3923 /*                                                                          */
3924 /* This function handles the SIOCGENITER ioctl for the state tables and     */
3925 /* walks through the list of entries in the state table list (ips_list.)    */
3926 /* ------------------------------------------------------------------------ */
3927 static int fr_stateiter(token, itp, ifs)
3928 ipftoken_t *token;
3929 ipfgeniter_t *itp;
3930 ipf_stack_t *ifs;
3931 {
3932 	ipstate_t *is, *next, zero;
3933 	int error;
3934 
3935 	if (itp->igi_data == NULL)
3936 		return EFAULT;
3937 
3938 	if (itp->igi_type != IPFGENITER_STATE)
3939 		return EINVAL;
3940 
3941 	is = token->ipt_data;
3942 	if (is == (void *)-1) {
3943 		ipf_freetoken(token, ifs);
3944 		return ESRCH;
3945 	}
3946 
3947 	READ_ENTER(&ifs->ifs_ipf_state);
3948 	if (is == NULL) {
3949 		next = ifs->ifs_ips_list;
3950 	} else {
3951 		next = is->is_next;
3952 	}
3953 
3954 	if (next != NULL) {
3955 		/*
3956 		 * If we find a state entry to use, bump its reference count
3957 		 * so that it can be used for is_next when we come back.
3958 		 */
3959 		MUTEX_ENTER(&next->is_lock);
3960 		next->is_ref++;
3961 		MUTEX_EXIT(&next->is_lock);
3962 		token->ipt_data = next;
3963 	} else {
3964 		bzero(&zero, sizeof(zero));
3965 		next = &zero;
3966 		token->ipt_data = (void *)-1;
3967 	}
3968 	RWLOCK_EXIT(&ifs->ifs_ipf_state);
3969 
3970 	/*
3971 	 * If we had a prior pointer to a state entry, release it.
3972 	 */
3973 	if (is != NULL) {
3974 		fr_statederef(NULL, &is, ifs);
3975 	}
3976 
3977 	/*
3978 	 * This should arguably be via fr_outobj() so that the state
3979 	 * structure can (if required) be massaged going out.
3980 	 */
3981 	error = COPYOUT(next, itp->igi_data, sizeof(*next));
3982 	if (error != 0)
3983 		error = EFAULT;
3984 
3985 	return error;
3986 }
3987 
3988