xref: /freebsd/sys/netpfil/pf/if_pfsync.c (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND ISC)
3  *
4  * Copyright (c) 2002 Michael Shalayeff
5  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
21  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27  * THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 /*-
31  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
32  *
33  * Permission to use, copy, modify, and distribute this software for any
34  * purpose with or without fee is hereby granted, provided that the above
35  * copyright notice and this permission notice appear in all copies.
36  *
37  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
38  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
39  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
40  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
41  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
42  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
43  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
44  */
45 
46 /*
47  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
48  *
49  * Revisions picked from OpenBSD after revision 1.110 import:
50  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
51  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
52  * 1.120, 1.175 - use monotonic time_uptime
53  * 1.122 - reduce number of updates for non-TCP sessions
54  * 1.125, 1.127 - rewrite merge or stale processing
55  * 1.128 - cleanups
56  * 1.146 - bzero() mbuf before sparsely filling it with data
57  * 1.170 - SIOCSIFMTU checks
58  * 1.126, 1.142 - deferred packets processing
59  * 1.173 - correct expire time processing
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include "opt_inet.h"
66 #include "opt_inet6.h"
67 #include "opt_pf.h"
68 
69 #include <sys/param.h>
70 #include <sys/bus.h>
71 #include <sys/endian.h>
72 #include <sys/interrupt.h>
73 #include <sys/kernel.h>
74 #include <sys/lock.h>
75 #include <sys/mbuf.h>
76 #include <sys/module.h>
77 #include <sys/mutex.h>
78 #include <sys/priv.h>
79 #include <sys/protosw.h>
80 #include <sys/smp.h>
81 #include <sys/socket.h>
82 #include <sys/sockio.h>
83 #include <sys/sysctl.h>
84 #include <sys/syslog.h>
85 
86 #include <net/bpf.h>
87 #include <net/if.h>
88 #include <net/if_var.h>
89 #include <net/if_clone.h>
90 #include <net/if_types.h>
91 #include <net/vnet.h>
92 #include <net/pfvar.h>
93 #include <net/if_pfsync.h>
94 
95 #include <netinet/if_ether.h>
96 #include <netinet/in.h>
97 #include <netinet/in_var.h>
98 #include <netinet/ip.h>
99 #include <netinet/ip_carp.h>
100 #include <netinet/ip_var.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_fsm.h>
103 #include <netinet/tcp_seq.h>
104 
/*
 * Combined size of the IP header, the pfsync header and one pfsync
 * subheader.  Every bucket's b_len is initialised to this value in
 * pfsync_clone_create().
 */
105 #define PFSYNC_MINPKT ( \
106 	sizeof(struct ip) + \
107 	sizeof(struct pfsync_header) + \
108 	sizeof(struct pfsync_subheader) )
109 
/* Defined further down; needed early by the prototypes that follow. */
110 struct pfsync_bucket;
111 
/*
 * Per-packet context that pfsync_input() fills in once and hands to
 * every action handler.
 */
112 struct pfsync_pkt {
113 	struct ip *ip;		/* IP header of the received packet */
114 	struct in_addr src;	/* copy of ip->ip_src */
115 	u_int8_t flags;		/* PFSYNC_SI_* flags, e.g. PFSYNC_SI_CKSUM */
116 };
117 
/*
 * Merges a received pair of TCP peers into an existing, locked state and
 * returns how many of the two peers were NOT taken over because the
 * local copy is ahead.
 */
118 static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
119 		    struct pfsync_state_peer *);
/*
 * Input handlers, one per pfsync action.  Arguments are the packet
 * context, the mbuf, the offset of the handler's payload inside the mbuf
 * and the message count taken from the subheader.  pfsync_input() adds
 * the return value to the offset; a return of -1 makes it stop without
 * freeing the mbuf.
 */
120 static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
121 static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
122 static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
123 static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
124 static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
125 static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
126 static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
127 static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
128 static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
129 static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
130 static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
131 static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
132 
/*
 * Dispatch table indexed by the subheader's action code.  pfsync_input()
 * rejects any action >= PFSYNC_ACT_MAX before indexing, so the order of
 * the entries must follow the PFSYNC_ACT_* numbering noted on each line.
 */
133 static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
134 	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
135 	pfsync_in_ins,			/* PFSYNC_ACT_INS */
136 	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
137 	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
138 	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
139 	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
140 	pfsync_in_del,			/* PFSYNC_ACT_DEL */
141 	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
142 	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
143 	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
144 	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
145 	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
146 	pfsync_in_eof			/* PFSYNC_ACT_EOF */
147 };
148 
/*
 * Description of one outgoing queue type: the routine that writes a
 * state into a message buffer, the size of that message and the action
 * code it is sent under.
 */
149 struct pfsync_q {
150 	void		(*write)(struct pf_state *, void *);
151 	size_t		len;
152 	u_int8_t	action;
153 };
154 
155 /* we have one of these for every PFSYNC_S_ */
156 static void	pfsync_out_state(struct pf_state *, void *);
157 static void	pfsync_out_iack(struct pf_state *, void *);
158 static void	pfsync_out_upd_c(struct pf_state *, void *);
159 static void	pfsync_out_del(struct pf_state *, void *);
160 
/*
 * Queue descriptors.  NOTE(review): presumably indexed by the PFSYNC_S_*
 * queue number, matching b_qs[] in struct pfsync_bucket -- confirm
 * against the users of this table.
 */
161 static struct pfsync_q pfsync_qs[] = {
162 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
163 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
164 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
165 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
166 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
167 };
168 
/* Queue manipulation and state update helpers, defined later in the file. */
169 static void	pfsync_q_ins(struct pf_state *, int, bool);
170 static void	pfsync_q_del(struct pf_state *, bool, struct pfsync_bucket *);
171 
172 static void	pfsync_update_state(struct pf_state *);
173 
/* One pending update request, linked on a bucket's b_upd_req_list. */
174 struct pfsync_upd_req_item {
175 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
176 	struct pfsync_upd_req			ur_msg;
177 };
178 
/*
 * A deferred packet, linked on a bucket's b_deferrals list.  It ties
 * together a state (pd_st), the mbuf being held back (pd_m) and the
 * callout that times the deferral out (pd_tmo).
 */
179 struct pfsync_deferral {
180 	struct pfsync_softc		*pd_sc;
181 	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
182 	u_int				pd_refs;
183 	struct callout			pd_tmo;
184 
185 	struct pf_state			*pd_st;
186 	struct mbuf			*pd_m;
187 };
188 
189 struct pfsync_sofct;
190 
191 struct pfsync_bucket
192 {
193 	int			b_id;
194 	struct pfsync_softc	*b_sc;
195 	struct mtx		b_mtx;
196 	struct callout		b_tmo;
197 	int			b_flags;
198 #define	PFSYNCF_BUCKET_PUSH	0x00000001
199 
200 	size_t			b_len;
201 	TAILQ_HEAD(, pf_state)			b_qs[PFSYNC_S_COUNT];
202 	TAILQ_HEAD(, pfsync_upd_req_item)	b_upd_req_list;
203 	TAILQ_HEAD(, pfsync_deferral)		b_deferrals;
204 	u_int			b_deferred;
205 	void			*b_plus;
206 	size_t			b_pluslen;
207 
208 	struct  ifaltq b_snd;
209 };
210 
/*
 * Per-interface pfsync state.  pfsync_clone_create() only accepts unit 0
 * and publishes the resulting softc through V_pfsyncif.
 */
211 struct pfsync_softc {
212 	/* Configuration */
213 	struct ifnet		*sc_ifp;
214 	struct ifnet		*sc_sync_if;
215 	struct ip_moptions	sc_imo;
216 	struct in_addr		sc_sync_peer;
217 	uint32_t		sc_flags;
218 #define	PFSYNCF_OK		0x00000001
219 #define	PFSYNCF_DEFER		0x00000002
220 	uint8_t			sc_maxupdates;
221 	struct ip		sc_template;
222 	struct mtx		sc_mtx;
223 
224 	/* Queued data */
225 	struct pfsync_bucket	*sc_buckets;
226 
227 	/* Bulk update info */
228 	struct mtx		sc_bulk_mtx;
229 	uint32_t		sc_ureq_sent;
230 	int			sc_bulk_tries;
231 	uint32_t		sc_ureq_received;
232 	int			sc_bulk_hashid;
233 	uint64_t		sc_bulk_stateid;
234 	uint32_t		sc_bulk_creatorid;
235 	struct callout		sc_bulk_tmo;
236 	struct callout		sc_bulkfail_tmo;
237 };
238 
/* Lock, unlock and assert ownership of the softc mutex. */
239 #define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
240 #define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
241 #define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
242 
/* The same three operations on a single bucket's mutex. */
243 #define PFSYNC_BUCKET_LOCK(b)		mtx_lock(&(b)->b_mtx)
244 #define PFSYNC_BUCKET_UNLOCK(b)		mtx_unlock(&(b)->b_mtx)
245 #define PFSYNC_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->b_mtx, MA_OWNED)
246 
/*
 * The same three operations on the bulk-update mutex, which is also the
 * mutex both bulk callouts are initialised with.
 */
247 #define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
248 #define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
249 #define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
250 
/* Driver name, also used as the malloc type name and softc mutex name. */
251 static const char pfsyncname[] = "pfsync";
252 static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
/* Per-vnet globals, each with its V_ accessor macro. */
253 VNET_DEFINE_STATIC(struct pfsync_softc	*, pfsyncif) = NULL;
254 #define	V_pfsyncif		VNET(pfsyncif)
255 VNET_DEFINE_STATIC(void *, pfsync_swi_cookie) = NULL;
256 #define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
257 VNET_DEFINE_STATIC(struct pfsyncstats, pfsyncstats);
258 #define	V_pfsyncstats		VNET(pfsyncstats)
259 VNET_DEFINE_STATIC(int, pfsync_carp_adj) = CARP_MAXSKEW;
260 #define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
261 
262 static void	pfsync_timeout(void *);
263 static void	pfsync_push(struct pfsync_bucket *);
264 static void	pfsync_push_all(struct pfsync_softc *);
265 static void	pfsyncintr(void *);
266 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
267 		    struct in_mfilter *imf);
268 static void	pfsync_multicast_cleanup(struct pfsync_softc *);
269 static void	pfsync_pointers_init(void);
270 static void	pfsync_pointers_uninit(void);
271 static int	pfsync_init(void);
272 static void	pfsync_uninit(void);
273 
/*
 * Number of buckets per softc.  When still zero at interface creation,
 * pfsync_clone_create() sets it to twice the CPU count.
 */
274 static unsigned long pfsync_buckets;
275 
276 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
277     "PFSYNC");
278 SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
279     &VNET_NAME(pfsyncstats), pfsyncstats,
280     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
281 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
282     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
283 SYSCTL_ULONG(_net_pfsync, OID_AUTO, pfsync_buckets, CTLFLAG_RDTUN,
284     &pfsync_buckets, 0, "Number of pfsync hash buckets");
285 
/* Interface cloning, output and ioctl entry points. */
286 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
287 static void	pfsync_clone_destroy(struct ifnet *);
288 static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
289 		    struct pf_state_peer *);
290 static int	pfsyncoutput(struct ifnet *, struct mbuf *,
291 		    const struct sockaddr *, struct route *);
292 static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
293 
/* Deferred packet handling. */
294 static int	pfsync_defer(struct pf_state *, struct mbuf *);
295 static void	pfsync_undefer(struct pfsync_deferral *, int);
296 static void	pfsync_undefer_state(struct pf_state *, int);
297 static void	pfsync_defer_tmo(void *);
298 
299 static void	pfsync_request_update(u_int32_t, u_int64_t);
300 static bool	pfsync_update_state_req(struct pf_state *);
301 
302 static void	pfsync_drop(struct pfsync_softc *);
303 static void	pfsync_sendout(int, int);
304 static void	pfsync_send_plus(void *, size_t);
305 
/* Bulk update machinery. */
306 static void	pfsync_bulk_start(void);
307 static void	pfsync_bulk_status(u_int8_t);
308 static void	pfsync_bulk_update(void *);
309 static void	pfsync_bulk_fail(void *);
310 
311 static void	pfsync_detach_ifnet(struct ifnet *);
312 #ifdef IPSEC
313 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
314 #endif
315 static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
316 		    struct pf_state *);
317 
318 #define PFSYNC_MAX_BULKTRIES	12
319 
/* Per-vnet interface cloner handle. */
320 VNET_DEFINE(struct if_clone *, pfsync_cloner);
321 #define	V_pfsync_cloner	VNET(pfsync_cloner)
322 
323 static int
324 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
325 {
326 	struct pfsync_softc *sc;
327 	struct ifnet *ifp;
328 	struct pfsync_bucket *b;
329 	int c, q;
330 
331 	if (unit != 0)
332 		return (EINVAL);
333 
334 	if (! pfsync_buckets)
335 		pfsync_buckets = mp_ncpus * 2;
336 
337 	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
338 	sc->sc_flags |= PFSYNCF_OK;
339 	sc->sc_maxupdates = 128;
340 
341 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
342 	if (ifp == NULL) {
343 		free(sc, M_PFSYNC);
344 		return (ENOSPC);
345 	}
346 	if_initname(ifp, pfsyncname, unit);
347 	ifp->if_softc = sc;
348 	ifp->if_ioctl = pfsyncioctl;
349 	ifp->if_output = pfsyncoutput;
350 	ifp->if_type = IFT_PFSYNC;
351 	ifp->if_hdrlen = sizeof(struct pfsync_header);
352 	ifp->if_mtu = ETHERMTU;
353 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
354 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
355 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
356 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
357 
358 	if_attach(ifp);
359 
360 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
361 
362 	sc->sc_buckets = mallocarray(pfsync_buckets, sizeof(*sc->sc_buckets),
363 	    M_PFSYNC, M_ZERO | M_WAITOK);
364 	for (c = 0; c < pfsync_buckets; c++) {
365 		b = &sc->sc_buckets[c];
366 		mtx_init(&b->b_mtx, "pfsync bucket", NULL, MTX_DEF);
367 
368 		b->b_id = c;
369 		b->b_sc = sc;
370 		b->b_len = PFSYNC_MINPKT;
371 
372 		for (q = 0; q < PFSYNC_S_COUNT; q++)
373 			TAILQ_INIT(&b->b_qs[q]);
374 
375 		TAILQ_INIT(&b->b_upd_req_list);
376 		TAILQ_INIT(&b->b_deferrals);
377 
378 		callout_init(&b->b_tmo, 1);
379 
380 		b->b_snd.ifq_maxlen = ifqmaxlen;
381 	}
382 
383 	V_pfsyncif = sc;
384 
385 	return (0);
386 }
387 
388 static void
389 pfsync_clone_destroy(struct ifnet *ifp)
390 {
391 	struct pfsync_softc *sc = ifp->if_softc;
392 	struct pfsync_bucket *b;
393 	int c;
394 
395 	for (c = 0; c < pfsync_buckets; c++) {
396 		b = &sc->sc_buckets[c];
397 		/*
398 		 * At this stage, everything should have already been
399 		 * cleared by pfsync_uninit(), and we have only to
400 		 * drain callouts.
401 		 */
402 		while (b->b_deferred > 0) {
403 			struct pfsync_deferral *pd =
404 			    TAILQ_FIRST(&b->b_deferrals);
405 
406 			TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
407 			b->b_deferred--;
408 			if (callout_stop(&pd->pd_tmo) > 0) {
409 				pf_release_state(pd->pd_st);
410 				m_freem(pd->pd_m);
411 				free(pd, M_PFSYNC);
412 			} else {
413 				pd->pd_refs++;
414 				callout_drain(&pd->pd_tmo);
415 				free(pd, M_PFSYNC);
416 			}
417 		}
418 
419 		callout_drain(&b->b_tmo);
420 	}
421 
422 	callout_drain(&sc->sc_bulkfail_tmo);
423 	callout_drain(&sc->sc_bulk_tmo);
424 
425 	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
426 		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
427 	bpfdetach(ifp);
428 	if_detach(ifp);
429 
430 	pfsync_drop(sc);
431 
432 	if_free(ifp);
433 	pfsync_multicast_cleanup(sc);
434 	mtx_destroy(&sc->sc_mtx);
435 	mtx_destroy(&sc->sc_bulk_mtx);
436 
437 	free(sc->sc_buckets, M_PFSYNC);
438 	free(sc, M_PFSYNC);
439 
440 	V_pfsyncif = NULL;
441 }
442 
443 static int
444 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
445     struct pf_state_peer *d)
446 {
447 	if (s->scrub.scrub_flag && d->scrub == NULL) {
448 		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
449 		if (d->scrub == NULL)
450 			return (ENOMEM);
451 	}
452 
453 	return (0);
454 }
455 
456 static int
457 pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
458 {
459 	struct pfsync_softc *sc = V_pfsyncif;
460 #ifndef	__NO_STRICT_ALIGNMENT
461 	struct pfsync_state_key key[2];
462 #endif
463 	struct pfsync_state_key *kw, *ks;
464 	struct pf_state	*st = NULL;
465 	struct pf_state_key *skw = NULL, *sks = NULL;
466 	struct pf_rule *r = NULL;
467 	struct pfi_kif	*kif;
468 	int error;
469 
470 	PF_RULES_RASSERT();
471 
472 	if (sp->creatorid == 0) {
473 		if (V_pf_status.debug >= PF_DEBUG_MISC)
474 			printf("%s: invalid creator id: %08x\n", __func__,
475 			    ntohl(sp->creatorid));
476 		return (EINVAL);
477 	}
478 
479 	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
480 		if (V_pf_status.debug >= PF_DEBUG_MISC)
481 			printf("%s: unknown interface: %s\n", __func__,
482 			    sp->ifname);
483 		if (flags & PFSYNC_SI_IOCTL)
484 			return (EINVAL);
485 		return (0);	/* skip this state */
486 	}
487 
488 	/*
489 	 * If the ruleset checksums match or the state is coming from the ioctl,
490 	 * it's safe to associate the state with the rule of that number.
491 	 */
492 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
493 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
494 	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
495 		r = pf_main_ruleset.rules[
496 		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
497 	else
498 		r = &V_pf_default_rule;
499 
500 	if ((r->max_states &&
501 	    counter_u64_fetch(r->states_cur) >= r->max_states))
502 		goto cleanup;
503 
504 	/*
505 	 * XXXGL: consider M_WAITOK in ioctl path after.
506 	 */
507 	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
508 		goto cleanup;
509 
510 	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
511 		goto cleanup;
512 
513 #ifndef	__NO_STRICT_ALIGNMENT
514 	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
515 	kw = &key[PF_SK_WIRE];
516 	ks = &key[PF_SK_STACK];
517 #else
518 	kw = &sp->key[PF_SK_WIRE];
519 	ks = &sp->key[PF_SK_STACK];
520 #endif
521 
522 	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
523 	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
524 	    kw->port[0] != ks->port[0] ||
525 	    kw->port[1] != ks->port[1]) {
526 		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
527 		if (sks == NULL)
528 			goto cleanup;
529 	} else
530 		sks = skw;
531 
532 	/* allocate memory for scrub info */
533 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
534 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
535 		goto cleanup;
536 
537 	/* Copy to state key(s). */
538 	skw->addr[0] = kw->addr[0];
539 	skw->addr[1] = kw->addr[1];
540 	skw->port[0] = kw->port[0];
541 	skw->port[1] = kw->port[1];
542 	skw->proto = sp->proto;
543 	skw->af = sp->af;
544 	if (sks != skw) {
545 		sks->addr[0] = ks->addr[0];
546 		sks->addr[1] = ks->addr[1];
547 		sks->port[0] = ks->port[0];
548 		sks->port[1] = ks->port[1];
549 		sks->proto = sp->proto;
550 		sks->af = sp->af;
551 	}
552 
553 	/* copy to state */
554 	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
555 	st->creation = time_uptime - ntohl(sp->creation);
556 	st->expire = time_uptime;
557 	if (sp->expire) {
558 		uint32_t timeout;
559 
560 		timeout = r->timeout[sp->timeout];
561 		if (!timeout)
562 			timeout = V_pf_default_rule.timeout[sp->timeout];
563 
564 		/* sp->expire may have been adaptively scaled by export. */
565 		st->expire -= timeout - ntohl(sp->expire);
566 	}
567 
568 	st->direction = sp->direction;
569 	st->log = sp->log;
570 	st->timeout = sp->timeout;
571 	st->state_flags = sp->state_flags;
572 
573 	st->id = sp->id;
574 	st->creatorid = sp->creatorid;
575 	pf_state_peer_ntoh(&sp->src, &st->src);
576 	pf_state_peer_ntoh(&sp->dst, &st->dst);
577 
578 	st->rule.ptr = r;
579 	st->nat_rule.ptr = NULL;
580 	st->anchor.ptr = NULL;
581 	st->rt_kif = NULL;
582 
583 	st->pfsync_time = time_uptime;
584 	st->sync_state = PFSYNC_S_NONE;
585 
586 	if (!(flags & PFSYNC_SI_IOCTL))
587 		st->state_flags |= PFSTATE_NOSYNC;
588 
589 	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
590 		goto cleanup_state;
591 
592 	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
593 	counter_u64_add(r->states_cur, 1);
594 	counter_u64_add(r->states_tot, 1);
595 
596 	if (!(flags & PFSYNC_SI_IOCTL)) {
597 		st->state_flags &= ~PFSTATE_NOSYNC;
598 		if (st->state_flags & PFSTATE_ACK) {
599 			pfsync_q_ins(st, PFSYNC_S_IACK, true);
600 			pfsync_push_all(sc);
601 		}
602 	}
603 	st->state_flags &= ~PFSTATE_ACK;
604 	PF_STATE_UNLOCK(st);
605 
606 	return (0);
607 
608 cleanup:
609 	error = ENOMEM;
610 	if (skw == sks)
611 		sks = NULL;
612 	if (skw != NULL)
613 		uma_zfree(V_pf_state_key_z, skw);
614 	if (sks != NULL)
615 		uma_zfree(V_pf_state_key_z, sks);
616 
617 cleanup_state:	/* pf_state_insert() frees the state keys. */
618 	if (st) {
619 		if (st->dst.scrub)
620 			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
621 		if (st->src.scrub)
622 			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
623 		uma_zfree(V_pf_state_z, st);
624 	}
625 	return (error);
626 }
627 
628 static int
629 pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
630 {
631 	struct pfsync_softc *sc = V_pfsyncif;
632 	struct pfsync_pkt pkt;
633 	struct mbuf *m = *mp;
634 	struct ip *ip = mtod(m, struct ip *);
635 	struct pfsync_header *ph;
636 	struct pfsync_subheader subh;
637 
638 	int offset, len;
639 	int rv;
640 	uint16_t count;
641 
642 	PF_RULES_RLOCK_TRACKER;
643 
644 	*mp = NULL;
645 	V_pfsyncstats.pfsyncs_ipackets++;
646 
647 	/* Verify that we have a sync interface configured. */
648 	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
649 	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
650 		goto done;
651 
652 	/* verify that the packet came in on the right interface */
653 	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
654 		V_pfsyncstats.pfsyncs_badif++;
655 		goto done;
656 	}
657 
658 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
659 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
660 	/* verify that the IP TTL is 255. */
661 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
662 		V_pfsyncstats.pfsyncs_badttl++;
663 		goto done;
664 	}
665 
666 	offset = ip->ip_hl << 2;
667 	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
668 		V_pfsyncstats.pfsyncs_hdrops++;
669 		goto done;
670 	}
671 
672 	if (offset + sizeof(*ph) > m->m_len) {
673 		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
674 			V_pfsyncstats.pfsyncs_hdrops++;
675 			return (IPPROTO_DONE);
676 		}
677 		ip = mtod(m, struct ip *);
678 	}
679 	ph = (struct pfsync_header *)((char *)ip + offset);
680 
681 	/* verify the version */
682 	if (ph->version != PFSYNC_VERSION) {
683 		V_pfsyncstats.pfsyncs_badver++;
684 		goto done;
685 	}
686 
687 	len = ntohs(ph->len) + offset;
688 	if (m->m_pkthdr.len < len) {
689 		V_pfsyncstats.pfsyncs_badlen++;
690 		goto done;
691 	}
692 
693 	/* Cheaper to grab this now than having to mess with mbufs later */
694 	pkt.ip = ip;
695 	pkt.src = ip->ip_src;
696 	pkt.flags = 0;
697 
698 	/*
699 	 * Trusting pf_chksum during packet processing, as well as seeking
700 	 * in interface name tree, require holding PF_RULES_RLOCK().
701 	 */
702 	PF_RULES_RLOCK();
703 	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
704 		pkt.flags |= PFSYNC_SI_CKSUM;
705 
706 	offset += sizeof(*ph);
707 	while (offset <= len - sizeof(subh)) {
708 		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
709 		offset += sizeof(subh);
710 
711 		if (subh.action >= PFSYNC_ACT_MAX) {
712 			V_pfsyncstats.pfsyncs_badact++;
713 			PF_RULES_RUNLOCK();
714 			goto done;
715 		}
716 
717 		count = ntohs(subh.count);
718 		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
719 		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
720 		if (rv == -1) {
721 			PF_RULES_RUNLOCK();
722 			return (IPPROTO_DONE);
723 		}
724 
725 		offset += rv;
726 	}
727 	PF_RULES_RUNLOCK();
728 
729 done:
730 	m_freem(m);
731 	return (IPPROTO_DONE);
732 }
733 
734 static int
735 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
736 {
737 	struct pfsync_clr *clr;
738 	struct mbuf *mp;
739 	int len = sizeof(*clr) * count;
740 	int i, offp;
741 	u_int32_t creatorid;
742 
743 	mp = m_pulldown(m, offset, len, &offp);
744 	if (mp == NULL) {
745 		V_pfsyncstats.pfsyncs_badlen++;
746 		return (-1);
747 	}
748 	clr = (struct pfsync_clr *)(mp->m_data + offp);
749 
750 	for (i = 0; i < count; i++) {
751 		creatorid = clr[i].creatorid;
752 
753 		if (clr[i].ifname[0] != '\0' &&
754 		    pfi_kif_find(clr[i].ifname) == NULL)
755 			continue;
756 
757 		for (int i = 0; i <= pf_hashmask; i++) {
758 			struct pf_idhash *ih = &V_pf_idhash[i];
759 			struct pf_state *s;
760 relock:
761 			PF_HASHROW_LOCK(ih);
762 			LIST_FOREACH(s, &ih->states, entry) {
763 				if (s->creatorid == creatorid) {
764 					s->state_flags |= PFSTATE_NOSYNC;
765 					pf_unlink_state(s, PF_ENTER_LOCKED);
766 					goto relock;
767 				}
768 			}
769 			PF_HASHROW_UNLOCK(ih);
770 		}
771 	}
772 
773 	return (len);
774 }
775 
776 static int
777 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
778 {
779 	struct mbuf *mp;
780 	struct pfsync_state *sa, *sp;
781 	int len = sizeof(*sp) * count;
782 	int i, offp;
783 
784 	mp = m_pulldown(m, offset, len, &offp);
785 	if (mp == NULL) {
786 		V_pfsyncstats.pfsyncs_badlen++;
787 		return (-1);
788 	}
789 	sa = (struct pfsync_state *)(mp->m_data + offp);
790 
791 	for (i = 0; i < count; i++) {
792 		sp = &sa[i];
793 
794 		/* Check for invalid values. */
795 		if (sp->timeout >= PFTM_MAX ||
796 		    sp->src.state > PF_TCPS_PROXY_DST ||
797 		    sp->dst.state > PF_TCPS_PROXY_DST ||
798 		    sp->direction > PF_OUT ||
799 		    (sp->af != AF_INET && sp->af != AF_INET6)) {
800 			if (V_pf_status.debug >= PF_DEBUG_MISC)
801 				printf("%s: invalid value\n", __func__);
802 			V_pfsyncstats.pfsyncs_badval++;
803 			continue;
804 		}
805 
806 		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
807 			/* Drop out, but process the rest of the actions. */
808 			break;
809 	}
810 
811 	return (len);
812 }
813 
814 static int
815 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
816 {
817 	struct pfsync_ins_ack *ia, *iaa;
818 	struct pf_state *st;
819 
820 	struct mbuf *mp;
821 	int len = count * sizeof(*ia);
822 	int offp, i;
823 
824 	mp = m_pulldown(m, offset, len, &offp);
825 	if (mp == NULL) {
826 		V_pfsyncstats.pfsyncs_badlen++;
827 		return (-1);
828 	}
829 	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
830 
831 	for (i = 0; i < count; i++) {
832 		ia = &iaa[i];
833 
834 		st = pf_find_state_byid(ia->id, ia->creatorid);
835 		if (st == NULL)
836 			continue;
837 
838 		if (st->state_flags & PFSTATE_ACK) {
839 			pfsync_undefer_state(st, 0);
840 		}
841 		PF_STATE_UNLOCK(st);
842 	}
843 	/*
844 	 * XXX this is not yet implemented, but we know the size of the
845 	 * message so we can skip it.
846 	 */
847 
848 	return (count * sizeof(struct pfsync_ins_ack));
849 }
850 
851 static int
852 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
853     struct pfsync_state_peer *dst)
854 {
855 	int sync = 0;
856 
857 	PF_STATE_LOCK_ASSERT(st);
858 
859 	/*
860 	 * The state should never go backwards except
861 	 * for syn-proxy states.  Neither should the
862 	 * sequence window slide backwards.
863 	 */
864 	if ((st->src.state > src->state &&
865 	    (st->src.state < PF_TCPS_PROXY_SRC ||
866 	    src->state >= PF_TCPS_PROXY_SRC)) ||
867 
868 	    (st->src.state == src->state &&
869 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
870 		sync++;
871 	else
872 		pf_state_peer_ntoh(src, &st->src);
873 
874 	if ((st->dst.state > dst->state) ||
875 
876 	    (st->dst.state >= TCPS_SYN_SENT &&
877 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
878 		sync++;
879 	else
880 		pf_state_peer_ntoh(dst, &st->dst);
881 
882 	return (sync);
883 }
884 
885 static int
886 pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
887 {
888 	struct pfsync_softc *sc = V_pfsyncif;
889 	struct pfsync_state *sa, *sp;
890 	struct pf_state *st;
891 	int sync;
892 
893 	struct mbuf *mp;
894 	int len = count * sizeof(*sp);
895 	int offp, i;
896 
897 	mp = m_pulldown(m, offset, len, &offp);
898 	if (mp == NULL) {
899 		V_pfsyncstats.pfsyncs_badlen++;
900 		return (-1);
901 	}
902 	sa = (struct pfsync_state *)(mp->m_data + offp);
903 
904 	for (i = 0; i < count; i++) {
905 		sp = &sa[i];
906 
907 		/* check for invalid values */
908 		if (sp->timeout >= PFTM_MAX ||
909 		    sp->src.state > PF_TCPS_PROXY_DST ||
910 		    sp->dst.state > PF_TCPS_PROXY_DST) {
911 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
912 				printf("pfsync_input: PFSYNC_ACT_UPD: "
913 				    "invalid value\n");
914 			}
915 			V_pfsyncstats.pfsyncs_badval++;
916 			continue;
917 		}
918 
919 		st = pf_find_state_byid(sp->id, sp->creatorid);
920 		if (st == NULL) {
921 			/* insert the update */
922 			if (pfsync_state_import(sp, pkt->flags))
923 				V_pfsyncstats.pfsyncs_badstate++;
924 			continue;
925 		}
926 
927 		if (st->state_flags & PFSTATE_ACK) {
928 			pfsync_undefer_state(st, 1);
929 		}
930 
931 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
932 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
933 		else {
934 			sync = 0;
935 
936 			/*
937 			 * Non-TCP protocol state machine always go
938 			 * forwards
939 			 */
940 			if (st->src.state > sp->src.state)
941 				sync++;
942 			else
943 				pf_state_peer_ntoh(&sp->src, &st->src);
944 			if (st->dst.state > sp->dst.state)
945 				sync++;
946 			else
947 				pf_state_peer_ntoh(&sp->dst, &st->dst);
948 		}
949 		if (sync < 2) {
950 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
951 			pf_state_peer_ntoh(&sp->dst, &st->dst);
952 			st->expire = time_uptime;
953 			st->timeout = sp->timeout;
954 		}
955 		st->pfsync_time = time_uptime;
956 
957 		if (sync) {
958 			V_pfsyncstats.pfsyncs_stale++;
959 
960 			pfsync_update_state(st);
961 			PF_STATE_UNLOCK(st);
962 			pfsync_push_all(sc);
963 			continue;
964 		}
965 		PF_STATE_UNLOCK(st);
966 	}
967 
968 	return (len);
969 }
970 
971 static int
972 pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
973 {
974 	struct pfsync_softc *sc = V_pfsyncif;
975 	struct pfsync_upd_c *ua, *up;
976 	struct pf_state *st;
977 	int len = count * sizeof(*up);
978 	int sync;
979 	struct mbuf *mp;
980 	int offp, i;
981 
982 	mp = m_pulldown(m, offset, len, &offp);
983 	if (mp == NULL) {
984 		V_pfsyncstats.pfsyncs_badlen++;
985 		return (-1);
986 	}
987 	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
988 
989 	for (i = 0; i < count; i++) {
990 		up = &ua[i];
991 
992 		/* check for invalid values */
993 		if (up->timeout >= PFTM_MAX ||
994 		    up->src.state > PF_TCPS_PROXY_DST ||
995 		    up->dst.state > PF_TCPS_PROXY_DST) {
996 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
997 				printf("pfsync_input: "
998 				    "PFSYNC_ACT_UPD_C: "
999 				    "invalid value\n");
1000 			}
1001 			V_pfsyncstats.pfsyncs_badval++;
1002 			continue;
1003 		}
1004 
1005 		st = pf_find_state_byid(up->id, up->creatorid);
1006 		if (st == NULL) {
1007 			/* We don't have this state. Ask for it. */
1008 			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
1009 			pfsync_request_update(up->creatorid, up->id);
1010 			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
1011 			continue;
1012 		}
1013 
1014 		if (st->state_flags & PFSTATE_ACK) {
1015 			pfsync_undefer_state(st, 1);
1016 		}
1017 
1018 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1019 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
1020 		else {
1021 			sync = 0;
1022 
1023 			/*
1024 			 * Non-TCP protocol state machine always go
1025 			 * forwards
1026 			 */
1027 			if (st->src.state > up->src.state)
1028 				sync++;
1029 			else
1030 				pf_state_peer_ntoh(&up->src, &st->src);
1031 			if (st->dst.state > up->dst.state)
1032 				sync++;
1033 			else
1034 				pf_state_peer_ntoh(&up->dst, &st->dst);
1035 		}
1036 		if (sync < 2) {
1037 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
1038 			pf_state_peer_ntoh(&up->dst, &st->dst);
1039 			st->expire = time_uptime;
1040 			st->timeout = up->timeout;
1041 		}
1042 		st->pfsync_time = time_uptime;
1043 
1044 		if (sync) {
1045 			V_pfsyncstats.pfsyncs_stale++;
1046 
1047 			pfsync_update_state(st);
1048 			PF_STATE_UNLOCK(st);
1049 			pfsync_push_all(sc);
1050 			continue;
1051 		}
1052 		PF_STATE_UNLOCK(st);
1053 	}
1054 
1055 	return (len);
1056 }
1057 
1058 static int
1059 pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1060 {
1061 	struct pfsync_upd_req *ur, *ura;
1062 	struct mbuf *mp;
1063 	int len = count * sizeof(*ur);
1064 	int i, offp;
1065 
1066 	struct pf_state *st;
1067 
1068 	mp = m_pulldown(m, offset, len, &offp);
1069 	if (mp == NULL) {
1070 		V_pfsyncstats.pfsyncs_badlen++;
1071 		return (-1);
1072 	}
1073 	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
1074 
1075 	for (i = 0; i < count; i++) {
1076 		ur = &ura[i];
1077 
1078 		if (ur->id == 0 && ur->creatorid == 0)
1079 			pfsync_bulk_start();
1080 		else {
1081 			st = pf_find_state_byid(ur->id, ur->creatorid);
1082 			if (st == NULL) {
1083 				V_pfsyncstats.pfsyncs_badstate++;
1084 				continue;
1085 			}
1086 			if (st->state_flags & PFSTATE_NOSYNC) {
1087 				PF_STATE_UNLOCK(st);
1088 				continue;
1089 			}
1090 
1091 			pfsync_update_state_req(st);
1092 			PF_STATE_UNLOCK(st);
1093 		}
1094 	}
1095 
1096 	return (len);
1097 }
1098 
1099 static int
1100 pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1101 {
1102 	struct mbuf *mp;
1103 	struct pfsync_state *sa, *sp;
1104 	struct pf_state *st;
1105 	int len = count * sizeof(*sp);
1106 	int offp, i;
1107 
1108 	mp = m_pulldown(m, offset, len, &offp);
1109 	if (mp == NULL) {
1110 		V_pfsyncstats.pfsyncs_badlen++;
1111 		return (-1);
1112 	}
1113 	sa = (struct pfsync_state *)(mp->m_data + offp);
1114 
1115 	for (i = 0; i < count; i++) {
1116 		sp = &sa[i];
1117 
1118 		st = pf_find_state_byid(sp->id, sp->creatorid);
1119 		if (st == NULL) {
1120 			V_pfsyncstats.pfsyncs_badstate++;
1121 			continue;
1122 		}
1123 		st->state_flags |= PFSTATE_NOSYNC;
1124 		pf_unlink_state(st, PF_ENTER_LOCKED);
1125 	}
1126 
1127 	return (len);
1128 }
1129 
1130 static int
1131 pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1132 {
1133 	struct mbuf *mp;
1134 	struct pfsync_del_c *sa, *sp;
1135 	struct pf_state *st;
1136 	int len = count * sizeof(*sp);
1137 	int offp, i;
1138 
1139 	mp = m_pulldown(m, offset, len, &offp);
1140 	if (mp == NULL) {
1141 		V_pfsyncstats.pfsyncs_badlen++;
1142 		return (-1);
1143 	}
1144 	sa = (struct pfsync_del_c *)(mp->m_data + offp);
1145 
1146 	for (i = 0; i < count; i++) {
1147 		sp = &sa[i];
1148 
1149 		st = pf_find_state_byid(sp->id, sp->creatorid);
1150 		if (st == NULL) {
1151 			V_pfsyncstats.pfsyncs_badstate++;
1152 			continue;
1153 		}
1154 
1155 		st->state_flags |= PFSTATE_NOSYNC;
1156 		pf_unlink_state(st, PF_ENTER_LOCKED);
1157 	}
1158 
1159 	return (len);
1160 }
1161 
1162 static int
1163 pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1164 {
1165 	struct pfsync_softc *sc = V_pfsyncif;
1166 	struct pfsync_bus *bus;
1167 	struct mbuf *mp;
1168 	int len = count * sizeof(*bus);
1169 	int offp;
1170 
1171 	PFSYNC_BLOCK(sc);
1172 
1173 	/* If we're not waiting for a bulk update, who cares. */
1174 	if (sc->sc_ureq_sent == 0) {
1175 		PFSYNC_BUNLOCK(sc);
1176 		return (len);
1177 	}
1178 
1179 	mp = m_pulldown(m, offset, len, &offp);
1180 	if (mp == NULL) {
1181 		PFSYNC_BUNLOCK(sc);
1182 		V_pfsyncstats.pfsyncs_badlen++;
1183 		return (-1);
1184 	}
1185 	bus = (struct pfsync_bus *)(mp->m_data + offp);
1186 
1187 	switch (bus->status) {
1188 	case PFSYNC_BUS_START:
1189 		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
1190 		    V_pf_limits[PF_LIMIT_STATES].limit /
1191 		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
1192 		    sizeof(struct pfsync_state)),
1193 		    pfsync_bulk_fail, sc);
1194 		if (V_pf_status.debug >= PF_DEBUG_MISC)
1195 			printf("pfsync: received bulk update start\n");
1196 		break;
1197 
1198 	case PFSYNC_BUS_END:
1199 		if (time_uptime - ntohl(bus->endtime) >=
1200 		    sc->sc_ureq_sent) {
1201 			/* that's it, we're happy */
1202 			sc->sc_ureq_sent = 0;
1203 			sc->sc_bulk_tries = 0;
1204 			callout_stop(&sc->sc_bulkfail_tmo);
1205 			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1206 				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
1207 				    "pfsync bulk done");
1208 			sc->sc_flags |= PFSYNCF_OK;
1209 			if (V_pf_status.debug >= PF_DEBUG_MISC)
1210 				printf("pfsync: received valid "
1211 				    "bulk update end\n");
1212 		} else {
1213 			if (V_pf_status.debug >= PF_DEBUG_MISC)
1214 				printf("pfsync: received invalid "
1215 				    "bulk update end: bad timestamp\n");
1216 		}
1217 		break;
1218 	}
1219 	PFSYNC_BUNLOCK(sc);
1220 
1221 	return (len);
1222 }
1223 
1224 static int
1225 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1226 {
1227 	int len = count * sizeof(struct pfsync_tdb);
1228 
1229 #if defined(IPSEC)
1230 	struct pfsync_tdb *tp;
1231 	struct mbuf *mp;
1232 	int offp;
1233 	int i;
1234 	int s;
1235 
1236 	mp = m_pulldown(m, offset, len, &offp);
1237 	if (mp == NULL) {
1238 		V_pfsyncstats.pfsyncs_badlen++;
1239 		return (-1);
1240 	}
1241 	tp = (struct pfsync_tdb *)(mp->m_data + offp);
1242 
1243 	for (i = 0; i < count; i++)
1244 		pfsync_update_net_tdb(&tp[i]);
1245 #endif
1246 
1247 	return (len);
1248 }
1249 
1250 #if defined(IPSEC)
1251 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1252 static void
1253 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1254 {
1255 	struct tdb		*tdb;
1256 	int			 s;
1257 
1258 	/* check for invalid values */
1259 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1260 	    (pt->dst.sa.sa_family != AF_INET &&
1261 	    pt->dst.sa.sa_family != AF_INET6))
1262 		goto bad;
1263 
1264 	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
1265 	if (tdb) {
1266 		pt->rpl = ntohl(pt->rpl);
1267 		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
1268 
1269 		/* Neither replay nor byte counter should ever decrease. */
1270 		if (pt->rpl < tdb->tdb_rpl ||
1271 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1272 			goto bad;
1273 		}
1274 
1275 		tdb->tdb_rpl = pt->rpl;
1276 		tdb->tdb_cur_bytes = pt->cur_bytes;
1277 	}
1278 	return;
1279 
1280 bad:
1281 	if (V_pf_status.debug >= PF_DEBUG_MISC)
1282 		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1283 		    "invalid value\n");
1284 	V_pfsyncstats.pfsyncs_badstate++;
1285 	return;
1286 }
1287 #endif
1288 
1289 static int
1290 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1291 {
1292 	/* check if we are at the right place in the packet */
1293 	if (offset != m->m_pkthdr.len)
1294 		V_pfsyncstats.pfsyncs_badlen++;
1295 
1296 	/* we're done. free and let the caller return */
1297 	m_freem(m);
1298 	return (-1);
1299 }
1300 
1301 static int
1302 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1303 {
1304 	V_pfsyncstats.pfsyncs_badact++;
1305 
1306 	m_freem(m);
1307 	return (-1);
1308 }
1309 
/*
 * Output routine of the pfsync interface: the packet is discarded and
 * success is reported.
 */
static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
	struct route *rt)
{
	m_freem(m);

	return (0);
}
1317 
1318 /* ARGSUSED */
1319 static int
1320 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1321 {
1322 	struct pfsync_softc *sc = ifp->if_softc;
1323 	struct ifreq *ifr = (struct ifreq *)data;
1324 	struct pfsyncreq pfsyncr;
1325 	int error;
1326 	int c;
1327 
1328 	switch (cmd) {
1329 	case SIOCSIFFLAGS:
1330 		PFSYNC_LOCK(sc);
1331 		if (ifp->if_flags & IFF_UP) {
1332 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
1333 			PFSYNC_UNLOCK(sc);
1334 			pfsync_pointers_init();
1335 		} else {
1336 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1337 			PFSYNC_UNLOCK(sc);
1338 			pfsync_pointers_uninit();
1339 		}
1340 		break;
1341 	case SIOCSIFMTU:
1342 		if (!sc->sc_sync_if ||
1343 		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
1344 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
1345 			return (EINVAL);
1346 		if (ifr->ifr_mtu < ifp->if_mtu) {
1347 			for (c = 0; c < pfsync_buckets; c++) {
1348 				PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
1349 				if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT)
1350 					pfsync_sendout(1, c);
1351 				PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
1352 			}
1353 		}
1354 		ifp->if_mtu = ifr->ifr_mtu;
1355 		break;
1356 	case SIOCGETPFSYNC:
1357 		bzero(&pfsyncr, sizeof(pfsyncr));
1358 		PFSYNC_LOCK(sc);
1359 		if (sc->sc_sync_if) {
1360 			strlcpy(pfsyncr.pfsyncr_syncdev,
1361 			    sc->sc_sync_if->if_xname, IFNAMSIZ);
1362 		}
1363 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1364 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1365 		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
1366 		    (sc->sc_flags & PFSYNCF_DEFER));
1367 		PFSYNC_UNLOCK(sc);
1368 		return (copyout(&pfsyncr, ifr_data_get_ptr(ifr),
1369 		    sizeof(pfsyncr)));
1370 
1371 	case SIOCSETPFSYNC:
1372 	    {
1373 		struct in_mfilter *imf = NULL;
1374 		struct ifnet *sifp;
1375 		struct ip *ip;
1376 
1377 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
1378 			return (error);
1379 		if ((error = copyin(ifr_data_get_ptr(ifr), &pfsyncr,
1380 		    sizeof(pfsyncr))))
1381 			return (error);
1382 
1383 		if (pfsyncr.pfsyncr_maxupdates > 255)
1384 			return (EINVAL);
1385 
1386 		if (pfsyncr.pfsyncr_syncdev[0] == 0)
1387 			sifp = NULL;
1388 		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
1389 			return (EINVAL);
1390 
1391 		if (sifp != NULL && (
1392 		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
1393 		    pfsyncr.pfsyncr_syncpeer.s_addr ==
1394 		    htonl(INADDR_PFSYNC_GROUP)))
1395 			imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
1396 
1397 		PFSYNC_LOCK(sc);
1398 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1399 			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
1400 		else
1401 			sc->sc_sync_peer.s_addr =
1402 			    pfsyncr.pfsyncr_syncpeer.s_addr;
1403 
1404 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1405 		if (pfsyncr.pfsyncr_defer) {
1406 			sc->sc_flags |= PFSYNCF_DEFER;
1407 			V_pfsync_defer_ptr = pfsync_defer;
1408 		} else {
1409 			sc->sc_flags &= ~PFSYNCF_DEFER;
1410 			V_pfsync_defer_ptr = NULL;
1411 		}
1412 
1413 		if (sifp == NULL) {
1414 			if (sc->sc_sync_if)
1415 				if_rele(sc->sc_sync_if);
1416 			sc->sc_sync_if = NULL;
1417 			pfsync_multicast_cleanup(sc);
1418 			PFSYNC_UNLOCK(sc);
1419 			break;
1420 		}
1421 
1422 		for (c = 0; c < pfsync_buckets; c++) {
1423 			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
1424 			if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT &&
1425 			    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
1426 			    (sc->sc_sync_if != NULL &&
1427 			    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
1428 			    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
1429 				pfsync_sendout(1, c);
1430 			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
1431 		}
1432 
1433 		pfsync_multicast_cleanup(sc);
1434 
1435 		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
1436 			error = pfsync_multicast_setup(sc, sifp, imf);
1437 			if (error) {
1438 				if_rele(sifp);
1439 				ip_mfilter_free(imf);
1440 				PFSYNC_UNLOCK(sc);
1441 				return (error);
1442 			}
1443 		}
1444 		if (sc->sc_sync_if)
1445 			if_rele(sc->sc_sync_if);
1446 		sc->sc_sync_if = sifp;
1447 
1448 		ip = &sc->sc_template;
1449 		bzero(ip, sizeof(*ip));
1450 		ip->ip_v = IPVERSION;
1451 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1452 		ip->ip_tos = IPTOS_LOWDELAY;
1453 		/* len and id are set later. */
1454 		ip->ip_off = htons(IP_DF);
1455 		ip->ip_ttl = PFSYNC_DFLTTL;
1456 		ip->ip_p = IPPROTO_PFSYNC;
1457 		ip->ip_src.s_addr = INADDR_ANY;
1458 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1459 
1460 		/* Request a full state table update. */
1461 		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1462 			(*carp_demote_adj_p)(V_pfsync_carp_adj,
1463 			    "pfsync bulk start");
1464 		sc->sc_flags &= ~PFSYNCF_OK;
1465 		if (V_pf_status.debug >= PF_DEBUG_MISC)
1466 			printf("pfsync: requesting bulk update\n");
1467 		PFSYNC_UNLOCK(sc);
1468 		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
1469 		pfsync_request_update(0, 0);
1470 		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
1471 		PFSYNC_BLOCK(sc);
1472 		sc->sc_ureq_sent = time_uptime;
1473 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
1474 		    sc);
1475 		PFSYNC_BUNLOCK(sc);
1476 
1477 		break;
1478 	    }
1479 	default:
1480 		return (ENOTTY);
1481 	}
1482 
1483 	return (0);
1484 }
1485 
/*
 * Queue writer: export state `st' as a struct pfsync_state into `buf'.
 */
static void
pfsync_out_state(struct pf_state *st, void *buf)
{
	pfsync_state_export((struct pfsync_state *)buf, st);
}
1493 
1494 static void
1495 pfsync_out_iack(struct pf_state *st, void *buf)
1496 {
1497 	struct pfsync_ins_ack *iack = buf;
1498 
1499 	iack->id = st->id;
1500 	iack->creatorid = st->creatorid;
1501 }
1502 
1503 static void
1504 pfsync_out_upd_c(struct pf_state *st, void *buf)
1505 {
1506 	struct pfsync_upd_c *up = buf;
1507 
1508 	bzero(up, sizeof(*up));
1509 	up->id = st->id;
1510 	pf_state_peer_hton(&st->src, &up->src);
1511 	pf_state_peer_hton(&st->dst, &up->dst);
1512 	up->creatorid = st->creatorid;
1513 	up->timeout = st->timeout;
1514 }
1515 
1516 static void
1517 pfsync_out_del(struct pf_state *st, void *buf)
1518 {
1519 	struct pfsync_del_c *dp = buf;
1520 
1521 	dp->id = st->id;
1522 	dp->creatorid = st->creatorid;
1523 	st->state_flags |= PFSTATE_NOSYNC;
1524 }
1525 
1526 static void
1527 pfsync_drop(struct pfsync_softc *sc)
1528 {
1529 	struct pf_state *st, *next;
1530 	struct pfsync_upd_req_item *ur;
1531 	struct pfsync_bucket *b;
1532 	int c, q;
1533 
1534 	for (c = 0; c < pfsync_buckets; c++) {
1535 		b = &sc->sc_buckets[c];
1536 		for (q = 0; q < PFSYNC_S_COUNT; q++) {
1537 			if (TAILQ_EMPTY(&b->b_qs[q]))
1538 				continue;
1539 
1540 			TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, next) {
1541 				KASSERT(st->sync_state == q,
1542 					("%s: st->sync_state == q",
1543 						__func__));
1544 				st->sync_state = PFSYNC_S_NONE;
1545 				pf_release_state(st);
1546 			}
1547 			TAILQ_INIT(&b->b_qs[q]);
1548 		}
1549 
1550 		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
1551 			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
1552 			free(ur, M_PFSYNC);
1553 		}
1554 
1555 		b->b_len = PFSYNC_MINPKT;
1556 		b->b_plus = NULL;
1557 	}
1558 }
1559 
1560 static void
1561 pfsync_sendout(int schedswi, int c)
1562 {
1563 	struct pfsync_softc *sc = V_pfsyncif;
1564 	struct ifnet *ifp = sc->sc_ifp;
1565 	struct mbuf *m;
1566 	struct ip *ip;
1567 	struct pfsync_header *ph;
1568 	struct pfsync_subheader *subh;
1569 	struct pf_state *st, *st_next;
1570 	struct pfsync_upd_req_item *ur;
1571 	struct pfsync_bucket *b = &sc->sc_buckets[c];
1572 	int offset;
1573 	int q, count = 0;
1574 
1575 	KASSERT(sc != NULL, ("%s: null sc", __func__));
1576 	KASSERT(b->b_len > PFSYNC_MINPKT,
1577 	    ("%s: sc_len %zu", __func__, b->b_len));
1578 	PFSYNC_BUCKET_LOCK_ASSERT(b);
1579 
1580 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
1581 		pfsync_drop(sc);
1582 		return;
1583 	}
1584 
1585 	m = m_get2(max_linkhdr + b->b_len, M_NOWAIT, MT_DATA, M_PKTHDR);
1586 	if (m == NULL) {
1587 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
1588 		V_pfsyncstats.pfsyncs_onomem++;
1589 		return;
1590 	}
1591 	m->m_data += max_linkhdr;
1592 	m->m_len = m->m_pkthdr.len = b->b_len;
1593 
1594 	/* build the ip header */
1595 	ip = (struct ip *)m->m_data;
1596 	bcopy(&sc->sc_template, ip, sizeof(*ip));
1597 	offset = sizeof(*ip);
1598 
1599 	ip->ip_len = htons(m->m_pkthdr.len);
1600 	ip_fillid(ip);
1601 
1602 	/* build the pfsync header */
1603 	ph = (struct pfsync_header *)(m->m_data + offset);
1604 	bzero(ph, sizeof(*ph));
1605 	offset += sizeof(*ph);
1606 
1607 	ph->version = PFSYNC_VERSION;
1608 	ph->len = htons(b->b_len - sizeof(*ip));
1609 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1610 
1611 	/* walk the queues */
1612 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1613 		if (TAILQ_EMPTY(&b->b_qs[q]))
1614 			continue;
1615 
1616 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1617 		offset += sizeof(*subh);
1618 
1619 		count = 0;
1620 		TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, st_next) {
1621 			KASSERT(st->sync_state == q,
1622 				("%s: st->sync_state == q",
1623 					__func__));
1624 			/*
1625 			 * XXXGL: some of write methods do unlocked reads
1626 			 * of state data :(
1627 			 */
1628 			pfsync_qs[q].write(st, m->m_data + offset);
1629 			offset += pfsync_qs[q].len;
1630 			st->sync_state = PFSYNC_S_NONE;
1631 			pf_release_state(st);
1632 			count++;
1633 		}
1634 		TAILQ_INIT(&b->b_qs[q]);
1635 
1636 		bzero(subh, sizeof(*subh));
1637 		subh->action = pfsync_qs[q].action;
1638 		subh->count = htons(count);
1639 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
1640 	}
1641 
1642 	if (!TAILQ_EMPTY(&b->b_upd_req_list)) {
1643 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1644 		offset += sizeof(*subh);
1645 
1646 		count = 0;
1647 		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
1648 			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
1649 
1650 			bcopy(&ur->ur_msg, m->m_data + offset,
1651 			    sizeof(ur->ur_msg));
1652 			offset += sizeof(ur->ur_msg);
1653 			free(ur, M_PFSYNC);
1654 			count++;
1655 		}
1656 
1657 		bzero(subh, sizeof(*subh));
1658 		subh->action = PFSYNC_ACT_UPD_REQ;
1659 		subh->count = htons(count);
1660 		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
1661 	}
1662 
1663 	/* has someone built a custom region for us to add? */
1664 	if (b->b_plus != NULL) {
1665 		bcopy(b->b_plus, m->m_data + offset, b->b_pluslen);
1666 		offset += b->b_pluslen;
1667 
1668 		b->b_plus = NULL;
1669 	}
1670 
1671 	subh = (struct pfsync_subheader *)(m->m_data + offset);
1672 	offset += sizeof(*subh);
1673 
1674 	bzero(subh, sizeof(*subh));
1675 	subh->action = PFSYNC_ACT_EOF;
1676 	subh->count = htons(1);
1677 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
1678 
1679 	/* we're done, let's put it on the wire */
1680 	if (ifp->if_bpf) {
1681 		m->m_data += sizeof(*ip);
1682 		m->m_len = m->m_pkthdr.len = b->b_len - sizeof(*ip);
1683 		BPF_MTAP(ifp, m);
1684 		m->m_data -= sizeof(*ip);
1685 		m->m_len = m->m_pkthdr.len = b->b_len;
1686 	}
1687 
1688 	if (sc->sc_sync_if == NULL) {
1689 		b->b_len = PFSYNC_MINPKT;
1690 		m_freem(m);
1691 		return;
1692 	}
1693 
1694 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
1695 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
1696 	b->b_len = PFSYNC_MINPKT;
1697 
1698 	if (!_IF_QFULL(&b->b_snd))
1699 		_IF_ENQUEUE(&b->b_snd, m);
1700 	else {
1701 		m_freem(m);
1702 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
1703 	}
1704 	if (schedswi)
1705 		swi_sched(V_pfsync_swi_cookie, 0);
1706 }
1707 
1708 static void
1709 pfsync_insert_state(struct pf_state *st)
1710 {
1711 	struct pfsync_softc *sc = V_pfsyncif;
1712 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1713 
1714 	if (st->state_flags & PFSTATE_NOSYNC)
1715 		return;
1716 
1717 	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
1718 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1719 		st->state_flags |= PFSTATE_NOSYNC;
1720 		return;
1721 	}
1722 
1723 	KASSERT(st->sync_state == PFSYNC_S_NONE,
1724 		("%s: st->sync_state %u", __func__, st->sync_state));
1725 
1726 	PFSYNC_BUCKET_LOCK(b);
1727 	if (b->b_len == PFSYNC_MINPKT)
1728 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
1729 
1730 	pfsync_q_ins(st, PFSYNC_S_INS, true);
1731 	PFSYNC_BUCKET_UNLOCK(b);
1732 
1733 	st->sync_updates = 0;
1734 }
1735 
1736 static int
1737 pfsync_defer(struct pf_state *st, struct mbuf *m)
1738 {
1739 	struct pfsync_softc *sc = V_pfsyncif;
1740 	struct pfsync_deferral *pd;
1741 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1742 
1743 	if (m->m_flags & (M_BCAST|M_MCAST))
1744 		return (0);
1745 
1746 	PFSYNC_LOCK(sc);
1747 
1748 	if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
1749 	    !(sc->sc_flags & PFSYNCF_DEFER)) {
1750 		PFSYNC_UNLOCK(sc);
1751 		return (0);
1752 	}
1753 
1754 	if (b->b_deferred >= 128)
1755 		pfsync_undefer(TAILQ_FIRST(&b->b_deferrals), 0);
1756 
1757 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
1758 	if (pd == NULL)
1759 		return (0);
1760 	b->b_deferred++;
1761 
1762 	m->m_flags |= M_SKIP_FIREWALL;
1763 	st->state_flags |= PFSTATE_ACK;
1764 
1765 	pd->pd_sc = sc;
1766 	pd->pd_refs = 0;
1767 	pd->pd_st = st;
1768 	pf_ref_state(st);
1769 	pd->pd_m = m;
1770 
1771 	TAILQ_INSERT_TAIL(&b->b_deferrals, pd, pd_entry);
1772 	callout_init_mtx(&pd->pd_tmo, &b->b_mtx, CALLOUT_RETURNUNLOCKED);
1773 	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1774 
1775 	pfsync_push(b);
1776 
1777 	return (1);
1778 }
1779 
1780 static void
1781 pfsync_undefer(struct pfsync_deferral *pd, int drop)
1782 {
1783 	struct pfsync_softc *sc = pd->pd_sc;
1784 	struct mbuf *m = pd->pd_m;
1785 	struct pf_state *st = pd->pd_st;
1786 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1787 
1788 	PFSYNC_BUCKET_LOCK_ASSERT(b);
1789 
1790 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
1791 	b->b_deferred--;
1792 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
1793 	free(pd, M_PFSYNC);
1794 	pf_release_state(st);
1795 
1796 	if (drop)
1797 		m_freem(m);
1798 	else {
1799 		_IF_ENQUEUE(&b->b_snd, m);
1800 		pfsync_push(b);
1801 	}
1802 }
1803 
1804 static void
1805 pfsync_defer_tmo(void *arg)
1806 {
1807 	struct epoch_tracker et;
1808 	struct pfsync_deferral *pd = arg;
1809 	struct pfsync_softc *sc = pd->pd_sc;
1810 	struct mbuf *m = pd->pd_m;
1811 	struct pf_state *st = pd->pd_st;
1812 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1813 
1814 	PFSYNC_BUCKET_LOCK_ASSERT(b);
1815 
1816 	NET_EPOCH_ENTER(et);
1817 	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
1818 
1819 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
1820 	b->b_deferred--;
1821 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
1822 	if (pd->pd_refs == 0)
1823 		free(pd, M_PFSYNC);
1824 	PFSYNC_UNLOCK(sc);
1825 
1826 	ip_output(m, NULL, NULL, 0, NULL, NULL);
1827 
1828 	pf_release_state(st);
1829 
1830 	CURVNET_RESTORE();
1831 	NET_EPOCH_EXIT(et);
1832 }
1833 
1834 static void
1835 pfsync_undefer_state(struct pf_state *st, int drop)
1836 {
1837 	struct pfsync_softc *sc = V_pfsyncif;
1838 	struct pfsync_deferral *pd;
1839 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1840 
1841 	PFSYNC_BUCKET_LOCK(b);
1842 
1843 	TAILQ_FOREACH(pd, &b->b_deferrals, pd_entry) {
1844 		 if (pd->pd_st == st) {
1845 			if (callout_stop(&pd->pd_tmo) > 0)
1846 				pfsync_undefer(pd, drop);
1847 
1848 			PFSYNC_BUCKET_UNLOCK(b);
1849 			return;
1850 		}
1851 	}
1852 	PFSYNC_BUCKET_UNLOCK(b);
1853 
1854 	panic("%s: unable to find deferred state", __func__);
1855 }
1856 
1857 static struct pfsync_bucket*
1858 pfsync_get_bucket(struct pfsync_softc *sc, struct pf_state *st)
1859 {
1860 	int c = PF_IDHASH(st) % pfsync_buckets;
1861 	return &sc->sc_buckets[c];
1862 }
1863 
1864 static void
1865 pfsync_update_state(struct pf_state *st)
1866 {
1867 	struct pfsync_softc *sc = V_pfsyncif;
1868 	bool sync = false, ref = true;
1869 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1870 
1871 	PF_STATE_LOCK_ASSERT(st);
1872 	PFSYNC_BUCKET_LOCK(b);
1873 
1874 	if (st->state_flags & PFSTATE_ACK)
1875 		pfsync_undefer_state(st, 0);
1876 	if (st->state_flags & PFSTATE_NOSYNC) {
1877 		if (st->sync_state != PFSYNC_S_NONE)
1878 			pfsync_q_del(st, true, b);
1879 		PFSYNC_BUCKET_UNLOCK(b);
1880 		return;
1881 	}
1882 
1883 	if (b->b_len == PFSYNC_MINPKT)
1884 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
1885 
1886 	switch (st->sync_state) {
1887 	case PFSYNC_S_UPD_C:
1888 	case PFSYNC_S_UPD:
1889 	case PFSYNC_S_INS:
1890 		/* we're already handling it */
1891 
1892 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1893 			st->sync_updates++;
1894 			if (st->sync_updates >= sc->sc_maxupdates)
1895 				sync = true;
1896 		}
1897 		break;
1898 
1899 	case PFSYNC_S_IACK:
1900 		pfsync_q_del(st, false, b);
1901 		ref = false;
1902 		/* FALLTHROUGH */
1903 
1904 	case PFSYNC_S_NONE:
1905 		pfsync_q_ins(st, PFSYNC_S_UPD_C, ref);
1906 		st->sync_updates = 0;
1907 		break;
1908 
1909 	default:
1910 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
1911 	}
1912 
1913 	if (sync || (time_uptime - st->pfsync_time) < 2)
1914 		pfsync_push(b);
1915 
1916 	PFSYNC_BUCKET_UNLOCK(b);
1917 }
1918 
1919 static void
1920 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
1921 {
1922 	struct pfsync_softc *sc = V_pfsyncif;
1923 	struct pfsync_bucket *b = &sc->sc_buckets[0];
1924 	struct pfsync_upd_req_item *item;
1925 	size_t nlen = sizeof(struct pfsync_upd_req);
1926 
1927 	PFSYNC_BUCKET_LOCK_ASSERT(b);
1928 
1929 	/*
1930 	 * This code does a bit to prevent multiple update requests for the
1931 	 * same state being generated. It searches current subheader queue,
1932 	 * but it doesn't lookup into queue of already packed datagrams.
1933 	 */
1934 	TAILQ_FOREACH(item, &b->b_upd_req_list, ur_entry)
1935 		if (item->ur_msg.id == id &&
1936 		    item->ur_msg.creatorid == creatorid)
1937 			return;
1938 
1939 	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
1940 	if (item == NULL)
1941 		return; /* XXX stats */
1942 
1943 	item->ur_msg.id = id;
1944 	item->ur_msg.creatorid = creatorid;
1945 
1946 	if (TAILQ_EMPTY(&b->b_upd_req_list))
1947 		nlen += sizeof(struct pfsync_subheader);
1948 
1949 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
1950 		pfsync_sendout(1, 0);
1951 
1952 		nlen = sizeof(struct pfsync_subheader) +
1953 		    sizeof(struct pfsync_upd_req);
1954 	}
1955 
1956 	TAILQ_INSERT_TAIL(&b->b_upd_req_list, item, ur_entry);
1957 	b->b_len += nlen;
1958 }
1959 
1960 static bool
1961 pfsync_update_state_req(struct pf_state *st)
1962 {
1963 	struct pfsync_softc *sc = V_pfsyncif;
1964 	bool ref = true, full = false;
1965 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
1966 
1967 	PF_STATE_LOCK_ASSERT(st);
1968 	PFSYNC_BUCKET_LOCK(b);
1969 
1970 	if (st->state_flags & PFSTATE_NOSYNC) {
1971 		if (st->sync_state != PFSYNC_S_NONE)
1972 			pfsync_q_del(st, true, b);
1973 		PFSYNC_BUCKET_UNLOCK(b);
1974 		return (full);
1975 	}
1976 
1977 	switch (st->sync_state) {
1978 	case PFSYNC_S_UPD_C:
1979 	case PFSYNC_S_IACK:
1980 		pfsync_q_del(st, false, b);
1981 		ref = false;
1982 		/* FALLTHROUGH */
1983 
1984 	case PFSYNC_S_NONE:
1985 		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
1986 		pfsync_push(b);
1987 		break;
1988 
1989 	case PFSYNC_S_INS:
1990 	case PFSYNC_S_UPD:
1991 	case PFSYNC_S_DEL:
1992 		/* we're already handling it */
1993 		break;
1994 
1995 	default:
1996 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
1997 	}
1998 
1999 	if ((sc->sc_ifp->if_mtu - b->b_len) < sizeof(struct pfsync_state))
2000 		full = true;
2001 
2002 	PFSYNC_BUCKET_UNLOCK(b);
2003 
2004 	return (full);
2005 }
2006 
2007 static void
2008 pfsync_delete_state(struct pf_state *st)
2009 {
2010 	struct pfsync_softc *sc = V_pfsyncif;
2011 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
2012 	bool ref = true;
2013 
2014 	PFSYNC_BUCKET_LOCK(b);
2015 	if (st->state_flags & PFSTATE_ACK)
2016 		pfsync_undefer_state(st, 1);
2017 	if (st->state_flags & PFSTATE_NOSYNC) {
2018 		if (st->sync_state != PFSYNC_S_NONE)
2019 			pfsync_q_del(st, true, b);
2020 		PFSYNC_BUCKET_UNLOCK(b);
2021 		return;
2022 	}
2023 
2024 	if (b->b_len == PFSYNC_MINPKT)
2025 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
2026 
2027 	switch (st->sync_state) {
2028 	case PFSYNC_S_INS:
2029 		/* We never got to tell the world so just forget about it. */
2030 		pfsync_q_del(st, true, b);
2031 		break;
2032 
2033 	case PFSYNC_S_UPD_C:
2034 	case PFSYNC_S_UPD:
2035 	case PFSYNC_S_IACK:
2036 		pfsync_q_del(st, false, b);
2037 		ref = false;
2038 		/* FALLTHROUGH */
2039 
2040 	case PFSYNC_S_NONE:
2041 		pfsync_q_ins(st, PFSYNC_S_DEL, ref);
2042 		break;
2043 
2044 	default:
2045 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
2046 	}
2047 
2048 	PFSYNC_BUCKET_UNLOCK(b);
2049 }
2050 
2051 static void
2052 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2053 {
2054 	struct {
2055 		struct pfsync_subheader subh;
2056 		struct pfsync_clr clr;
2057 	} __packed r;
2058 
2059 	bzero(&r, sizeof(r));
2060 
2061 	r.subh.action = PFSYNC_ACT_CLR;
2062 	r.subh.count = htons(1);
2063 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
2064 
2065 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2066 	r.clr.creatorid = creatorid;
2067 
2068 	pfsync_send_plus(&r, sizeof(r));
2069 }
2070 
2071 static void
2072 pfsync_q_ins(struct pf_state *st, int q, bool ref)
2073 {
2074 	struct pfsync_softc *sc = V_pfsyncif;
2075 	size_t nlen = pfsync_qs[q].len;
2076 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
2077 
2078 	PFSYNC_BUCKET_LOCK_ASSERT(b);
2079 
2080 	KASSERT(st->sync_state == PFSYNC_S_NONE,
2081 		("%s: st->sync_state %u", __func__, st->sync_state));
2082 	KASSERT(b->b_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
2083 	    b->b_len));
2084 
2085 	if (TAILQ_EMPTY(&b->b_qs[q]))
2086 		nlen += sizeof(struct pfsync_subheader);
2087 
2088 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
2089 		pfsync_sendout(1, b->b_id);
2090 
2091 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
2092 	}
2093 
2094 	b->b_len += nlen;
2095 	TAILQ_INSERT_TAIL(&b->b_qs[q], st, sync_list);
2096 	st->sync_state = q;
2097 	if (ref)
2098 		pf_ref_state(st);
2099 }
2100 
2101 static void
2102 pfsync_q_del(struct pf_state *st, bool unref, struct pfsync_bucket *b)
2103 {
2104 	int q = st->sync_state;
2105 
2106 	PFSYNC_BUCKET_LOCK_ASSERT(b);
2107 	KASSERT(st->sync_state != PFSYNC_S_NONE,
2108 		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
2109 
2110 	b->b_len -= pfsync_qs[q].len;
2111 	TAILQ_REMOVE(&b->b_qs[q], st, sync_list);
2112 	st->sync_state = PFSYNC_S_NONE;
2113 	if (unref)
2114 		pf_release_state(st);
2115 
2116 	if (TAILQ_EMPTY(&b->b_qs[q]))
2117 		b->b_len -= sizeof(struct pfsync_subheader);
2118 }
2119 
2120 static void
2121 pfsync_bulk_start(void)
2122 {
2123 	struct pfsync_softc *sc = V_pfsyncif;
2124 
2125 	if (V_pf_status.debug >= PF_DEBUG_MISC)
2126 		printf("pfsync: received bulk update request\n");
2127 
2128 	PFSYNC_BLOCK(sc);
2129 
2130 	sc->sc_ureq_received = time_uptime;
2131 	sc->sc_bulk_hashid = 0;
2132 	sc->sc_bulk_stateid = 0;
2133 	pfsync_bulk_status(PFSYNC_BUS_START);
2134 	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
2135 	PFSYNC_BUNLOCK(sc);
2136 }
2137 
2138 static void
2139 pfsync_bulk_update(void *arg)
2140 {
2141 	struct pfsync_softc *sc = arg;
2142 	struct pf_state *s;
2143 	int i, sent = 0;
2144 
2145 	PFSYNC_BLOCK_ASSERT(sc);
2146 	CURVNET_SET(sc->sc_ifp->if_vnet);
2147 
2148 	/*
2149 	 * Start with last state from previous invocation.
2150 	 * It may had gone, in this case start from the
2151 	 * hash slot.
2152 	 */
2153 	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
2154 
2155 	if (s != NULL)
2156 		i = PF_IDHASH(s);
2157 	else
2158 		i = sc->sc_bulk_hashid;
2159 
2160 	for (; i <= pf_hashmask; i++) {
2161 		struct pf_idhash *ih = &V_pf_idhash[i];
2162 
2163 		if (s != NULL)
2164 			PF_HASHROW_ASSERT(ih);
2165 		else {
2166 			PF_HASHROW_LOCK(ih);
2167 			s = LIST_FIRST(&ih->states);
2168 		}
2169 
2170 		for (; s; s = LIST_NEXT(s, entry)) {
2171 			if (s->sync_state == PFSYNC_S_NONE &&
2172 			    s->timeout < PFTM_MAX &&
2173 			    s->pfsync_time <= sc->sc_ureq_received) {
2174 				if (pfsync_update_state_req(s)) {
2175 					/* We've filled a packet. */
2176 					sc->sc_bulk_hashid = i;
2177 					sc->sc_bulk_stateid = s->id;
2178 					sc->sc_bulk_creatorid = s->creatorid;
2179 					PF_HASHROW_UNLOCK(ih);
2180 					callout_reset(&sc->sc_bulk_tmo, 1,
2181 					    pfsync_bulk_update, sc);
2182 					goto full;
2183 				}
2184 				sent++;
2185 			}
2186 		}
2187 		PF_HASHROW_UNLOCK(ih);
2188 	}
2189 
2190 	/* We're done. */
2191 	pfsync_bulk_status(PFSYNC_BUS_END);
2192 full:
2193 	CURVNET_RESTORE();
2194 }
2195 
2196 static void
2197 pfsync_bulk_status(u_int8_t status)
2198 {
2199 	struct {
2200 		struct pfsync_subheader subh;
2201 		struct pfsync_bus bus;
2202 	} __packed r;
2203 
2204 	struct pfsync_softc *sc = V_pfsyncif;
2205 
2206 	bzero(&r, sizeof(r));
2207 
2208 	r.subh.action = PFSYNC_ACT_BUS;
2209 	r.subh.count = htons(1);
2210 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
2211 
2212 	r.bus.creatorid = V_pf_status.hostid;
2213 	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2214 	r.bus.status = status;
2215 
2216 	pfsync_send_plus(&r, sizeof(r));
2217 }
2218 
2219 static void
2220 pfsync_bulk_fail(void *arg)
2221 {
2222 	struct pfsync_softc *sc = arg;
2223 	struct pfsync_bucket *b = &sc->sc_buckets[0];
2224 
2225 	CURVNET_SET(sc->sc_ifp->if_vnet);
2226 
2227 	PFSYNC_BLOCK_ASSERT(sc);
2228 
2229 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2230 		/* Try again */
2231 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
2232 		    pfsync_bulk_fail, V_pfsyncif);
2233 		PFSYNC_BUCKET_LOCK(b);
2234 		pfsync_request_update(0, 0);
2235 		PFSYNC_BUCKET_UNLOCK(b);
2236 	} else {
2237 		/* Pretend like the transfer was ok. */
2238 		sc->sc_ureq_sent = 0;
2239 		sc->sc_bulk_tries = 0;
2240 		PFSYNC_LOCK(sc);
2241 		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
2242 			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
2243 			    "pfsync bulk fail");
2244 		sc->sc_flags |= PFSYNCF_OK;
2245 		PFSYNC_UNLOCK(sc);
2246 		if (V_pf_status.debug >= PF_DEBUG_MISC)
2247 			printf("pfsync: failed to receive bulk update\n");
2248 	}
2249 
2250 	CURVNET_RESTORE();
2251 }
2252 
2253 static void
2254 pfsync_send_plus(void *plus, size_t pluslen)
2255 {
2256 	struct pfsync_softc *sc = V_pfsyncif;
2257 	struct pfsync_bucket *b = &sc->sc_buckets[0];
2258 
2259 	PFSYNC_BUCKET_LOCK(b);
2260 
2261 	if (b->b_len + pluslen > sc->sc_ifp->if_mtu)
2262 		pfsync_sendout(1, b->b_id);
2263 
2264 	b->b_plus = plus;
2265 	b->b_len += (b->b_pluslen = pluslen);
2266 
2267 	pfsync_sendout(1, b->b_id);
2268 	PFSYNC_BUCKET_UNLOCK(b);
2269 }
2270 
2271 static void
2272 pfsync_timeout(void *arg)
2273 {
2274 	struct pfsync_bucket *b = arg;
2275 
2276 	CURVNET_SET(b->b_sc->sc_ifp->if_vnet);
2277 	PFSYNC_BUCKET_LOCK(b);
2278 	pfsync_push(b);
2279 	PFSYNC_BUCKET_UNLOCK(b);
2280 	CURVNET_RESTORE();
2281 }
2282 
2283 static void
2284 pfsync_push(struct pfsync_bucket *b)
2285 {
2286 
2287 	PFSYNC_BUCKET_LOCK_ASSERT(b);
2288 
2289 	b->b_flags |= PFSYNCF_BUCKET_PUSH;
2290 	swi_sched(V_pfsync_swi_cookie, 0);
2291 }
2292 
2293 static void
2294 pfsync_push_all(struct pfsync_softc *sc)
2295 {
2296 	int c;
2297 	struct pfsync_bucket *b;
2298 
2299 	for (c = 0; c < pfsync_buckets; c++) {
2300 		b = &sc->sc_buckets[c];
2301 
2302 		PFSYNC_BUCKET_LOCK(b);
2303 		pfsync_push(b);
2304 		PFSYNC_BUCKET_UNLOCK(b);
2305 	}
2306 }
2307 
2308 static void
2309 pfsyncintr(void *arg)
2310 {
2311 	struct epoch_tracker et;
2312 	struct pfsync_softc *sc = arg;
2313 	struct pfsync_bucket *b;
2314 	struct mbuf *m, *n;
2315 	int c;
2316 
2317 	NET_EPOCH_ENTER(et);
2318 	CURVNET_SET(sc->sc_ifp->if_vnet);
2319 
2320 	for (c = 0; c < pfsync_buckets; c++) {
2321 		b = &sc->sc_buckets[c];
2322 
2323 		PFSYNC_BUCKET_LOCK(b);
2324 		if ((b->b_flags & PFSYNCF_BUCKET_PUSH) && b->b_len > PFSYNC_MINPKT) {
2325 			pfsync_sendout(0, b->b_id);
2326 			b->b_flags &= ~PFSYNCF_BUCKET_PUSH;
2327 		}
2328 		_IF_DEQUEUE_ALL(&b->b_snd, m);
2329 		PFSYNC_BUCKET_UNLOCK(b);
2330 
2331 		for (; m != NULL; m = n) {
2332 			n = m->m_nextpkt;
2333 			m->m_nextpkt = NULL;
2334 
2335 			/*
2336 			 * We distinguish between a deferral packet and our
2337 			 * own pfsync packet based on M_SKIP_FIREWALL
2338 			 * flag. This is XXX.
2339 			 */
2340 			if (m->m_flags & M_SKIP_FIREWALL)
2341 				ip_output(m, NULL, NULL, 0, NULL, NULL);
2342 			else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
2343 			    NULL) == 0)
2344 				V_pfsyncstats.pfsyncs_opackets++;
2345 			else
2346 				V_pfsyncstats.pfsyncs_oerrors++;
2347 		}
2348 	}
2349 	CURVNET_RESTORE();
2350 	NET_EPOCH_EXIT(et);
2351 }
2352 
2353 static int
2354 pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp,
2355     struct in_mfilter *imf)
2356 {
2357 	struct ip_moptions *imo = &sc->sc_imo;
2358 	int error;
2359 
2360 	if (!(ifp->if_flags & IFF_MULTICAST))
2361 		return (EADDRNOTAVAIL);
2362 
2363 	imo->imo_multicast_vif = -1;
2364 
2365 	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
2366 	    &imf->imf_inm)) != 0)
2367 		return (error);
2368 
2369 	ip_mfilter_init(&imo->imo_head);
2370 	ip_mfilter_insert(&imo->imo_head, imf);
2371 	imo->imo_multicast_ifp = ifp;
2372 	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
2373 	imo->imo_multicast_loop = 0;
2374 
2375 	return (0);
2376 }
2377 
2378 static void
2379 pfsync_multicast_cleanup(struct pfsync_softc *sc)
2380 {
2381 	struct ip_moptions *imo = &sc->sc_imo;
2382 	struct in_mfilter *imf;
2383 
2384 	while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
2385 		ip_mfilter_remove(&imo->imo_head, imf);
2386 		in_leavegroup(imf->imf_inm, NULL);
2387 		ip_mfilter_free(imf);
2388 	}
2389 	imo->imo_multicast_ifp = NULL;
2390 }
2391 
2392 void
2393 pfsync_detach_ifnet(struct ifnet *ifp)
2394 {
2395 	struct pfsync_softc *sc = V_pfsyncif;
2396 
2397 	if (sc == NULL)
2398 		return;
2399 
2400 	PFSYNC_LOCK(sc);
2401 
2402 	if (sc->sc_sync_if == ifp) {
2403 		/* We don't need mutlicast cleanup here, because the interface
2404 		 * is going away. We do need to ensure we don't try to do
2405 		 * cleanup later.
2406 		 */
2407 		ip_mfilter_init(&sc->sc_imo.imo_head);
2408 		sc->sc_imo.imo_multicast_ifp = NULL;
2409 		sc->sc_sync_if = NULL;
2410 	}
2411 
2412 	PFSYNC_UNLOCK(sc);
2413 }
2414 
#ifdef INET
extern struct domain inetdomain;

/*
 * Protocol switch entry for IPPROTO_PFSYNC in the inet domain: a raw
 * socket protocol whose input goes to pfsync_input() and whose output,
 * control-output and user-request handling use the raw IP routines.
 */
static struct protosw in_pfsync_protosw = {
	.pr_type = SOCK_RAW,
	.pr_domain = &inetdomain,
	.pr_protocol = IPPROTO_PFSYNC,
	.pr_flags = PR_ATOMIC | PR_ADDR,
	.pr_input = pfsync_input,
	.pr_output = rip_output,
	.pr_ctloutput = rip_ctloutput,
	.pr_usrreqs = &rip_usrreqs,
};
#endif /* INET */
2428 
2429 static void
2430 pfsync_pointers_init()
2431 {
2432 
2433 	PF_RULES_WLOCK();
2434 	V_pfsync_state_import_ptr = pfsync_state_import;
2435 	V_pfsync_insert_state_ptr = pfsync_insert_state;
2436 	V_pfsync_update_state_ptr = pfsync_update_state;
2437 	V_pfsync_delete_state_ptr = pfsync_delete_state;
2438 	V_pfsync_clear_states_ptr = pfsync_clear_states;
2439 	V_pfsync_defer_ptr = pfsync_defer;
2440 	PF_RULES_WUNLOCK();
2441 }
2442 
2443 static void
2444 pfsync_pointers_uninit()
2445 {
2446 
2447 	PF_RULES_WLOCK();
2448 	V_pfsync_state_import_ptr = NULL;
2449 	V_pfsync_insert_state_ptr = NULL;
2450 	V_pfsync_update_state_ptr = NULL;
2451 	V_pfsync_delete_state_ptr = NULL;
2452 	V_pfsync_clear_states_ptr = NULL;
2453 	V_pfsync_defer_ptr = NULL;
2454 	PF_RULES_WUNLOCK();
2455 }
2456 
2457 static void
2458 vnet_pfsync_init(const void *unused __unused)
2459 {
2460 	int error;
2461 
2462 	V_pfsync_cloner = if_clone_simple(pfsyncname,
2463 	    pfsync_clone_create, pfsync_clone_destroy, 1);
2464 	error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
2465 	    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
2466 	if (error) {
2467 		if_clone_detach(V_pfsync_cloner);
2468 		log(LOG_INFO, "swi_add() failed in %s\n", __func__);
2469 	}
2470 
2471 	pfsync_pointers_init();
2472 }
2473 VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
2474     vnet_pfsync_init, NULL);
2475 
2476 static void
2477 vnet_pfsync_uninit(const void *unused __unused)
2478 {
2479 
2480 	pfsync_pointers_uninit();
2481 
2482 	if_clone_detach(V_pfsync_cloner);
2483 	swi_remove(V_pfsync_swi_cookie);
2484 }
2485 
2486 VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_FOURTH,
2487     vnet_pfsync_uninit, NULL);
2488 
/*
 * Module load work.  With INET: install the interface-detach hook and
 * register the pfsync protocol switch entry and IP protocol number.
 * Without INET nothing is done.
 *
 * Returns 0 on success or the error from the failing registration.  On
 * failure everything done so far is undone, including the detach hook,
 * so no pointer into this module is left behind.
 */
static int
pfsync_init(void)
{
#ifdef INET
	int error;

	pfsync_detach_ifnet_ptr = pfsync_detach_ifnet;

	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
	if (error) {
		pfsync_detach_ifnet_ptr = NULL;
		return (error);
	}
	error = ipproto_register(IPPROTO_PFSYNC);
	if (error) {
		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
		pfsync_detach_ifnet_ptr = NULL;
		return (error);
	}
#endif

	return (0);
}
2509 
2510 static void
2511 pfsync_uninit()
2512 {
2513 	pfsync_detach_ifnet_ptr = NULL;
2514 
2515 #ifdef INET
2516 	ipproto_unregister(IPPROTO_PFSYNC);
2517 	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2518 #endif
2519 }
2520 
2521 static int
2522 pfsync_modevent(module_t mod, int type, void *data)
2523 {
2524 	int error = 0;
2525 
2526 	switch (type) {
2527 	case MOD_LOAD:
2528 		error = pfsync_init();
2529 		break;
2530 	case MOD_UNLOAD:
2531 		pfsync_uninit();
2532 		break;
2533 	default:
2534 		error = EINVAL;
2535 		break;
2536 	}
2537 
2538 	return (error);
2539 }
2540 
2541 static moduledata_t pfsync_mod = {
2542 	pfsyncname,
2543 	pfsync_modevent,
2544 	0
2545 };
2546 
/* Version number advertised for this module by MODULE_VERSION() below. */
#define PFSYNC_MODVER 1

/* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
/* pfsync depends on the pf module; all three version arguments are PF_MODVER. */
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
2553