xref: /freebsd/sys/netinet/ip_carp.c (revision 76b28ad6ab6dc8d4a62cb7de7f143595be535813)
1 /*-
2  * Copyright (c) 2002 Michael Shalayeff.
3  * Copyright (c) 2003 Ryan McBride.
4  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_bpf.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/bus.h>
39 #include <sys/jail.h>
40 #include <sys/kernel.h>
41 #include <sys/limits.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/module.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/protosw.h>
48 #include <sys/socket.h>
49 #include <sys/sockio.h>
50 #include <sys/sysctl.h>
51 #include <sys/syslog.h>
52 #include <sys/taskqueue.h>
53 #include <sys/counter.h>
54 
55 #include <net/ethernet.h>
56 #include <net/fddi.h>
57 #include <net/if.h>
58 #include <net/if_var.h>
59 #include <net/if_dl.h>
60 #include <net/if_llatbl.h>
61 #include <net/if_types.h>
62 #include <net/iso88025.h>
63 #include <net/route.h>
64 #include <net/vnet.h>
65 
66 #if defined(INET) || defined(INET6)
67 #include <netinet/in.h>
68 #include <netinet/in_var.h>
69 #include <netinet/ip_carp.h>
70 #include <netinet/ip.h>
71 #include <machine/in_cksum.h>
72 #endif
73 #ifdef INET
74 #include <netinet/ip_var.h>
75 #include <netinet/if_ether.h>
76 #endif
77 
78 #ifdef INET6
79 #include <netinet/icmp6.h>
80 #include <netinet/ip6.h>
81 #include <netinet6/ip6protosw.h>
82 #include <netinet6/in6_var.h>
83 #include <netinet6/ip6_var.h>
84 #include <netinet6/scope6_var.h>
85 #include <netinet6/nd6.h>
86 #endif
87 
88 #include <crypto/sha1.h>
89 
90 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
91 
/*
 * Per-vhid CARP state.  One softc exists for every virtual host id
 * configured on a parent interface; it is linked both on that interface's
 * carp_if list (sc_list) and on the global carp_list (sc_next).
 * Fields are protected by sc_mtx (CARP_LOCK()).
 */
92 struct carp_softc {
93 	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
94 	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
95 	struct sockaddr_dl	sc_addr;	/* Our link level address. */
96 	struct callout		sc_ad_tmo;	/* Advertising timeout. */
97 #ifdef INET
98 	struct callout		sc_md_tmo;	/* Master down timeout. */
99 #endif
100 #ifdef INET6
101 	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
102 #endif
103 	struct mtx		sc_mtx;		/* Softc lock, see CARP_LOCK(). */
104 
105 	int			sc_vhid;	/* Virtual host id. */
	/*
	 * Configured skew; the global demotion is added on top of it by
	 * DEMOTE_ADVSKEW() before it is advertised or used for timing.
	 */
106 	int			sc_advskew;
	/* Seconds part of the ad interval; sent as carp_advbase. */
107 	int			sc_advbase;
108 
	/*
	 * sc_naddrs + sc_naddrs6 bounds the walk over sc_ifas in
	 * CARP_FOREACH_IFA(); a non-zero sc_naddrs / sc_naddrs6 makes
	 * carp_send_ad_locked() emit an IPv4 / IPv6 advertisement.
	 */
109 	int			sc_naddrs;
110 	int			sc_naddrs6;
	/* NOTE(review): presumably the allocated size of sc_ifas - confirm. */
111 	int			sc_ifasiz;
112 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
113 	int			sc_suppress;
	/* Send failures and successes as tracked by carp_send_ad_error(). */
114 	int			sc_sendad_errors;
	/* Error count at which carp_send_ad_error() demotes us. */
115 #define	CARP_SENDAD_MAX_ERRORS	3
116 	int			sc_sendad_success;
	/* Success count carp_send_ad_error() requires before recovery. */
117 #define	CARP_SENDAD_MIN_SUCCESS 3
118 
	/*
	 * While sc_init_counter is non-zero carp_prepare_ad() picks a random
	 * sc_counter instead of incrementing it; carp_input_c() clears the
	 * flag and adopts the sender's counter on an authenticated ad.
	 */
119 	int			sc_init_counter;
120 	uint64_t		sc_counter;
121 
122 	/* authentication */
123 #define	CARP_HMAC_PAD	64
	/* HMAC key, copied into the start of sc_pad by carp_hmac_prepare(). */
124 	unsigned char sc_key[CARP_KEY_LEN];
	/* Holds the opad (key ^ 0x5c) once carp_hmac_prepare() returns. */
125 	unsigned char sc_pad[CARP_HMAC_PAD];
	/* Precomputed inner hash state, copied for each packet. */
126 	SHA1_CTX sc_sha1;
127 
128 	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
129 	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
130 };
131 
/*
 * Per-interface CARP state, reached through ifp->if_carp.  It anchors the
 * list of softcs (one per vhid) configured on that interface and carries
 * the multicast options handed to ip_output()/ip6_output() when
 * advertisements are sent.
 */
132 struct carp_if {
133 #ifdef INET
134 	int	cif_naddrs;
135 #endif
136 #ifdef INET6
137 	int	cif_naddrs6;
138 #endif
	/* Softcs on this interface, walked with IFNET_FOREACH_CARP(). */
139 	TAILQ_HEAD(, carp_softc) cif_vrs;
140 #ifdef INET
	/* Multicast options passed to ip_output() for IPv4 ads. */
141 	struct ip_moptions 	 cif_imo;
142 #endif
143 #ifdef INET6
	/* Multicast options passed to ip6_output() for IPv6 ads. */
144 	struct ip6_moptions 	 cif_im6o;
145 #endif
146 	struct ifnet	*cif_ifp;
	/* Protects the cif_vrs list, see CIF_LOCK(). */
147 	struct mtx	cif_mtx;
148 	uint32_t	cif_flags;
	/* NOTE(review): presumably "we put the ifnet into promiscuous mode" - confirm. */
149 #define	CIF_PROMISC	0x00000001
150 };
151 
/*
 * Indices into proto_reg[], one slot per address family.  Both slots start
 * out as -1.
 * NOTE(review): presumably each slot records the result of registering the
 * CARP protocol handler for that family, -1 meaning "not registered" -
 * confirm against the module load/unload code.
 */
152 #define	CARP_INET	0
153 #define	CARP_INET6	1
154 static int proto_reg[] = {-1, -1};
155 
156 /*
157  * Brief design of carp(4).
158  *
159  * Any carp-capable ifnet may have a list of carp softcs hanging off
160  * its ifp->if_carp pointer. Each softc represents one unique virtual
161  * host id, or vhid. The softc has a back pointer to the ifnet. All
162  * softcs are joined in a global list, which has quite limited use.
163  *
164  * Any interface address that takes part in CARP negotiation has a
165  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
166  * AF_INET or AF_INET6 address.
167  *
168  * Although, one can get the softc's backpointer to ifnet and traverse
169  * through its ifp->if_addrhead queue to find all interface addresses
170  * involved in CARP, we keep a growable array of ifaddr pointers. This
171  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
172  * do calls into the network stack, thus avoiding LORs.
173  *
174  * Locking:
175  *
176  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
177  * callout-driven events and ioctl()s.
178  *
179  * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to
180  * traverse the global list we use the mutex carp_mtx.
181  *
182  * Known issues with locking:
183  *
184  * - There is no protection for races between two ioctl() requests,
185  *   neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all
186  *   interface ioctl()s should be serialized right in net/if.c.
187  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
188  *   counting is done on the softc.
189  * - On module unload we may race (?) with packet processing thread
190  *   dereferencing our function pointers.
191  */
192 
/*
 * Per-vnet tunables.  Each one is exported below under net.inet.carp.
 */

/*
 * Accept incoming CARP packets.  When zero, carp_input() and carp6_input()
 * free every packet right after counting it.
 */
193 /* Accept incoming CARP packets. */
194 static VNET_DEFINE(int, carp_allow) = 1;
195 #define	V_carp_allow	VNET(carp_allow)
196 
197 /* Preempt slower nodes. */
198 static VNET_DEFINE(int, carp_preempt) = 0;
199 #define	V_carp_preempt	VNET(carp_preempt)
200 
201 /* Log level: > 0 enables CARP_LOG(), > 1 additionally enables CARP_DEBUG(). */
202 static VNET_DEFINE(int, carp_log) = 1;
203 #define	V_carp_log	VNET(carp_log)
204 
205 /* Global advskew demotion, added to sc_advskew by DEMOTE_ADVSKEW(). */
206 static VNET_DEFINE(int, carp_demotion) = 0;
207 #define	V_carp_demotion	VNET(carp_demotion)
208 
209 /* Send error demotion factor, applied by carp_send_ad_error(). */
210 static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
211 #define	V_carp_senderr_adj	VNET(carp_senderr_adj)
212 
213 /* Iface down demotion factor. */
214 static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
215 #define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
216 
/*
 * sysctl glue: the net.inet.carp node, one integer per tunable above, a
 * handler-backed "demotion" knob and the per-CPU statistics export.
 */
217 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
218 
219 SYSCTL_NODE(_net_inet, IPPROTO_CARP,	carp,	CTLFLAG_RW, 0,	"CARP");
220 SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_RW,
221     &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets");
222 SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_RW,
223     &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
224 SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_RW,
225     &VNET_NAME(carp_log), 0, "CARP log level");
/* "demotion" goes through carp_demote_adj_sysctl() rather than a plain int. */
226 SYSCTL_VNET_PROC(_net_inet_carp, OID_AUTO, demotion, CTLTYPE_INT|CTLFLAG_RW,
227     0, 0, carp_demote_adj_sysctl, "I",
228     "Adjust demotion factor (skew of advskew)");
229 SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_RW,
230     &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
231 SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_RW,
232     &VNET_NAME(carp_ifdown_adj), 0,
233     "Interface down demotion factor adjustment");
234 
/* Per-vnet, per-CPU statistics counters laid out as struct carpstats. */
235 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
236 VNET_PCPUSTAT_SYSINIT(carpstats);
237 VNET_PCPUSTAT_SYSUNINIT(carpstats);
238 
/*
 * Bump the counter that corresponds to field "name" of struct carpstats.
 * The index is the field offset divided by sizeof(uint64_t), so every
 * field of struct carpstats is expected to be a uint64_t.
 */
239 #define	CARPSTATS_ADD(name, val)	\
240     counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
241 	sizeof(uint64_t)], (val))
242 #define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
243 
244 SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
245     carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
246 
/*
 * Locking wrappers.  CARP_* operate on the per-softc mutex sc_mtx, CIF_*
 * on the per-interface mutex cif_mtx.  Both are MTX_DEF mutexes.
 */
247 #define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
248 	NULL, MTX_DEF)
249 #define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
250 #define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
251 #define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
252 #define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
253 #define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
254 	NULL, MTX_DEF)
255 #define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
256 #define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
257 #define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
258 #define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
/*
 * Drop a carp_if whose lock the caller holds: if no softc is left on it the
 * structure is handed to carp_free_if(), otherwise the lock is simply
 * released.  The macro itself never unlocks in the "empty" case.
 * NOTE(review): carp_free_if() presumably unlocks/destroys the mutex itself -
 * confirm.
 */
259 #define	CIF_FREE(cif)	do {				\
260 		CIF_LOCK_ASSERT(cif);			\
261 		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
262 			carp_free_if(cif);		\
263 		else					\
264 			CIF_UNLOCK(cif);		\
265 } while (0)
266 
/*
 * Logging helpers gated by the net.inet.carp.log level.
 *
 * CARP_LOG() fires at level 1 and above, logs at LOG_INFO and prefixes the
 * message with "carp: "; because the prefix is glued on by string literal
 * concatenation, the first argument must be a string literal.
 */
267 #define	CARP_LOG(...)	do {				\
268 	if (V_carp_log > 0)				\
269 		log(LOG_INFO, "carp: " __VA_ARGS__);	\
270 } while (0)
271 
/* CARP_DEBUG() fires at level 2 and above, logs at LOG_DEBUG, no prefix. */
272 #define	CARP_DEBUG(...)	do {				\
273 	if (V_carp_log > 1)				\
274 		log(LOG_DEBUG, __VA_ARGS__);		\
275 } while (0)
276 
/*
 * Iteration helpers.  Each of them expands to a lock assertion statement
 * followed by a loop header, i.e. to TWO statements: do not use them as the
 * unbraced body of an if/else/for.
 */

/*
 * Visit every address of ifp that has a softc attached (ifa_carp != NULL).
 * The expansion ends in an "if" without braces, so the loop body is the
 * statement that follows, and a trailing "else" would bind to that hidden
 * "if".  If the loop runs to completion ifa is NULL afterwards.
 */
277 #define	IFNET_FOREACH_IFA(ifp, ifa)					\
278 	IF_ADDR_LOCK_ASSERT(ifp);					\
279 	TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)		\
280 		if ((ifa)->ifa_carp != NULL)
281 
/*
 * Visit the addresses stored in sc->sc_ifas; stops at sc_naddrs + sc_naddrs6
 * entries or at the first NULL slot, whichever comes first.  The loop is a
 * plain "for", so "continue" and "break" work in the body.  Note that the
 * "sc" argument is used unparenthesized in the expansion.
 */
282 #define	CARP_FOREACH_IFA(sc, ifa)					\
283 	CARP_LOCK_ASSERT(sc);						\
284 	for (int _i = 0;						\
285 		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
286 		((ifa) = sc->sc_ifas[_i]) != NULL;			\
287 		++_i)
288 
/*
 * Visit every softc configured on ifp.  Requires ifp->if_carp to be
 * non-NULL and its CIF lock to be held.
 */
289 #define	IFNET_FOREACH_CARP(ifp, sc)					\
290 	CIF_LOCK_ASSERT(ifp->if_carp);					\
291 	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
292 
/*
 * Effective advertisement skew: the softc's own skew plus the global
 * demotion counter, capped at CARP_MAXSKEW (no lower bound is applied).
 */
293 #define	DEMOTE_ADVSKEW(sc)					\
294     (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
295     CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
296 
/* Forward declarations of file-local functions. */
297 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
298 static struct carp_softc
299 		*carp_alloc(struct ifnet *);
300 static void	carp_detach_locked(struct ifaddr *);
301 static void	carp_destroy(struct carp_softc *);
302 static struct carp_if
303 		*carp_alloc_if(struct ifnet *);
304 static void	carp_free_if(struct carp_if *);
305 static void	carp_set_state(struct carp_softc *, int);
306 static void	carp_sc_state(struct carp_softc *);
307 static void	carp_setrun(struct carp_softc *, sa_family_t);
308 static void	carp_master_down(void *);
309 static void	carp_master_down_locked(struct carp_softc *);
310 static void	carp_send_ad(void *);
311 static void	carp_send_ad_locked(struct carp_softc *);
312 static void	carp_addroute(struct carp_softc *);
313 static void	carp_ifa_addroute(struct ifaddr *);
314 static void	carp_delroute(struct carp_softc *);
315 static void	carp_ifa_delroute(struct ifaddr *);
316 static void	carp_send_ad_all(void *, int);
317 static void	carp_demote_adj(int, char *);
318 
/* Global list of all softcs and the mutex that protects walking it. */
319 static LIST_HEAD(, carp_softc) carp_list;
320 static struct mtx carp_mtx;
/* Task that runs carp_send_ad_all(); see the comment on that function. */
321 static struct task carp_sendall_task =
322     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
323 
324 static void
325 carp_hmac_prepare(struct carp_softc *sc)
326 {
327 	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
328 	uint8_t vhid = sc->sc_vhid & 0xff;
329 	struct ifaddr *ifa;
330 	int i, found;
331 #ifdef INET
332 	struct in_addr last, cur, in;
333 #endif
334 #ifdef INET6
335 	struct in6_addr last6, cur6, in6;
336 #endif
337 
338 	CARP_LOCK_ASSERT(sc);
339 
340 	/* Compute ipad from key. */
341 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
342 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
343 	for (i = 0; i < sizeof(sc->sc_pad); i++)
344 		sc->sc_pad[i] ^= 0x36;
345 
346 	/* Precompute first part of inner hash. */
347 	SHA1Init(&sc->sc_sha1);
348 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
349 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
350 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
351 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
352 #ifdef INET
353 	cur.s_addr = 0;
354 	do {
355 		found = 0;
356 		last = cur;
357 		cur.s_addr = 0xffffffff;
358 		CARP_FOREACH_IFA(sc, ifa) {
359 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
360 			if (ifa->ifa_addr->sa_family == AF_INET &&
361 			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
362 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
363 				cur.s_addr = in.s_addr;
364 				found++;
365 			}
366 		}
367 		if (found)
368 			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
369 	} while (found);
370 #endif /* INET */
371 #ifdef INET6
372 	memset(&cur6, 0, sizeof(cur6));
373 	do {
374 		found = 0;
375 		last6 = cur6;
376 		memset(&cur6, 0xff, sizeof(cur6));
377 		CARP_FOREACH_IFA(sc, ifa) {
378 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
379 			if (IN6_IS_SCOPE_EMBED(&in6))
380 				in6.s6_addr16[1] = 0;
381 			if (ifa->ifa_addr->sa_family == AF_INET6 &&
382 			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
383 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
384 				cur6 = in6;
385 				found++;
386 			}
387 		}
388 		if (found)
389 			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
390 	} while (found);
391 #endif /* INET6 */
392 
393 	/* convert ipad to opad */
394 	for (i = 0; i < sizeof(sc->sc_pad); i++)
395 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
396 }
397 
398 static void
399 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
400     unsigned char md[20])
401 {
402 	SHA1_CTX sha1ctx;
403 
404 	CARP_LOCK_ASSERT(sc);
405 
406 	/* fetch first half of inner hash */
407 	bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
408 
409 	SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
410 	SHA1Final(md, &sha1ctx);
411 
412 	/* outer hash */
413 	SHA1Init(&sha1ctx);
414 	SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
415 	SHA1Update(&sha1ctx, md, 20);
416 	SHA1Final(md, &sha1ctx);
417 }
418 
419 static int
420 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
421     unsigned char md[20])
422 {
423 	unsigned char md2[20];
424 
425 	CARP_LOCK_ASSERT(sc);
426 
427 	carp_hmac_generate(sc, counter, md2);
428 
429 	return (bcmp(md, md2, sizeof(md2)));
430 }
431 
432 /*
433  * process input packet.
434  * we have rearranged checks order compared to the rfc,
435  * but it seems more efficient this way or not possible otherwise.
436  */
437 #ifdef INET
438 void
439 carp_input(struct mbuf *m, int hlen)
440 {
441 	struct ip *ip = mtod(m, struct ip *);
442 	struct carp_header *ch;
443 	int iplen, len;
444 
445 	CARPSTATS_INC(carps_ipackets);
446 
447 	if (!V_carp_allow) {
448 		m_freem(m);
449 		return;
450 	}
451 
452 	/* verify that the IP TTL is 255.  */
453 	if (ip->ip_ttl != CARP_DFLTTL) {
454 		CARPSTATS_INC(carps_badttl);
455 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
456 		    ip->ip_ttl,
457 		    m->m_pkthdr.rcvif->if_xname);
458 		m_freem(m);
459 		return;
460 	}
461 
462 	iplen = ip->ip_hl << 2;
463 
464 	if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
465 		CARPSTATS_INC(carps_badlen);
466 		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
467 		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
468 		    m->m_pkthdr.rcvif->if_xname);
469 		m_freem(m);
470 		return;
471 	}
472 
473 	if (iplen + sizeof(*ch) < m->m_len) {
474 		if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
475 			CARPSTATS_INC(carps_hdrops);
476 			CARP_DEBUG("%s: pullup failed\n", __func__);
477 			return;
478 		}
479 		ip = mtod(m, struct ip *);
480 	}
481 	ch = (struct carp_header *)((char *)ip + iplen);
482 
483 	/*
484 	 * verify that the received packet length is
485 	 * equal to the CARP header
486 	 */
487 	len = iplen + sizeof(*ch);
488 	if (len > m->m_pkthdr.len) {
489 		CARPSTATS_INC(carps_badlen);
490 		CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
491 		    m->m_pkthdr.len,
492 		    m->m_pkthdr.rcvif->if_xname);
493 		m_freem(m);
494 		return;
495 	}
496 
497 	if ((m = m_pullup(m, len)) == NULL) {
498 		CARPSTATS_INC(carps_hdrops);
499 		return;
500 	}
501 	ip = mtod(m, struct ip *);
502 	ch = (struct carp_header *)((char *)ip + iplen);
503 
504 	/* verify the CARP checksum */
505 	m->m_data += iplen;
506 	if (in_cksum(m, len - iplen)) {
507 		CARPSTATS_INC(carps_badsum);
508 		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
509 		    m->m_pkthdr.rcvif->if_xname);
510 		m_freem(m);
511 		return;
512 	}
513 	m->m_data -= iplen;
514 
515 	carp_input_c(m, ch, AF_INET);
516 }
517 #endif
518 
519 #ifdef INET6
520 int
521 carp6_input(struct mbuf **mp, int *offp, int proto)
522 {
523 	struct mbuf *m = *mp;
524 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
525 	struct carp_header *ch;
526 	u_int len;
527 
528 	CARPSTATS_INC(carps_ipackets6);
529 
530 	if (!V_carp_allow) {
531 		m_freem(m);
532 		return (IPPROTO_DONE);
533 	}
534 
535 	/* check if received on a valid carp interface */
536 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
537 		CARPSTATS_INC(carps_badif);
538 		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
539 		    __func__, m->m_pkthdr.rcvif->if_xname);
540 		m_freem(m);
541 		return (IPPROTO_DONE);
542 	}
543 
544 	/* verify that the IP TTL is 255 */
545 	if (ip6->ip6_hlim != CARP_DFLTTL) {
546 		CARPSTATS_INC(carps_badttl);
547 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
548 		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
549 		m_freem(m);
550 		return (IPPROTO_DONE);
551 	}
552 
553 	/* verify that we have a complete carp packet */
554 	len = m->m_len;
555 	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
556 	if (ch == NULL) {
557 		CARPSTATS_INC(carps_badlen);
558 		CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
559 		return (IPPROTO_DONE);
560 	}
561 
562 
563 	/* verify the CARP checksum */
564 	m->m_data += *offp;
565 	if (in_cksum(m, sizeof(*ch))) {
566 		CARPSTATS_INC(carps_badsum);
567 		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
568 		    m->m_pkthdr.rcvif->if_xname);
569 		m_freem(m);
570 		return (IPPROTO_DONE);
571 	}
572 	m->m_data -= *offp;
573 
574 	carp_input_c(m, ch, AF_INET6);
575 	return (IPPROTO_DONE);
576 }
577 #endif /* INET6 */
578 
579 static void
580 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
581 {
582 	struct ifnet *ifp = m->m_pkthdr.rcvif;
583 	struct ifaddr *ifa;
584 	struct carp_softc *sc;
585 	uint64_t tmp_counter;
586 	struct timeval sc_tv, ch_tv;
587 
588 	/* verify that the VHID is valid on the receiving interface */
589 	IF_ADDR_RLOCK(ifp);
590 	IFNET_FOREACH_IFA(ifp, ifa)
591 		if (ifa->ifa_addr->sa_family == af &&
592 		    ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
593 			ifa_ref(ifa);
594 			break;
595 		}
596 	IF_ADDR_RUNLOCK(ifp);
597 
598 	if (ifa == NULL) {
599 		CARPSTATS_INC(carps_badvhid);
600 		m_freem(m);
601 		return;
602 	}
603 
604 	/* verify the CARP version. */
605 	if (ch->carp_version != CARP_VERSION) {
606 		CARPSTATS_INC(carps_badver);
607 		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
608 		    ch->carp_version);
609 		ifa_free(ifa);
610 		m_freem(m);
611 		return;
612 	}
613 
614 	sc = ifa->ifa_carp;
615 	CARP_LOCK(sc);
616 	ifa_free(ifa);
617 
618 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
619 		CARPSTATS_INC(carps_badauth);
620 		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
621 		    sc->sc_vhid, ifp->if_xname);
622 		goto out;
623 	}
624 
625 	tmp_counter = ntohl(ch->carp_counter[0]);
626 	tmp_counter = tmp_counter<<32;
627 	tmp_counter += ntohl(ch->carp_counter[1]);
628 
629 	/* XXX Replay protection goes here */
630 
631 	sc->sc_init_counter = 0;
632 	sc->sc_counter = tmp_counter;
633 
634 	sc_tv.tv_sec = sc->sc_advbase;
635 	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
636 	ch_tv.tv_sec = ch->carp_advbase;
637 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
638 
639 	switch (sc->sc_state) {
640 	case INIT:
641 		break;
642 	case MASTER:
643 		/*
644 		 * If we receive an advertisement from a master who's going to
645 		 * be more frequent than us, go into BACKUP state.
646 		 */
647 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
648 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
649 			callout_stop(&sc->sc_ad_tmo);
650 			CARP_LOG("VHID %u@%s: MASTER -> BACKUP "
651 			    "(more frequent advertisement received)\n",
652 			    sc->sc_vhid,
653 			    sc->sc_carpdev->if_xname);
654 			carp_set_state(sc, BACKUP);
655 			carp_setrun(sc, 0);
656 			carp_delroute(sc);
657 		}
658 		break;
659 	case BACKUP:
660 		/*
661 		 * If we're pre-empting masters who advertise slower than us,
662 		 * and this one claims to be slower, treat him as down.
663 		 */
664 		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
665 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
666 			    "(preempting a slower master)\n",
667 			    sc->sc_vhid,
668 			    sc->sc_carpdev->if_xname);
669 			carp_master_down_locked(sc);
670 			break;
671 		}
672 
673 		/*
674 		 *  If the master is going to advertise at such a low frequency
675 		 *  that he's guaranteed to time out, we'd might as well just
676 		 *  treat him as timed out now.
677 		 */
678 		sc_tv.tv_sec = sc->sc_advbase * 3;
679 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
680 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
681 			    "(master timed out)\n",
682 			    sc->sc_vhid,
683 			    sc->sc_carpdev->if_xname);
684 			carp_master_down_locked(sc);
685 			break;
686 		}
687 
688 		/*
689 		 * Otherwise, we reset the counter and wait for the next
690 		 * advertisement.
691 		 */
692 		carp_setrun(sc, af);
693 		break;
694 	}
695 
696 out:
697 	CARP_UNLOCK(sc);
698 	m_freem(m);
699 }
700 
701 static int
702 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
703 {
704 	struct m_tag *mtag;
705 
706 	if (sc->sc_init_counter) {
707 		/* this could also be seconds since unix epoch */
708 		sc->sc_counter = arc4random();
709 		sc->sc_counter = sc->sc_counter << 32;
710 		sc->sc_counter += arc4random();
711 	} else
712 		sc->sc_counter++;
713 
714 	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
715 	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
716 
717 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
718 
719 	/* Tag packet for carp_output */
720 	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
721 	    M_NOWAIT)) == NULL) {
722 		m_freem(m);
723 		CARPSTATS_INC(carps_onomem);
724 		return (ENOMEM);
725 	}
726 	bcopy(&sc, mtag + 1, sizeof(sc));
727 	m_tag_prepend(m, mtag);
728 
729 	return (0);
730 }
731 
732 /*
733  * To avoid LORs and possible recursions this function shouldn't
734  * be called directly, but scheduled via taskqueue.
735  */
736 static void
737 carp_send_ad_all(void *ctx __unused, int pending __unused)
738 {
739 	struct carp_softc *sc;
740 
741 	mtx_lock(&carp_mtx);
742 	LIST_FOREACH(sc, &carp_list, sc_next)
743 		if (sc->sc_state == MASTER) {
744 			CARP_LOCK(sc);
745 			CURVNET_SET(sc->sc_carpdev->if_vnet);
746 			carp_send_ad_locked(sc);
747 			CURVNET_RESTORE();
748 			CARP_UNLOCK(sc);
749 		}
750 	mtx_unlock(&carp_mtx);
751 }
752 
753 /* Send a periodic advertisement, executed in callout context. */
754 static void
755 carp_send_ad(void *v)
756 {
757 	struct carp_softc *sc = v;
758 
759 	CARP_LOCK_ASSERT(sc);
760 	CURVNET_SET(sc->sc_carpdev->if_vnet);
761 	carp_send_ad_locked(sc);
762 	CURVNET_RESTORE();
763 	CARP_UNLOCK(sc);
764 }
765 
766 static void
767 carp_send_ad_error(struct carp_softc *sc, int error)
768 {
769 
770 	if (error) {
771 		if (sc->sc_sendad_errors < INT_MAX)
772 			sc->sc_sendad_errors++;
773 		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
774 			static const char fmt[] = "send error %d on %s";
775 			char msg[sizeof(fmt) + IFNAMSIZ];
776 
777 			sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
778 			carp_demote_adj(V_carp_senderr_adj, msg);
779 		}
780 		sc->sc_sendad_success = 0;
781 	} else {
782 		if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
783 		    ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
784 			static const char fmt[] = "send ok on %s";
785 			char msg[sizeof(fmt) + IFNAMSIZ];
786 
787 			sprintf(msg, fmt, sc->sc_carpdev->if_xname);
788 			carp_demote_adj(-V_carp_senderr_adj, msg);
789 			sc->sc_sendad_errors = 0;
790 		} else
791 			sc->sc_sendad_errors = 0;
792 	}
793 }
794 
/*
 * Build and transmit one advertisement per configured address family, then
 * re-arm the advertisement callout.
 *
 * A template CARP header is filled in once on the stack and copied behind a
 * freshly built IPv4 and/or IPv6 header.  For each packet
 * carp_prepare_ad() adds counter, HMAC and the softc mtag, the CARP
 * checksum is computed, and the result of ip_output()/ip6_output() is fed
 * to carp_send_ad_error().
 *
 * Any failure while building a packet jumps to "resched"; note that a
 * failure in the IPv4 part therefore also skips the IPv6 advertisement for
 * this round.  The callout is re-armed on every path.
 *
 * The softc lock must be held.
 */
795 static void
796 carp_send_ad_locked(struct carp_softc *sc)
797 {
798 	struct carp_header ch;
799 	struct timeval tv;
800 	struct sockaddr sa;
801 	struct ifaddr *ifa;
802 	struct carp_header *ch_ptr;
803 	struct mbuf *m;
804 	int len, advskew;
805 
806 	CARP_LOCK_ASSERT(sc);
807 
	/* Next ad is due in advbase seconds plus advskew/256 of a second. */
808 	advskew = DEMOTE_ADVSKEW(sc);
809 	tv.tv_sec = sc->sc_advbase;
810 	tv.tv_usec = advskew * 1000000 / 256;
811 
	/* Header template shared by the IPv4 and the IPv6 packet. */
812 	ch.carp_version = CARP_VERSION;
813 	ch.carp_type = CARP_ADVERTISEMENT;
814 	ch.carp_vhid = sc->sc_vhid;
815 	ch.carp_advbase = sc->sc_advbase;
816 	ch.carp_advskew = advskew;
817 	ch.carp_authlen = 7;	/* XXX DEFINE */
818 	ch.carp_pad1 = 0;	/* must be zero */
	/* Must be zero while the checksum is being computed below. */
819 	ch.carp_cksum = 0;
820 
821 	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
822 
823 #ifdef INET
824 	if (sc->sc_naddrs) {
825 		struct ip *ip;
826 
827 		m = m_gethdr(M_NOWAIT, MT_DATA);
828 		if (m == NULL) {
829 			CARPSTATS_INC(carps_onomem);
830 			goto resched;
831 		}
		/* The packet is exactly one IP header plus one CARP header. */
832 		len = sizeof(*ip) + sizeof(ch);
833 		m->m_pkthdr.len = len;
834 		m->m_pkthdr.rcvif = NULL;
835 		m->m_len = len;
836 		MH_ALIGN(m, m->m_len);
837 		m->m_flags |= M_MCAST;
838 		ip = mtod(m, struct ip *);
839 		ip->ip_v = IPVERSION;
840 		ip->ip_hl = sizeof(*ip) >> 2;
841 		ip->ip_tos = IPTOS_LOWDELAY;
842 		ip->ip_len = htons(len);
843 		ip->ip_id = ip_newid();
844 		ip->ip_off = htons(IP_DF);
845 		ip->ip_ttl = CARP_DFLTTL;
846 		ip->ip_p = IPPROTO_CARP;
847 		ip->ip_sum = 0;
848 
		/*
		 * Source address: whatever ifaof_ifpforaddr() returns for an
		 * all-zero AF_INET sockaddr on the parent interface, or
		 * 0.0.0.0 if it returns nothing.
		 */
849 		bzero(&sa, sizeof(sa));
850 		sa.sa_family = AF_INET;
851 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
852 		if (ifa != NULL) {
853 			ip->ip_src.s_addr =
854 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
855 			ifa_free(ifa);
856 		} else
857 			ip->ip_src.s_addr = 0;
858 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
859 
860 		ch_ptr = (struct carp_header *)(&ip[1]);
861 		bcopy(&ch, ch_ptr, sizeof(ch));
		/* On failure carp_prepare_ad() has already freed the mbuf. */
862 		if (carp_prepare_ad(m, sc, ch_ptr))
863 			goto resched;
864 
		/* Checksum covers the CARP header only: skip the IP header. */
865 		m->m_data += sizeof(*ip);
866 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
867 		m->m_data -= sizeof(*ip);
868 
869 		CARPSTATS_INC(carps_opackets);
870 
871 		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
872 		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
873 	}
874 #endif /* INET */
875 #ifdef INET6
876 	if (sc->sc_naddrs6) {
877 		struct ip6_hdr *ip6;
878 
879 		m = m_gethdr(M_NOWAIT, MT_DATA);
880 		if (m == NULL) {
881 			CARPSTATS_INC(carps_onomem);
882 			goto resched;
883 		}
884 		len = sizeof(*ip6) + sizeof(ch);
885 		m->m_pkthdr.len = len;
886 		m->m_pkthdr.rcvif = NULL;
887 		m->m_len = len;
888 		MH_ALIGN(m, m->m_len);
889 		m->m_flags |= M_MCAST;
890 		ip6 = mtod(m, struct ip6_hdr *);
		/* Start from an all-zero header, then set the needed fields. */
891 		bzero(ip6, sizeof(*ip6));
892 		ip6->ip6_vfc |= IPV6_VERSION;
893 		ip6->ip6_hlim = CARP_DFLTTL;
894 		ip6->ip6_nxt = IPPROTO_CARP;
895 		bzero(&sa, sizeof(sa));
896 
897 		/* set the source address */
898 		sa.sa_family = AF_INET6;
899 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
900 		if (ifa != NULL) {
901 			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
902 			    sizeof(struct in6_addr));
903 			ifa_free(ifa);
904 		} else
905 			/* This should never happen with IPv6. */
906 			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
907 
908 		/* Set the multicast destination. */
		/* ff02::12, given the parent interface's scope just below. */
909 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
910 		ip6->ip6_dst.s6_addr8[15] = 0x12;
911 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
912 			m_freem(m);
913 			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
914 			goto resched;
915 		}
916 
917 		ch_ptr = (struct carp_header *)(&ip6[1]);
918 		bcopy(&ch, ch_ptr, sizeof(ch));
		/* On failure carp_prepare_ad() has already freed the mbuf. */
919 		if (carp_prepare_ad(m, sc, ch_ptr))
920 			goto resched;
921 
		/* Checksum covers the CARP header only: skip the IPv6 header. */
922 		m->m_data += sizeof(*ip6);
923 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
924 		m->m_data -= sizeof(*ip6);
925 
926 		CARPSTATS_INC(carps_opackets6);
927 
928 		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
929 		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
930 	}
931 #endif /* INET6 */
932 
	/* Re-arm the advertisement callout whether or not anything was sent. */
933 resched:
934 	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
935 }
936 
937 static void
938 carp_addroute(struct carp_softc *sc)
939 {
940 	struct ifaddr *ifa;
941 
942 	CARP_FOREACH_IFA(sc, ifa)
943 		carp_ifa_addroute(ifa);
944 }
945 
/*
 * Install the routes for a single CARP address.
 *
 * IPv4: add the prefix route, then the loopback route for the address.
 * IPv6: add the loopback route, then the neighbor (lle) entry.
 * Addresses of any other family are ignored.
 */
static void
carp_ifa_addroute(struct ifaddr *ifa)
{

#ifdef INET
	if (ifa->ifa_addr->sa_family == AF_INET) {
		struct in_ifaddr *ia = ifatoia(ifa);

		in_addprefix(ia, RTF_UP);
		ifa_add_loopback_route(ifa, (struct sockaddr *)&ia->ia_addr);
		return;
	}
#endif
#ifdef INET6
	if (ifa->ifa_addr->sa_family == AF_INET6) {
		struct in6_ifaddr *ia6 = ifatoia6(ifa);

		ifa_add_loopback_route(ifa, (struct sockaddr *)&ia6->ia_addr);
		nd6_add_ifa_lle(ia6);
		return;
	}
#endif
}
967 
968 static void
969 carp_delroute(struct carp_softc *sc)
970 {
971 	struct ifaddr *ifa;
972 
973 	CARP_FOREACH_IFA(sc, ifa)
974 		carp_ifa_delroute(ifa);
975 }
976 
/*
 * Remove the routes for a single CARP address; the counterpart of
 * carp_ifa_addroute().
 *
 * IPv4: delete the loopback route, then scrub the prefix route.
 * IPv6: delete the loopback route, then remove the neighbor (lle) entry.
 * Addresses of any other family are ignored.
 */
static void
carp_ifa_delroute(struct ifaddr *ifa)
{

#ifdef INET
	if (ifa->ifa_addr->sa_family == AF_INET) {
		struct in_ifaddr *ia = ifatoia(ifa);

		ifa_del_loopback_route(ifa, (struct sockaddr *)&ia->ia_addr);
		in_scrubprefix(ia, LLE_STATIC);
		return;
	}
#endif
#ifdef INET6
	if (ifa->ifa_addr->sa_family == AF_INET6) {
		struct in6_ifaddr *ia6 = ifatoia6(ifa);

		ifa_del_loopback_route(ifa, (struct sockaddr *)&ia6->ia_addr);
		nd6_rem_ifa_lle(ia6);
		return;
	}
#endif
}
998 
999 int
1000 carp_master(struct ifaddr *ifa)
1001 {
1002 	struct carp_softc *sc = ifa->ifa_carp;
1003 
1004 	return (sc->sc_state == MASTER);
1005 }
1006 
1007 #ifdef INET
1008 /*
1009  * Broadcast a gratuitous ARP request containing
1010  * the virtual router MAC address for each IP address
1011  * associated with the virtual router.
1012  */
1013 static void
1014 carp_send_arp(struct carp_softc *sc)
1015 {
1016 	struct ifaddr *ifa;
1017 
1018 	CARP_FOREACH_IFA(sc, ifa)
1019 		if (ifa->ifa_addr->sa_family == AF_INET)
1020 			arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr));
1021 }
1022 
1023 int
1024 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
1025 {
1026 	struct carp_softc *sc = ifa->ifa_carp;
1027 
1028 	if (sc->sc_state == MASTER) {
1029 		*enaddr = LLADDR(&sc->sc_addr);
1030 		return (1);
1031 	}
1032 
1033 	return (0);
1034 }
1035 #endif
1036 
1037 #ifdef INET6
1038 static void
1039 carp_send_na(struct carp_softc *sc)
1040 {
1041 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1042 	struct ifaddr *ifa;
1043 	struct in6_addr *in6;
1044 
1045 	CARP_FOREACH_IFA(sc, ifa) {
1046 		if (ifa->ifa_addr->sa_family != AF_INET6)
1047 			continue;
1048 
1049 		in6 = IFA_IN6(ifa);
1050 		nd6_na_output(sc->sc_carpdev, &mcast, in6,
1051 		    ND_NA_FLAG_OVERRIDE, 1, NULL);
1052 		DELAY(1000);	/* XXX */
1053 	}
1054 }
1055 
1056 /*
1057  * Returns ifa in case it's a carp address and it is MASTER, or if the address
1058  * matches and is not a carp address.  Returns NULL otherwise.
1059  */
1060 struct ifaddr *
1061 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
1062 {
1063 	struct ifaddr *ifa;
1064 
1065 	ifa = NULL;
1066 	IF_ADDR_RLOCK(ifp);
1067 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1068 		if (ifa->ifa_addr->sa_family != AF_INET6)
1069 			continue;
1070 		if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
1071 			continue;
1072 		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
1073 			ifa = NULL;
1074 		else
1075 			ifa_ref(ifa);
1076 		break;
1077 	}
1078 	IF_ADDR_RUNLOCK(ifp);
1079 
1080 	return (ifa);
1081 }
1082 
1083 caddr_t
1084 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
1085 {
1086 	struct ifaddr *ifa;
1087 
1088 	IF_ADDR_RLOCK(ifp);
1089 	IFNET_FOREACH_IFA(ifp, ifa)
1090 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
1091 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
1092 			struct carp_softc *sc = ifa->ifa_carp;
1093 			struct m_tag *mtag;
1094 
1095 			IF_ADDR_RUNLOCK(ifp);
1096 
1097 			mtag = m_tag_get(PACKET_TAG_CARP,
1098 			    sizeof(struct carp_softc *), M_NOWAIT);
1099 			if (mtag == NULL)
1100 				/* Better a bit than nothing. */
1101 				return (LLADDR(&sc->sc_addr));
1102 
1103 			bcopy(&sc, mtag + 1, sizeof(sc));
1104 			m_tag_prepend(m, mtag);
1105 
1106 			return (LLADDR(&sc->sc_addr));
1107 		}
1108 	IF_ADDR_RUNLOCK(ifp);
1109 
1110 	return (NULL);
1111 }
1112 #endif /* INET6 */
1113 
1114 int
1115 carp_forus(struct ifnet *ifp, u_char *dhost)
1116 {
1117 	struct carp_softc *sc;
1118 	uint8_t *ena = dhost;
1119 
1120 	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
1121 		return (0);
1122 
1123 	CIF_LOCK(ifp->if_carp);
1124 	IFNET_FOREACH_CARP(ifp, sc) {
1125 		CARP_LOCK(sc);
1126 		if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
1127 		    ETHER_ADDR_LEN)) {
1128 			CARP_UNLOCK(sc);
1129 			CIF_UNLOCK(ifp->if_carp);
1130 			return (1);
1131 		}
1132 		CARP_UNLOCK(sc);
1133 	}
1134 	CIF_UNLOCK(ifp->if_carp);
1135 
1136 	return (0);
1137 }
1138 
1139 /* Master down timeout event, executed in callout context. */
1140 static void
1141 carp_master_down(void *v)
1142 {
1143 	struct carp_softc *sc = v;
1144 
1145 	CARP_LOCK_ASSERT(sc);
1146 
1147 	CURVNET_SET(sc->sc_carpdev->if_vnet);
1148 	if (sc->sc_state == BACKUP) {
1149 		CARP_LOG("VHID %u@%s: BACKUP -> MASTER (master down)\n",
1150 		    sc->sc_vhid,
1151 		    sc->sc_carpdev->if_xname);
1152 		carp_master_down_locked(sc);
1153 	}
1154 	CURVNET_RESTORE();
1155 
1156 	CARP_UNLOCK(sc);
1157 }
1158 
1159 static void
1160 carp_master_down_locked(struct carp_softc *sc)
1161 {
1162 
1163 	CARP_LOCK_ASSERT(sc);
1164 
1165 	switch (sc->sc_state) {
1166 	case BACKUP:
1167 		carp_set_state(sc, MASTER);
1168 		carp_send_ad_locked(sc);
1169 #ifdef INET
1170 		carp_send_arp(sc);
1171 #endif
1172 #ifdef INET6
1173 		carp_send_na(sc);
1174 #endif
1175 		carp_setrun(sc, 0);
1176 		carp_addroute(sc);
1177 		break;
1178 	case INIT:
1179 	case MASTER:
1180 #ifdef INVARIANTS
1181 		panic("carp: VHID %u@%s: master_down event in %s state\n",
1182 		    sc->sc_vhid,
1183 		    sc->sc_carpdev->if_xname,
1184 		    sc->sc_state ? "MASTER" : "INIT");
1185 #endif
1186 		break;
1187 	}
1188 }
1189 
1190 /*
1191  * When in backup state, af indicates whether to reset the master down timer
1192  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1193  */
1194 static void
1195 carp_setrun(struct carp_softc *sc, sa_family_t af)
1196 {
1197 	struct timeval tv;
1198 
1199 	CARP_LOCK_ASSERT(sc);
1200 
1201 	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
1202 	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
1203 	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
1204 		return;
1205 
1206 	switch (sc->sc_state) {
1207 	case INIT:
1208 		CARP_LOG("VHID %u@%s: INIT -> BACKUP\n",
1209 		    sc->sc_vhid,
1210 		    sc->sc_carpdev->if_xname);
1211 		carp_set_state(sc, BACKUP);
1212 		carp_setrun(sc, 0);
1213 		break;
1214 	case BACKUP:
1215 		callout_stop(&sc->sc_ad_tmo);
1216 		tv.tv_sec = 3 * sc->sc_advbase;
1217 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1218 		switch (af) {
1219 #ifdef INET
1220 		case AF_INET:
1221 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1222 			    carp_master_down, sc);
1223 			break;
1224 #endif
1225 #ifdef INET6
1226 		case AF_INET6:
1227 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1228 			    carp_master_down, sc);
1229 			break;
1230 #endif
1231 		default:
1232 #ifdef INET
1233 			if (sc->sc_naddrs)
1234 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1235 				    carp_master_down, sc);
1236 #endif
1237 #ifdef INET6
1238 			if (sc->sc_naddrs6)
1239 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1240 				    carp_master_down, sc);
1241 #endif
1242 			break;
1243 		}
1244 		break;
1245 	case MASTER:
1246 		tv.tv_sec = sc->sc_advbase;
1247 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1248 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1249 		    carp_send_ad, sc);
1250 		break;
1251 	}
1252 }
1253 
1254 /*
1255  * Setup multicast structures.
1256  */
1257 static int
1258 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
1259 {
1260 	struct ifnet *ifp = cif->cif_ifp;
1261 	int error = 0;
1262 
1263 	CIF_LOCK_ASSERT(cif);
1264 
1265 	switch (sa) {
1266 #ifdef INET
1267 	case AF_INET:
1268 	    {
1269 		struct ip_moptions *imo = &cif->cif_imo;
1270 		struct in_addr addr;
1271 
1272 		if (imo->imo_membership)
1273 			return (0);
1274 
1275 		imo->imo_membership = (struct in_multi **)malloc(
1276 		    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
1277 		    M_NOWAIT);
1278 		if (imo->imo_membership == NULL)
1279 			return (ENOMEM);
1280 		imo->imo_mfilters = NULL;
1281 		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1282 		imo->imo_multicast_vif = -1;
1283 
1284 		addr.s_addr = htonl(INADDR_CARP_GROUP);
1285 		if ((error = in_joingroup(ifp, &addr, NULL,
1286 		    &imo->imo_membership[0])) != 0) {
1287 			free(imo->imo_membership, M_CARP);
1288 			break;
1289 		}
1290 		imo->imo_num_memberships++;
1291 		imo->imo_multicast_ifp = ifp;
1292 		imo->imo_multicast_ttl = CARP_DFLTTL;
1293 		imo->imo_multicast_loop = 0;
1294 		break;
1295 	   }
1296 #endif
1297 #ifdef INET6
1298 	case AF_INET6:
1299 	    {
1300 		struct ip6_moptions *im6o = &cif->cif_im6o;
1301 		struct in6_addr in6;
1302 		struct in6_multi *in6m;
1303 
1304 		if (im6o->im6o_membership)
1305 			return (0);
1306 
1307 		im6o->im6o_membership = (struct in6_multi **)malloc(
1308 		    (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
1309 		    M_ZERO | M_NOWAIT);
1310 		if (im6o->im6o_membership == NULL)
1311 			return (ENOMEM);
1312 		im6o->im6o_mfilters = NULL;
1313 		im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
1314 		im6o->im6o_multicast_hlim = CARP_DFLTTL;
1315 		im6o->im6o_multicast_ifp = ifp;
1316 
1317 		/* Join IPv6 CARP multicast group. */
1318 		bzero(&in6, sizeof(in6));
1319 		in6.s6_addr16[0] = htons(0xff02);
1320 		in6.s6_addr8[15] = 0x12;
1321 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1322 			free(im6o->im6o_membership, M_CARP);
1323 			break;
1324 		}
1325 		in6m = NULL;
1326 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1327 			free(im6o->im6o_membership, M_CARP);
1328 			break;
1329 		}
1330 		im6o->im6o_membership[0] = in6m;
1331 		im6o->im6o_num_memberships++;
1332 
1333 		/* Join solicited multicast address. */
1334 		bzero(&in6, sizeof(in6));
1335 		in6.s6_addr16[0] = htons(0xff02);
1336 		in6.s6_addr32[1] = 0;
1337 		in6.s6_addr32[2] = htonl(1);
1338 		in6.s6_addr32[3] = 0;
1339 		in6.s6_addr8[12] = 0xff;
1340 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1341 			in6_mc_leave(im6o->im6o_membership[0], NULL);
1342 			free(im6o->im6o_membership, M_CARP);
1343 			break;
1344 		}
1345 		in6m = NULL;
1346 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1347 			in6_mc_leave(im6o->im6o_membership[0], NULL);
1348 			free(im6o->im6o_membership, M_CARP);
1349 			break;
1350 		}
1351 		im6o->im6o_membership[1] = in6m;
1352 		im6o->im6o_num_memberships++;
1353 		break;
1354 	    }
1355 #endif
1356 	}
1357 
1358 	return (error);
1359 }
1360 
1361 /*
1362  * Free multicast structures.
1363  */
1364 static void
1365 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
1366 {
1367 
1368 	CIF_LOCK_ASSERT(cif);
1369 	switch (sa) {
1370 #ifdef INET
1371 	case AF_INET:
1372 		if (cif->cif_naddrs == 0) {
1373 			struct ip_moptions *imo = &cif->cif_imo;
1374 
1375 			in_leavegroup(imo->imo_membership[0], NULL);
1376 			KASSERT(imo->imo_mfilters == NULL,
1377 			    ("%s: imo_mfilters != NULL", __func__));
1378 			free(imo->imo_membership, M_CARP);
1379 			imo->imo_membership = NULL;
1380 
1381 		}
1382 		break;
1383 #endif
1384 #ifdef INET6
1385 	case AF_INET6:
1386 		if (cif->cif_naddrs6 == 0) {
1387 			struct ip6_moptions *im6o = &cif->cif_im6o;
1388 
1389 			in6_mc_leave(im6o->im6o_membership[0], NULL);
1390 			in6_mc_leave(im6o->im6o_membership[1], NULL);
1391 			KASSERT(im6o->im6o_mfilters == NULL,
1392 			    ("%s: im6o_mfilters != NULL", __func__));
1393 			free(im6o->im6o_membership, M_CARP);
1394 			im6o->im6o_membership = NULL;
1395 		}
1396 		break;
1397 #endif
1398 	}
1399 }
1400 
1401 int
1402 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
1403 {
1404 	struct m_tag *mtag;
1405 	struct carp_softc *sc;
1406 
1407 	if (!sa)
1408 		return (0);
1409 
1410 	switch (sa->sa_family) {
1411 #ifdef INET
1412 	case AF_INET:
1413 		break;
1414 #endif
1415 #ifdef INET6
1416 	case AF_INET6:
1417 		break;
1418 #endif
1419 	default:
1420 		return (0);
1421 	}
1422 
1423 	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
1424 	if (mtag == NULL)
1425 		return (0);
1426 
1427 	bcopy(mtag + 1, &sc, sizeof(sc));
1428 
1429 	/* Set the source MAC address to the Virtual Router MAC Address. */
1430 	switch (ifp->if_type) {
1431 	case IFT_ETHER:
1432 	case IFT_BRIDGE:
1433 	case IFT_L2VLAN: {
1434 			struct ether_header *eh;
1435 
1436 			eh = mtod(m, struct ether_header *);
1437 			eh->ether_shost[0] = 0;
1438 			eh->ether_shost[1] = 0;
1439 			eh->ether_shost[2] = 0x5e;
1440 			eh->ether_shost[3] = 0;
1441 			eh->ether_shost[4] = 1;
1442 			eh->ether_shost[5] = sc->sc_vhid;
1443 		}
1444 		break;
1445 	case IFT_FDDI: {
1446 			struct fddi_header *fh;
1447 
1448 			fh = mtod(m, struct fddi_header *);
1449 			fh->fddi_shost[0] = 0;
1450 			fh->fddi_shost[1] = 0;
1451 			fh->fddi_shost[2] = 0x5e;
1452 			fh->fddi_shost[3] = 0;
1453 			fh->fddi_shost[4] = 1;
1454 			fh->fddi_shost[5] = sc->sc_vhid;
1455 		}
1456 		break;
1457 	case IFT_ISO88025: {
1458  			struct iso88025_header *th;
1459  			th = mtod(m, struct iso88025_header *);
1460 			th->iso88025_shost[0] = 3;
1461 			th->iso88025_shost[1] = 0;
1462 			th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
1463 			th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
1464 			th->iso88025_shost[4] = 0;
1465 			th->iso88025_shost[5] = 0;
1466 		}
1467 		break;
1468 	default:
1469 		printf("%s: carp is not supported for the %d interface type\n",
1470 		    ifp->if_xname, ifp->if_type);
1471 		return (EOPNOTSUPP);
1472 	}
1473 
1474 	return (0);
1475 }
1476 
1477 static struct carp_softc*
1478 carp_alloc(struct ifnet *ifp)
1479 {
1480 	struct carp_softc *sc;
1481 	struct carp_if *cif;
1482 
1483 	if ((cif = ifp->if_carp) == NULL)
1484 		cif = carp_alloc_if(ifp);
1485 
1486 	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
1487 
1488 	sc->sc_advbase = CARP_DFLTINTV;
1489 	sc->sc_vhid = -1;	/* required setting */
1490 	sc->sc_init_counter = 1;
1491 	sc->sc_state = INIT;
1492 
1493 	sc->sc_ifasiz = sizeof(struct ifaddr *);
1494 	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
1495 	sc->sc_carpdev = ifp;
1496 
1497 	CARP_LOCK_INIT(sc);
1498 #ifdef INET
1499 	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1500 #endif
1501 #ifdef INET6
1502 	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1503 #endif
1504 	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1505 
1506 	CIF_LOCK(cif);
1507 	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
1508 	CIF_UNLOCK(cif);
1509 
1510 	mtx_lock(&carp_mtx);
1511 	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
1512 	mtx_unlock(&carp_mtx);
1513 
1514 	return (sc);
1515 }
1516 
1517 static int
1518 carp_grow_ifas(struct carp_softc *sc)
1519 {
1520 	struct ifaddr **new;
1521 
1522 	CARP_LOCK_ASSERT(sc);
1523 
1524 	new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO);
1525 	if (new == NULL)
1526 		return (ENOMEM);
1527 	bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
1528 	free(sc->sc_ifas, M_CARP);
1529 	sc->sc_ifas = new;
1530 	sc->sc_ifasiz *= 2;
1531 
1532 	return (0);
1533 }
1534 
1535 static void
1536 carp_destroy(struct carp_softc *sc)
1537 {
1538 	struct ifnet *ifp = sc->sc_carpdev;
1539 	struct carp_if *cif = ifp->if_carp;
1540 
1541 	CIF_LOCK_ASSERT(cif);
1542 
1543 	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
1544 
1545 	mtx_lock(&carp_mtx);
1546 	LIST_REMOVE(sc, sc_next);
1547 	mtx_unlock(&carp_mtx);
1548 
1549 	CARP_LOCK(sc);
1550 	if (sc->sc_suppress)
1551 		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
1552 	callout_drain(&sc->sc_ad_tmo);
1553 #ifdef INET
1554 	callout_drain(&sc->sc_md_tmo);
1555 #endif
1556 #ifdef INET6
1557 	callout_drain(&sc->sc_md6_tmo);
1558 #endif
1559 	CARP_LOCK_DESTROY(sc);
1560 
1561 	free(sc->sc_ifas, M_CARP);
1562 	free(sc, M_CARP);
1563 }
1564 
1565 static struct carp_if*
1566 carp_alloc_if(struct ifnet *ifp)
1567 {
1568 	struct carp_if *cif;
1569 	int error;
1570 
1571 	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
1572 
1573 	if ((error = ifpromisc(ifp, 1)) != 0)
1574 		printf("%s: ifpromisc(%s) failed: %d\n",
1575 		    __func__, ifp->if_xname, error);
1576 	else
1577 		cif->cif_flags |= CIF_PROMISC;
1578 
1579 	CIF_LOCK_INIT(cif);
1580 	cif->cif_ifp = ifp;
1581 	TAILQ_INIT(&cif->cif_vrs);
1582 
1583 	IF_ADDR_WLOCK(ifp);
1584 	ifp->if_carp = cif;
1585 	if_ref(ifp);
1586 	IF_ADDR_WUNLOCK(ifp);
1587 
1588 	return (cif);
1589 }
1590 
1591 static void
1592 carp_free_if(struct carp_if *cif)
1593 {
1594 	struct ifnet *ifp = cif->cif_ifp;
1595 
1596 	CIF_LOCK_ASSERT(cif);
1597 	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
1598 	    __func__));
1599 
1600 	IF_ADDR_WLOCK(ifp);
1601 	ifp->if_carp = NULL;
1602 	IF_ADDR_WUNLOCK(ifp);
1603 
1604 	CIF_LOCK_DESTROY(cif);
1605 
1606 	if (cif->cif_flags & CIF_PROMISC)
1607 		ifpromisc(ifp, 0);
1608 	if_rele(ifp);
1609 
1610 	free(cif, M_CARP);
1611 }
1612 
1613 static void
1614 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
1615 {
1616 
1617 	CARP_LOCK(sc);
1618 	carpr->carpr_state = sc->sc_state;
1619 	carpr->carpr_vhid = sc->sc_vhid;
1620 	carpr->carpr_advbase = sc->sc_advbase;
1621 	carpr->carpr_advskew = sc->sc_advskew;
1622 	if (priv)
1623 		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
1624 	else
1625 		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
1626 	CARP_UNLOCK(sc);
1627 }
1628 
1629 int
1630 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
1631 {
1632 	struct carpreq carpr;
1633 	struct ifnet *ifp;
1634 	struct carp_softc *sc = NULL;
1635 	int error = 0, locked = 0;
1636 
1637 	if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
1638 		return (error);
1639 
1640 	ifp = ifunit_ref(ifr->ifr_name);
1641 	if (ifp == NULL)
1642 		return (ENXIO);
1643 
1644 	switch (ifp->if_type) {
1645 	case IFT_ETHER:
1646 	case IFT_L2VLAN:
1647 	case IFT_BRIDGE:
1648 	case IFT_FDDI:
1649 	case IFT_ISO88025:
1650 		break;
1651 	default:
1652 		error = EOPNOTSUPP;
1653 		goto out;
1654 	}
1655 
1656 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1657 		error = EADDRNOTAVAIL;
1658 		goto out;
1659 	}
1660 
1661 	switch (cmd) {
1662 	case SIOCSVH:
1663 		if ((error = priv_check(td, PRIV_NETINET_CARP)))
1664 			break;
1665 		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
1666 		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
1667 			error = EINVAL;
1668 			break;
1669 		}
1670 
1671 		if (ifp->if_carp) {
1672 			CIF_LOCK(ifp->if_carp);
1673 			IFNET_FOREACH_CARP(ifp, sc)
1674 				if (sc->sc_vhid == carpr.carpr_vhid)
1675 					break;
1676 			CIF_UNLOCK(ifp->if_carp);
1677 		}
1678 		if (sc == NULL) {
1679 			sc = carp_alloc(ifp);
1680 			CARP_LOCK(sc);
1681 			sc->sc_vhid = carpr.carpr_vhid;
1682 			LLADDR(&sc->sc_addr)[0] = 0;
1683 			LLADDR(&sc->sc_addr)[1] = 0;
1684 			LLADDR(&sc->sc_addr)[2] = 0x5e;
1685 			LLADDR(&sc->sc_addr)[3] = 0;
1686 			LLADDR(&sc->sc_addr)[4] = 1;
1687 			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
1688 		} else
1689 			CARP_LOCK(sc);
1690 		locked = 1;
1691 		if (carpr.carpr_advbase > 0) {
1692 			if (carpr.carpr_advbase > 255 ||
1693 			    carpr.carpr_advbase < CARP_DFLTINTV) {
1694 				error = EINVAL;
1695 				break;
1696 			}
1697 			sc->sc_advbase = carpr.carpr_advbase;
1698 		}
1699 		if (carpr.carpr_advskew > 0) {
1700 			if (carpr.carpr_advskew >= 255) {
1701 				error = EINVAL;
1702 				break;
1703 			}
1704 			sc->sc_advskew = carpr.carpr_advskew;
1705 		}
1706 		if (carpr.carpr_key[0] != '\0') {
1707 			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
1708 			carp_hmac_prepare(sc);
1709 		}
1710 		if (sc->sc_state != INIT &&
1711 		    carpr.carpr_state != sc->sc_state) {
1712 			switch (carpr.carpr_state) {
1713 			case BACKUP:
1714 				callout_stop(&sc->sc_ad_tmo);
1715 				carp_set_state(sc, BACKUP);
1716 				carp_setrun(sc, 0);
1717 				carp_delroute(sc);
1718 				break;
1719 			case MASTER:
1720 				carp_master_down_locked(sc);
1721 				break;
1722 			default:
1723 				break;
1724 			}
1725 		}
1726 		break;
1727 
1728 	case SIOCGVH:
1729 	    {
1730 		int priveleged;
1731 
1732 		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
1733 			error = EINVAL;
1734 			break;
1735 		}
1736 		if (carpr.carpr_count < 1) {
1737 			error = EMSGSIZE;
1738 			break;
1739 		}
1740 		if (ifp->if_carp == NULL) {
1741 			error = ENOENT;
1742 			break;
1743 		}
1744 
1745 		priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
1746 		if (carpr.carpr_vhid != 0) {
1747 			CIF_LOCK(ifp->if_carp);
1748 			IFNET_FOREACH_CARP(ifp, sc)
1749 				if (sc->sc_vhid == carpr.carpr_vhid)
1750 					break;
1751 			CIF_UNLOCK(ifp->if_carp);
1752 			if (sc == NULL) {
1753 				error = ENOENT;
1754 				break;
1755 			}
1756 			carp_carprcp(&carpr, sc, priveleged);
1757 			error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
1758 		} else  {
1759 			int i, count;
1760 
1761 			count = 0;
1762 			CIF_LOCK(ifp->if_carp);
1763 			IFNET_FOREACH_CARP(ifp, sc)
1764 				count++;
1765 
1766 			if (count > carpr.carpr_count) {
1767 				CIF_UNLOCK(ifp->if_carp);
1768 				error = EMSGSIZE;
1769 				break;
1770 			}
1771 
1772 			i = 0;
1773 			IFNET_FOREACH_CARP(ifp, sc) {
1774 				carp_carprcp(&carpr, sc, priveleged);
1775 				carpr.carpr_count = count;
1776 				error = copyout(&carpr, ifr->ifr_data +
1777 				    (i * sizeof(carpr)), sizeof(carpr));
1778 				if (error) {
1779 					CIF_UNLOCK(ifp->if_carp);
1780 					break;
1781 				}
1782 				i++;
1783 			}
1784 			CIF_UNLOCK(ifp->if_carp);
1785 		}
1786 		break;
1787 	    }
1788 	default:
1789 		error = EINVAL;
1790 	}
1791 
1792 out:
1793 	if (locked)
1794 		CARP_UNLOCK(sc);
1795 	if_rele(ifp);
1796 
1797 	return (error);
1798 }
1799 
1800 static int
1801 carp_get_vhid(struct ifaddr *ifa)
1802 {
1803 
1804 	if (ifa == NULL || ifa->ifa_carp == NULL)
1805 		return (0);
1806 
1807 	return (ifa->ifa_carp->sc_vhid);
1808 }
1809 
1810 int
1811 carp_attach(struct ifaddr *ifa, int vhid)
1812 {
1813 	struct ifnet *ifp = ifa->ifa_ifp;
1814 	struct carp_if *cif = ifp->if_carp;
1815 	struct carp_softc *sc;
1816 	int index, error;
1817 
1818 	if (ifp->if_carp == NULL)
1819 		return (ENOPROTOOPT);
1820 
1821 	switch (ifa->ifa_addr->sa_family) {
1822 #ifdef INET
1823 	case AF_INET:
1824 #endif
1825 #ifdef INET6
1826 	case AF_INET6:
1827 #endif
1828 		break;
1829 	default:
1830 		return (EPROTOTYPE);
1831 	}
1832 
1833 	CIF_LOCK(cif);
1834 	IFNET_FOREACH_CARP(ifp, sc)
1835 		if (sc->sc_vhid == vhid)
1836 			break;
1837 	if (sc == NULL) {
1838 		CIF_UNLOCK(cif);
1839 		return (ENOENT);
1840 	}
1841 
1842 	if (ifa->ifa_carp) {
1843 		if (ifa->ifa_carp->sc_vhid != vhid)
1844 			carp_detach_locked(ifa);
1845 		else {
1846 			CIF_UNLOCK(cif);
1847 			return (0);
1848 		}
1849 	}
1850 
1851 	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
1852 	if (error) {
1853 		CIF_FREE(cif);
1854 		return (error);
1855 	}
1856 
1857 	CARP_LOCK(sc);
1858 	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
1859 	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
1860 		if ((error = carp_grow_ifas(sc)) != 0) {
1861 			carp_multicast_cleanup(cif,
1862 			    ifa->ifa_addr->sa_family);
1863 			CARP_UNLOCK(sc);
1864 			CIF_FREE(cif);
1865 			return (error);
1866 		}
1867 
1868 	switch (ifa->ifa_addr->sa_family) {
1869 #ifdef INET
1870 	case AF_INET:
1871 		cif->cif_naddrs++;
1872 		sc->sc_naddrs++;
1873 		break;
1874 #endif
1875 #ifdef INET6
1876 	case AF_INET6:
1877 		cif->cif_naddrs6++;
1878 		sc->sc_naddrs6++;
1879 		break;
1880 #endif
1881 	}
1882 
1883 	ifa_ref(ifa);
1884 	sc->sc_ifas[index - 1] = ifa;
1885 	ifa->ifa_carp = sc;
1886 
1887 	carp_hmac_prepare(sc);
1888 	carp_sc_state(sc);
1889 
1890 	CARP_UNLOCK(sc);
1891 	CIF_UNLOCK(cif);
1892 
1893 	return (0);
1894 }
1895 
1896 void
1897 carp_detach(struct ifaddr *ifa)
1898 {
1899 	struct ifnet *ifp = ifa->ifa_ifp;
1900 	struct carp_if *cif = ifp->if_carp;
1901 
1902 	CIF_LOCK(cif);
1903 	carp_detach_locked(ifa);
1904 	CIF_FREE(cif);
1905 }
1906 
1907 static void
1908 carp_detach_locked(struct ifaddr *ifa)
1909 {
1910 	struct ifnet *ifp = ifa->ifa_ifp;
1911 	struct carp_if *cif = ifp->if_carp;
1912 	struct carp_softc *sc = ifa->ifa_carp;
1913 	int i, index;
1914 
1915 	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
1916 
1917 	CIF_LOCK_ASSERT(cif);
1918 	CARP_LOCK(sc);
1919 
1920 	/* Shift array. */
1921 	index = sc->sc_naddrs + sc->sc_naddrs6;
1922 	for (i = 0; i < index; i++)
1923 		if (sc->sc_ifas[i] == ifa)
1924 			break;
1925 	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
1926 	for (; i < index - 1; i++)
1927 		sc->sc_ifas[i] = sc->sc_ifas[i+1];
1928 	sc->sc_ifas[index - 1] = NULL;
1929 
1930 	switch (ifa->ifa_addr->sa_family) {
1931 #ifdef INET
1932 	case AF_INET:
1933 		cif->cif_naddrs--;
1934 		sc->sc_naddrs--;
1935 		break;
1936 #endif
1937 #ifdef INET6
1938 	case AF_INET6:
1939 		cif->cif_naddrs6--;
1940 		sc->sc_naddrs6--;
1941 		break;
1942 #endif
1943 	}
1944 
1945 	carp_ifa_delroute(ifa);
1946 	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
1947 
1948 	ifa->ifa_carp = NULL;
1949 	ifa_free(ifa);
1950 
1951 	carp_hmac_prepare(sc);
1952 	carp_sc_state(sc);
1953 
1954 	if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
1955 		CARP_UNLOCK(sc);
1956 		carp_destroy(sc);
1957 	} else
1958 		CARP_UNLOCK(sc);
1959 }
1960 
1961 static void
1962 carp_set_state(struct carp_softc *sc, int state)
1963 {
1964 
1965 	CARP_LOCK_ASSERT(sc);
1966 
1967 	if (sc->sc_state != state) {
1968 		const char *carp_states[] = { CARP_STATES };
1969 		char subsys[IFNAMSIZ+5];
1970 
1971 		sc->sc_state = state;
1972 
1973 		snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
1974 		    sc->sc_carpdev->if_xname);
1975 		devctl_notify("CARP", subsys, carp_states[state], NULL);
1976 	}
1977 }
1978 
1979 static void
1980 carp_linkstate(struct ifnet *ifp)
1981 {
1982 	struct carp_softc *sc;
1983 
1984 	CIF_LOCK(ifp->if_carp);
1985 	IFNET_FOREACH_CARP(ifp, sc) {
1986 		CARP_LOCK(sc);
1987 		carp_sc_state(sc);
1988 		CARP_UNLOCK(sc);
1989 	}
1990 	CIF_UNLOCK(ifp->if_carp);
1991 }
1992 
1993 static void
1994 carp_sc_state(struct carp_softc *sc)
1995 {
1996 
1997 	CARP_LOCK_ASSERT(sc);
1998 
1999 	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
2000 	    !(sc->sc_carpdev->if_flags & IFF_UP)) {
2001 		callout_stop(&sc->sc_ad_tmo);
2002 #ifdef INET
2003 		callout_stop(&sc->sc_md_tmo);
2004 #endif
2005 #ifdef INET6
2006 		callout_stop(&sc->sc_md6_tmo);
2007 #endif
2008 		carp_set_state(sc, INIT);
2009 		carp_setrun(sc, 0);
2010 		if (!sc->sc_suppress)
2011 			carp_demote_adj(V_carp_ifdown_adj, "interface down");
2012 		sc->sc_suppress = 1;
2013 	} else {
2014 		carp_set_state(sc, INIT);
2015 		carp_setrun(sc, 0);
2016 		if (sc->sc_suppress)
2017 			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
2018 		sc->sc_suppress = 0;
2019 	}
2020 }
2021 
2022 static void
2023 carp_demote_adj(int adj, char *reason)
2024 {
2025 	atomic_add_int(&V_carp_demotion, adj);
2026 	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
2027 	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
2028 }
2029 
2030 static int
2031 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
2032 {
2033 	int new, error;
2034 
2035 	new = V_carp_demotion;
2036 	error = sysctl_handle_int(oidp, &new, 0, req);
2037 	if (error || !req->newptr)
2038 		return (error);
2039 
2040 	carp_demote_adj(new, "sysctl");
2041 
2042 	return (0);
2043 }
2044 
2045 #ifdef INET
2046 extern  struct domain inetdomain;
2047 static struct protosw in_carp_protosw = {
2048 	.pr_type =		SOCK_RAW,
2049 	.pr_domain =		&inetdomain,
2050 	.pr_protocol =		IPPROTO_CARP,
2051 	.pr_flags =		PR_ATOMIC|PR_ADDR,
2052 	.pr_input =		carp_input,
2053 	.pr_output =		(pr_output_t *)rip_output,
2054 	.pr_ctloutput =		rip_ctloutput,
2055 	.pr_usrreqs =		&rip_usrreqs
2056 };
2057 #endif
2058 
2059 #ifdef INET6
2060 extern	struct domain inet6domain;
2061 static struct ip6protosw in6_carp_protosw = {
2062 	.pr_type =		SOCK_RAW,
2063 	.pr_domain =		&inet6domain,
2064 	.pr_protocol =		IPPROTO_CARP,
2065 	.pr_flags =		PR_ATOMIC|PR_ADDR,
2066 	.pr_input =		carp6_input,
2067 	.pr_output =		rip6_output,
2068 	.pr_ctloutput =		rip6_ctloutput,
2069 	.pr_usrreqs =		&rip6_usrreqs
2070 };
2071 #endif
2072 
2073 static void
2074 carp_mod_cleanup(void)
2075 {
2076 
2077 #ifdef INET
2078 	if (proto_reg[CARP_INET] == 0) {
2079 		(void)ipproto_unregister(IPPROTO_CARP);
2080 		pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
2081 		proto_reg[CARP_INET] = -1;
2082 	}
2083 	carp_iamatch_p = NULL;
2084 #endif
2085 #ifdef INET6
2086 	if (proto_reg[CARP_INET6] == 0) {
2087 		(void)ip6proto_unregister(IPPROTO_CARP);
2088 		pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
2089 		proto_reg[CARP_INET6] = -1;
2090 	}
2091 	carp_iamatch6_p = NULL;
2092 	carp_macmatch6_p = NULL;
2093 #endif
2094 	carp_ioctl_p = NULL;
2095 	carp_attach_p = NULL;
2096 	carp_detach_p = NULL;
2097 	carp_get_vhid_p = NULL;
2098 	carp_linkstate_p = NULL;
2099 	carp_forus_p = NULL;
2100 	carp_output_p = NULL;
2101 	carp_demote_adj_p = NULL;
2102 	carp_master_p = NULL;
2103 	mtx_unlock(&carp_mtx);
2104 	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
2105 	mtx_destroy(&carp_mtx);
2106 }
2107 
2108 static int
2109 carp_mod_load(void)
2110 {
2111 	int err;
2112 
2113 	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
2114 	LIST_INIT(&carp_list);
2115 	carp_get_vhid_p = carp_get_vhid;
2116 	carp_forus_p = carp_forus;
2117 	carp_output_p = carp_output;
2118 	carp_linkstate_p = carp_linkstate;
2119 	carp_ioctl_p = carp_ioctl;
2120 	carp_attach_p = carp_attach;
2121 	carp_detach_p = carp_detach;
2122 	carp_demote_adj_p = carp_demote_adj;
2123 	carp_master_p = carp_master;
2124 #ifdef INET6
2125 	carp_iamatch6_p = carp_iamatch6;
2126 	carp_macmatch6_p = carp_macmatch6;
2127 	proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
2128 	    (struct protosw *)&in6_carp_protosw);
2129 	if (proto_reg[CARP_INET6]) {
2130 		printf("carp: error %d attaching to PF_INET6\n",
2131 		    proto_reg[CARP_INET6]);
2132 		carp_mod_cleanup();
2133 		return (proto_reg[CARP_INET6]);
2134 	}
2135 	err = ip6proto_register(IPPROTO_CARP);
2136 	if (err) {
2137 		printf("carp: error %d registering with INET6\n", err);
2138 		carp_mod_cleanup();
2139 		return (err);
2140 	}
2141 #endif
2142 #ifdef INET
2143 	carp_iamatch_p = carp_iamatch;
2144 	proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
2145 	if (proto_reg[CARP_INET]) {
2146 		printf("carp: error %d attaching to PF_INET\n",
2147 		    proto_reg[CARP_INET]);
2148 		carp_mod_cleanup();
2149 		return (proto_reg[CARP_INET]);
2150 	}
2151 	err = ipproto_register(IPPROTO_CARP);
2152 	if (err) {
2153 		printf("carp: error %d registering with INET\n", err);
2154 		carp_mod_cleanup();
2155 		return (err);
2156 	}
2157 #endif
2158 	return (0);
2159 }
2160 
2161 static int
2162 carp_modevent(module_t mod, int type, void *data)
2163 {
2164 	switch (type) {
2165 	case MOD_LOAD:
2166 		return carp_mod_load();
2167 		/* NOTREACHED */
2168 	case MOD_UNLOAD:
2169 		mtx_lock(&carp_mtx);
2170 		if (LIST_EMPTY(&carp_list))
2171 			carp_mod_cleanup();
2172 		else {
2173 			mtx_unlock(&carp_mtx);
2174 			return (EBUSY);
2175 		}
2176 		break;
2177 
2178 	default:
2179 		return (EINVAL);
2180 	}
2181 
2182 	return (0);
2183 }
2184 
2185 static moduledata_t carp_mod = {
2186 	"carp",
2187 	carp_modevent,
2188 	0
2189 };
2190 
2191 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
2192