1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002 Michael Shalayeff. 5 * Copyright (c) 2003 Ryan McBride. 6 * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org> 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, 22 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 28 * THE POSSIBILITY OF SUCH DAMAGE. 
29 */ 30 31 #include "opt_bpf.h" 32 #include "opt_inet.h" 33 #include "opt_inet6.h" 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/devctl.h> 38 #include <sys/jail.h> 39 #include <sys/kassert.h> 40 #include <sys/kernel.h> 41 #include <sys/limits.h> 42 #include <sys/malloc.h> 43 #include <sys/mbuf.h> 44 #include <sys/module.h> 45 #include <sys/priv.h> 46 #include <sys/proc.h> 47 #include <sys/socket.h> 48 #include <sys/sockio.h> 49 #include <sys/sysctl.h> 50 #include <sys/syslog.h> 51 #include <sys/taskqueue.h> 52 #include <sys/counter.h> 53 54 #include <net/ethernet.h> 55 #include <net/if.h> 56 #include <net/if_var.h> 57 #include <net/if_dl.h> 58 #include <net/if_llatbl.h> 59 #include <net/if_private.h> 60 #include <net/if_types.h> 61 #include <net/route.h> 62 #include <net/vnet.h> 63 64 #if defined(INET) || defined(INET6) 65 #include <netinet/in.h> 66 #include <netinet/in_var.h> 67 #include <netinet/ip_carp.h> 68 #include <netinet/ip_carp_nl.h> 69 #include <netinet/ip.h> 70 #include <machine/in_cksum.h> 71 #endif 72 #ifdef INET 73 #include <netinet/ip_var.h> 74 #include <netinet/if_ether.h> 75 #endif 76 77 #ifdef INET6 78 #include <netinet/icmp6.h> 79 #include <netinet/ip6.h> 80 #include <netinet6/in6_var.h> 81 #include <netinet6/ip6_var.h> 82 #include <netinet6/scope6_var.h> 83 #include <netinet6/nd6.h> 84 #endif 85 86 #include <netlink/netlink.h> 87 #include <netlink/netlink_ctl.h> 88 #include <netlink/netlink_generic.h> 89 #include <netlink/netlink_message_parser.h> 90 91 #include <crypto/sha1.h> 92 93 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); 94 95 struct carp_softc { 96 struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ 97 struct ifaddr **sc_ifas; /* Our ifaddrs. */ 98 carp_version_t sc_version; /* carp or VRRPv3 */ 99 uint8_t sc_addr[ETHER_ADDR_LEN]; /* Our link level address. */ 100 struct callout sc_ad_tmo; /* Advertising timeout. */ 101 #ifdef INET 102 struct callout sc_md_tmo; /* Master down timeout. 
*/ 103 #endif 104 #ifdef INET6 105 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ 106 #endif 107 struct mtx sc_mtx; 108 109 int sc_vhid; 110 union { 111 struct { /* sc_version == CARP_VERSION_CARP */ 112 int sc_advskew; 113 int sc_advbase; 114 struct in_addr sc_carpaddr; 115 struct in6_addr sc_carpaddr6; 116 uint64_t sc_counter; 117 bool sc_init_counter; 118 #define CARP_HMAC_PAD 64 119 unsigned char sc_key[CARP_KEY_LEN]; 120 unsigned char sc_pad[CARP_HMAC_PAD]; 121 SHA1_CTX sc_sha1; 122 }; 123 struct { /* sc_version == CARP_VERSION_VRRPv3 */ 124 uint8_t sc_vrrp_prio; 125 uint16_t sc_vrrp_adv_inter; 126 uint16_t sc_vrrp_master_inter; 127 }; 128 }; 129 int sc_naddrs; 130 int sc_naddrs6; 131 int sc_ifasiz; 132 enum { INIT = 0, BACKUP, MASTER } sc_state; 133 int sc_suppress; 134 int sc_sendad_errors; 135 #define CARP_SENDAD_MAX_ERRORS 3 136 int sc_sendad_success; 137 #define CARP_SENDAD_MIN_SUCCESS 3 138 139 TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ 140 LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ 141 }; 142 143 struct carp_if { 144 #ifdef INET 145 int cif_naddrs; 146 #endif 147 #ifdef INET6 148 int cif_naddrs6; 149 #endif 150 TAILQ_HEAD(, carp_softc) cif_vrs; 151 #ifdef INET 152 struct ip_moptions cif_imo; 153 #endif 154 #ifdef INET6 155 struct ip6_moptions cif_im6o; 156 #endif 157 struct ifnet *cif_ifp; 158 struct mtx cif_mtx; 159 uint32_t cif_flags; 160 #define CIF_PROMISC 0x00000001 161 }; 162 163 /* 164 * Brief design of carp(4). 165 * 166 * Any carp-capable ifnet may have a list of carp softcs hanging off 167 * its ifp->if_carp pointer. Each softc represents one unique virtual 168 * host id, or vhid. The softc has a back pointer to the ifnet. All 169 * softcs are joined in a global list, which has quite limited use. 170 * 171 * Any interface address that takes part in CARP negotiation has a 172 * pointer to the softc of its vhid, ifa->ifa_carp. That could be either 173 * AF_INET or AF_INET6 address. 
174 * 175 * Although, one can get the softc's backpointer to ifnet and traverse 176 * through its ifp->if_addrhead queue to find all interface addresses 177 * involved in CARP, we keep a growable array of ifaddr pointers. This 178 * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that 179 * do calls into the network stack, thus avoiding LORs. 180 * 181 * Locking: 182 * 183 * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), 184 * callout-driven events and ioctl()s. 185 * 186 * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. 187 * To traverse the global list we use the mutex carp_mtx. 188 * 189 * Known issues with locking: 190 * 191 * - On module unload we may race (?) with packet processing thread 192 * dereferencing our function pointers. 193 */ 194 195 /* Accept incoming CARP packets. */ 196 VNET_DEFINE_STATIC(int, carp_allow) = 1; 197 #define V_carp_allow VNET(carp_allow) 198 199 /* Set DSCP in outgoing CARP packets. */ 200 VNET_DEFINE_STATIC(int, carp_dscp) = 56; 201 #define V_carp_dscp VNET(carp_dscp) 202 203 /* Preempt slower nodes. */ 204 VNET_DEFINE_STATIC(int, carp_preempt) = 0; 205 #define V_carp_preempt VNET(carp_preempt) 206 207 /* Log level. */ 208 VNET_DEFINE_STATIC(int, carp_log) = 1; 209 #define V_carp_log VNET(carp_log) 210 211 /* Global advskew demotion. */ 212 VNET_DEFINE_STATIC(int, carp_demotion) = 0; 213 #define V_carp_demotion VNET(carp_demotion) 214 215 /* Send error demotion factor. */ 216 VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; 217 #define V_carp_senderr_adj VNET(carp_senderr_adj) 218 219 /* Iface down demotion factor. 
*/ 220 VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; 221 #define V_carp_ifdown_adj VNET(carp_ifdown_adj) 222 223 static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); 224 static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); 225 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); 226 227 SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 228 "CARP"); 229 SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, 230 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 231 &VNET_NAME(carp_allow), 0, carp_allow_sysctl, "I", 232 "Accept incoming CARP packets"); 233 SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, 234 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 235 0, 0, carp_dscp_sysctl, "I", 236 "DSCP value for carp packets"); 237 SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, 238 &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); 239 SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, 240 &VNET_NAME(carp_log), 0, "CARP log level"); 241 SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, 242 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 243 0, 0, carp_demote_adj_sysctl, "I", 244 "Adjust demotion factor (skew of advskew)"); 245 SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, 246 CTLFLAG_VNET | CTLFLAG_RW, 247 &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); 248 SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, 249 CTLFLAG_VNET | CTLFLAG_RW, 250 &VNET_NAME(carp_ifdown_adj), 0, 251 "Interface down demotion factor adjustment"); 252 253 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); 254 VNET_PCPUSTAT_SYSINIT(carpstats); 255 VNET_PCPUSTAT_SYSUNINIT(carpstats); 256 257 #define CARPSTATS_ADD(name, val) \ 258 counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ 259 sizeof(uint64_t)], (val)) 260 #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) 261 262 
SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, 263 carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); 264 265 #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ 266 NULL, MTX_DEF) 267 #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) 268 #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) 269 #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) 270 #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) 271 #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ 272 NULL, MTX_DEF) 273 #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) 274 #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) 275 #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) 276 #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) 277 #define CIF_FREE(cif) do { \ 278 CIF_LOCK(cif); \ 279 if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ 280 carp_free_if(cif); \ 281 else \ 282 CIF_UNLOCK(cif); \ 283 } while (0) 284 285 #define CARP_LOG(...) do { \ 286 if (V_carp_log > 0) \ 287 log(LOG_INFO, "carp: " __VA_ARGS__); \ 288 } while (0) 289 290 #define CARP_DEBUG(...) do { \ 291 if (V_carp_log > 1) \ 292 log(LOG_DEBUG, __VA_ARGS__); \ 293 } while (0) 294 295 #define IFNET_FOREACH_IFA(ifp, ifa) \ 296 CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ 297 if ((ifa)->ifa_carp != NULL) 298 299 #define CARP_FOREACH_IFA(sc, ifa) \ 300 CARP_LOCK_ASSERT(sc); \ 301 for (int _i = 0; \ 302 _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ 303 ((ifa) = sc->sc_ifas[_i]) != NULL; \ 304 ++_i) 305 306 #define IFNET_FOREACH_CARP(ifp, sc) \ 307 KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) || \ 308 sx_xlocked(&carp_sx), ("cif_vrs not locked")); \ 309 TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) 310 311 #define DEMOTE_ADVSKEW(sc) \ 312 (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ 313 CARP_MAXSKEW : \ 314 (((sc)->sc_advskew + V_carp_demotion < 0) ? 
\ 315 0 : ((sc)->sc_advskew + V_carp_demotion))) 316 317 /* 318 * VRRPv3 priority is the inverse of CARP advskew: higher is better. 319 * Subtract the global demotion counter and clamp to [0, 254]. 320 * Priority 255 (IP address owner) is never demoted. 321 */ 322 #define DEMOTE_VRRP_PRIO(sc) \ 323 ((sc)->sc_vrrp_prio == 255 ? 255 : \ 324 (((int)(sc)->sc_vrrp_prio - V_carp_demotion < 0) ? 0 : \ 325 (((int)(sc)->sc_vrrp_prio - V_carp_demotion > 254) ? 254 : \ 326 (int)(sc)->sc_vrrp_prio - V_carp_demotion))) 327 328 static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t, int); 329 static void vrrp_input_c(struct mbuf *, int, sa_family_t, int, int, uint16_t); 330 static struct carp_softc 331 *carp_alloc(struct ifnet *, carp_version_t, int); 332 static void carp_destroy(struct carp_softc *); 333 static struct carp_if 334 *carp_alloc_if(struct ifnet *); 335 static void carp_free_if(struct carp_if *); 336 static void carp_set_state(struct carp_softc *, int, const char* reason); 337 static void carp_sc_state(struct carp_softc *); 338 static void carp_setrun(struct carp_softc *, sa_family_t); 339 static void carp_master_down(void *); 340 static void carp_master_down_locked(struct carp_softc *, 341 const char* reason); 342 static void carp_send_ad_locked(struct carp_softc *); 343 static void vrrp_send_ad_locked(struct carp_softc *); 344 static void carp_addroute(struct carp_softc *); 345 static void carp_ifa_addroute(struct ifaddr *); 346 static void carp_delroute(struct carp_softc *); 347 static void carp_ifa_delroute(struct ifaddr *); 348 static void carp_send_ad_all(void *, int); 349 static void carp_demote_adj(int, char *); 350 351 static LIST_HEAD(, carp_softc) carp_list = LIST_HEAD_INITIALIZER(carp_list); 352 static struct mtx carp_mtx; 353 static struct sx carp_sx; 354 static struct task carp_sendall_task = 355 TASK_INITIALIZER(0, carp_send_ad_all, NULL); 356 357 static int 358 carp_is_supported_if(if_t ifp) 359 { 360 if (ifp == NULL) 361 return 
(ENXIO); 362 363 switch (ifp->if_type) { 364 case IFT_ETHER: 365 case IFT_L2VLAN: 366 case IFT_BRIDGE: 367 break; 368 default: 369 return (EOPNOTSUPP); 370 } 371 372 return (0); 373 } 374 375 static void 376 carp_hmac_prepare(struct carp_softc *sc) 377 { 378 uint8_t version = CARP_VERSION_CARP, type = CARP_ADVERTISEMENT; 379 uint8_t vhid = sc->sc_vhid & 0xff; 380 struct ifaddr *ifa; 381 int i, found; 382 #ifdef INET 383 struct in_addr last, cur, in; 384 #endif 385 #ifdef INET6 386 struct in6_addr last6, cur6, in6; 387 #endif 388 389 CARP_LOCK_ASSERT(sc); 390 MPASS(sc->sc_version == CARP_VERSION_CARP); 391 392 /* Compute ipad from key. */ 393 bzero(sc->sc_pad, sizeof(sc->sc_pad)); 394 bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); 395 for (i = 0; i < sizeof(sc->sc_pad); i++) 396 sc->sc_pad[i] ^= 0x36; 397 398 /* Precompute first part of inner hash. */ 399 SHA1Init(&sc->sc_sha1); 400 SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); 401 SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); 402 SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); 403 SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); 404 #ifdef INET 405 cur.s_addr = 0; 406 do { 407 found = 0; 408 last = cur; 409 cur.s_addr = 0xffffffff; 410 CARP_FOREACH_IFA(sc, ifa) { 411 in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; 412 if (ifa->ifa_addr->sa_family == AF_INET && 413 ntohl(in.s_addr) > ntohl(last.s_addr) && 414 ntohl(in.s_addr) < ntohl(cur.s_addr)) { 415 cur.s_addr = in.s_addr; 416 found++; 417 } 418 } 419 if (found) 420 SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); 421 } while (found); 422 #endif /* INET */ 423 #ifdef INET6 424 memset(&cur6, 0, sizeof(cur6)); 425 do { 426 found = 0; 427 last6 = cur6; 428 memset(&cur6, 0xff, sizeof(cur6)); 429 CARP_FOREACH_IFA(sc, ifa) { 430 in6 = ifatoia6(ifa)->ia_addr.sin6_addr; 431 if (IN6_IS_SCOPE_EMBED(&in6)) 432 in6.s6_addr16[1] = 0; 433 if (ifa->ifa_addr->sa_family == AF_INET6 && 434 memcmp(&in6, &last6, sizeof(in6)) > 0 && 
435 memcmp(&in6, &cur6, sizeof(in6)) < 0) { 436 cur6 = in6; 437 found++; 438 } 439 } 440 if (found) 441 SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); 442 } while (found); 443 #endif /* INET6 */ 444 445 /* convert ipad to opad */ 446 for (i = 0; i < sizeof(sc->sc_pad); i++) 447 sc->sc_pad[i] ^= 0x36 ^ 0x5c; 448 } 449 450 static void 451 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], 452 unsigned char md[20]) 453 { 454 SHA1_CTX sha1ctx; 455 456 CARP_LOCK_ASSERT(sc); 457 458 /* fetch first half of inner hash */ 459 bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); 460 461 SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); 462 SHA1Final(md, &sha1ctx); 463 464 /* outer hash */ 465 SHA1Init(&sha1ctx); 466 SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); 467 SHA1Update(&sha1ctx, md, 20); 468 SHA1Final(md, &sha1ctx); 469 } 470 471 static int 472 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], 473 unsigned char md[20]) 474 { 475 unsigned char md2[20]; 476 477 CARP_LOCK_ASSERT(sc); 478 479 carp_hmac_generate(sc, counter, md2); 480 481 return (bcmp(md, md2, sizeof(md2))); 482 } 483 484 static int 485 vrrp_checksum_verify(struct mbuf *m, int off, int len, uint16_t phdrcksum) 486 { 487 uint16_t cksum; 488 489 /* 490 * Note that VRRPv3 checksums are different from CARP checksums. 491 * Carp just calculates the checksum over the packet. 492 * VRRPv3 includes the pseudo-header checksum as well. 493 */ 494 cksum = in_cksum_skip(m, off + len, off); 495 cksum -= phdrcksum; 496 497 return (cksum); 498 } 499 500 /* 501 * process input packet. 502 * we have rearranged checks order compared to the rfc, 503 * but it seems more efficient this way or not possible otherwise. 
504 */ 505 #ifdef INET 506 static int 507 carp_input(struct mbuf **mp, int *offp, int proto) 508 { 509 struct mbuf *m = *mp; 510 struct ip *ip; 511 struct vrrpv3_header *vh; 512 int iplen; 513 int minlen; 514 int totlen; 515 516 iplen = *offp; 517 *mp = NULL; 518 519 CARPSTATS_INC(carps_ipackets); 520 521 if (!V_carp_allow) { 522 m_freem(m); 523 return (IPPROTO_DONE); 524 } 525 526 /* Ensure we have enough header to figure out the version. */ 527 if (m->m_pkthdr.len < iplen + sizeof(*vh)) { 528 CARPSTATS_INC(carps_badlen); 529 CARP_DEBUG("%s: received len %zd < sizeof(struct vrrpv3_header) " 530 "on %s\n", __func__, m->m_len - sizeof(struct ip), 531 if_name(m->m_pkthdr.rcvif)); 532 m_freem(m); 533 return (IPPROTO_DONE); 534 } 535 536 if (m->m_len < iplen + sizeof(*vh)) { 537 if ((m = m_pullup(m, iplen + sizeof(*vh))) == NULL) { 538 CARPSTATS_INC(carps_hdrops); 539 CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); 540 return (IPPROTO_DONE); 541 } 542 } 543 ip = mtod(m, struct ip *); 544 totlen = ntohs(ip->ip_len); 545 vh = (struct vrrpv3_header *)((char *)ip + iplen); 546 547 switch (vh->vrrp_version) { 548 case CARP_VERSION_CARP: 549 minlen = sizeof(struct carp_header); 550 break; 551 case CARP_VERSION_VRRPv3: 552 minlen = sizeof(struct vrrpv3_header); 553 break; 554 default: 555 CARPSTATS_INC(carps_badver); 556 CARP_DEBUG("%s: unsupported version %d on %s\n", __func__, 557 vh->vrrp_version, if_name(m->m_pkthdr.rcvif)); 558 m_freem(m); 559 return (IPPROTO_DONE); 560 } 561 562 /* And now check the length again but with the real minimal length. 
*/ 563 if (m->m_pkthdr.len < iplen + minlen) { 564 CARPSTATS_INC(carps_badlen); 565 CARP_DEBUG("%s: received len %zd < %d " 566 "on %s\n", __func__, m->m_len - sizeof(struct ip), 567 iplen + minlen, 568 if_name(m->m_pkthdr.rcvif)); 569 m_freem(m); 570 return (IPPROTO_DONE); 571 } 572 573 if (m->m_len < iplen + minlen) { 574 if ((m = m_pullup(m, iplen + minlen)) == NULL) { 575 CARPSTATS_INC(carps_hdrops); 576 CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); 577 return (IPPROTO_DONE); 578 } 579 ip = mtod(m, struct ip *); 580 vh = (struct vrrpv3_header *)((char *)ip + iplen); 581 } 582 583 switch (vh->vrrp_version) { 584 case CARP_VERSION_CARP: { 585 struct carp_header *ch; 586 587 /* verify the CARP checksum */ 588 if (in_cksum_skip(m, totlen, iplen)) { 589 CARPSTATS_INC(carps_badsum); 590 CARP_DEBUG("%s: checksum failed on %s\n", __func__, 591 if_name(m->m_pkthdr.rcvif)); 592 m_freem(m); 593 break; 594 } 595 ch = (struct carp_header *)((char *)ip + iplen); 596 carp_input_c(m, ch, AF_INET, ip->ip_ttl); 597 break; 598 } 599 case CARP_VERSION_VRRPv3: { 600 uint16_t phdrcksum; 601 602 phdrcksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 603 htonl((u_short)(totlen - iplen) + ip->ip_p)); 604 vrrp_input_c(m, iplen, AF_INET, ip->ip_ttl, totlen - iplen, 605 phdrcksum); 606 break; 607 } 608 default: 609 KASSERT(false, ("Unsupported version %d", vh->vrrp_version)); 610 } 611 612 return (IPPROTO_DONE); 613 } 614 #endif 615 616 #ifdef INET6 617 static int 618 carp6_input(struct mbuf **mp, int *offp, int proto) 619 { 620 struct mbuf *m = *mp; 621 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); 622 struct vrrpv3_header *vh; 623 u_int len, minlen; 624 625 CARPSTATS_INC(carps_ipackets6); 626 627 if (!V_carp_allow) { 628 m_freem(m); 629 return (IPPROTO_DONE); 630 } 631 632 /* check if received on a valid carp interface */ 633 if (m->m_pkthdr.rcvif->if_carp == NULL) { 634 CARPSTATS_INC(carps_badif); 635 CARP_DEBUG("%s: packet received on non-carp interface: %s\n", 
636 __func__, if_name(m->m_pkthdr.rcvif)); 637 m_freem(m); 638 return (IPPROTO_DONE); 639 } 640 641 if (m->m_len < *offp + sizeof(*vh)) { 642 len = m->m_len; 643 m = m_pullup(m, *offp + sizeof(*vh)); 644 if (m == NULL) { 645 CARPSTATS_INC(carps_badlen); 646 CARP_DEBUG("%s: packet size %u too small\n", __func__, len); 647 return (IPPROTO_DONE); 648 } 649 ip6 = mtod(m, struct ip6_hdr *); 650 } 651 vh = (struct vrrpv3_header *)(mtod(m, char *) + *offp); 652 653 switch (vh->vrrp_version) { 654 case CARP_VERSION_CARP: 655 minlen = sizeof(struct carp_header); 656 break; 657 case CARP_VERSION_VRRPv3: 658 minlen = sizeof(struct vrrpv3_header); 659 break; 660 default: 661 CARPSTATS_INC(carps_badver); 662 CARP_DEBUG("%s: unsupported version %d on %s\n", __func__, 663 vh->vrrp_version, if_name(m->m_pkthdr.rcvif)); 664 m_freem(m); 665 return (IPPROTO_DONE); 666 } 667 668 /* And now check the length again but with the real minimal length. */ 669 if (m->m_pkthdr.len < sizeof(*ip6) + minlen) { 670 CARPSTATS_INC(carps_badlen); 671 CARP_DEBUG("%s: received len %zd < %zd " 672 "on %s\n", __func__, m->m_len - sizeof(struct ip), 673 sizeof(*ip6) + minlen, 674 if_name(m->m_pkthdr.rcvif)); 675 m_freem(m); 676 return (IPPROTO_DONE); 677 } 678 679 if (m->m_len < sizeof(*ip6) + minlen) { 680 if ((m = m_pullup(m, sizeof(*ip6) + minlen)) == NULL) { 681 CARPSTATS_INC(carps_hdrops); 682 CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__); 683 return (IPPROTO_DONE); 684 } 685 ip6 = mtod(m, struct ip6_hdr *); 686 vh = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6)); 687 } 688 689 switch (vh->vrrp_version) { 690 case CARP_VERSION_CARP: { 691 struct carp_header *ch; 692 693 /* verify the CARP checksum */ 694 if (in_cksum_skip(m, *offp + sizeof(struct carp_header), 695 *offp)) { 696 CARPSTATS_INC(carps_badsum); 697 CARP_DEBUG("%s: checksum failed, on %s\n", __func__, 698 if_name(m->m_pkthdr.rcvif)); 699 m_freem(m); 700 break; 701 } 702 ch = (struct carp_header *)((char *)ip6 + sizeof(*ip6)); 
703 carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim); 704 break; 705 } 706 case CARP_VERSION_VRRPv3: { 707 uint16_t phdrcksum; 708 709 phdrcksum = in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen), 710 ip6->ip6_nxt, 0); 711 vrrp_input_c(m, sizeof(*ip6), AF_INET6, ip6->ip6_hlim, 712 ntohs(ip6->ip6_plen), phdrcksum); 713 break; 714 } 715 default: 716 KASSERT(false, ("Unsupported version %d", vh->vrrp_version)); 717 } 718 return (IPPROTO_DONE); 719 } 720 #endif /* INET6 */ 721 722 /* 723 * This routine should not be necessary at all, but some switches 724 * (VMWare ESX vswitches) can echo our own packets back at us, 725 * and we must ignore them or they will cause us to drop out of 726 * MASTER mode. 727 * 728 * We cannot catch all cases of network loops. Instead, what we 729 * do here is catch any packet that arrives with a carp header 730 * with a VHID of 0, that comes from an address that is our own. 731 * These packets are by definition "from us" (even if they are from 732 * a misconfigured host that is pretending to be us). 733 * 734 * The VHID test is outside this mini-function. 
735 */ 736 static int 737 carp_source_is_self(const struct mbuf *m, struct ifaddr *ifa, sa_family_t af) 738 { 739 #ifdef INET 740 struct ip *ip4; 741 struct in_addr in4; 742 #endif 743 #ifdef INET6 744 struct ip6_hdr *ip6; 745 struct in6_addr in6; 746 #endif 747 748 switch (af) { 749 #ifdef INET 750 case AF_INET: 751 ip4 = mtod(m, struct ip *); 752 in4 = ifatoia(ifa)->ia_addr.sin_addr; 753 return (in4.s_addr == ip4->ip_src.s_addr); 754 #endif 755 #ifdef INET6 756 case AF_INET6: 757 ip6 = mtod(m, struct ip6_hdr *); 758 in6 = ifatoia6(ifa)->ia_addr.sin6_addr; 759 return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); 760 #endif 761 default: 762 break; 763 } 764 return (0); 765 } 766 767 static struct ifaddr * 768 carp_find_ifa(const struct mbuf *m, sa_family_t af, uint8_t vhid) 769 { 770 struct ifnet *ifp = m->m_pkthdr.rcvif; 771 struct ifaddr *ifa, *match; 772 int error; 773 774 NET_EPOCH_ASSERT(); 775 776 /* 777 * Verify that the VHID is valid on the receiving interface. 778 * 779 * There should be just one match. If there are none 780 * the VHID is not valid and we drop the packet. If 781 * there are multiple VHID matches, take just the first 782 * one, for compatibility with previous code. While we're 783 * scanning, check for obvious loops in the network topology 784 * (these should never happen, and as noted above, we may 785 * miss real loops; this is just a double-check). 786 */ 787 error = 0; 788 match = NULL; 789 IFNET_FOREACH_IFA(ifp, ifa) { 790 if (match == NULL && ifa->ifa_carp != NULL && 791 ifa->ifa_addr->sa_family == af && 792 ifa->ifa_carp->sc_vhid == vhid) 793 match = ifa; 794 if (vhid == 0 && carp_source_is_self(m, ifa, af)) 795 error = ELOOP; 796 } 797 ifa = error ? NULL : match; 798 if (ifa != NULL) 799 ifa_ref(ifa); 800 801 if (ifa == NULL) { 802 if (error == ELOOP) { 803 CARP_DEBUG("dropping looped packet on interface %s\n", 804 if_name(ifp)); 805 CARPSTATS_INC(carps_badif); /* ??? 
*/ 806 } else { 807 CARPSTATS_INC(carps_badvhid); 808 } 809 } 810 811 return (ifa); 812 } 813 814 static void 815 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) 816 { 817 struct ifnet *ifp = m->m_pkthdr.rcvif; 818 struct ifaddr *ifa; 819 struct carp_softc *sc; 820 uint64_t tmp_counter; 821 struct timeval sc_tv, ch_tv; 822 bool multicast = false; 823 824 NET_EPOCH_ASSERT(); 825 MPASS(ch->carp_version == CARP_VERSION_CARP); 826 827 ifa = carp_find_ifa(m, af, ch->carp_vhid); 828 if (ifa == NULL) { 829 m_freem(m); 830 return; 831 } 832 833 sc = ifa->ifa_carp; 834 CARP_LOCK(sc); 835 836 /* verify the CARP version. */ 837 if (sc->sc_version != CARP_VERSION_CARP) { 838 CARP_UNLOCK(sc); 839 840 CARPSTATS_INC(carps_badver); 841 CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), 842 ch->carp_version); 843 ifa_free(ifa); 844 m_freem(m); 845 return; 846 } 847 848 if (ifa->ifa_addr->sa_family == AF_INET) { 849 multicast = IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)); 850 } else { 851 multicast = IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6); 852 } 853 ifa_free(ifa); 854 855 /* verify that the IP TTL is 255, but only if we're not in unicast mode. 
*/ 856 if (multicast && ttl != CARP_DFLTTL) { 857 CARPSTATS_INC(carps_badttl); 858 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, 859 ttl, if_name(m->m_pkthdr.rcvif)); 860 goto out; 861 } 862 863 if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { 864 CARPSTATS_INC(carps_badauth); 865 CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, 866 sc->sc_vhid, if_name(ifp)); 867 goto out; 868 } 869 870 tmp_counter = ntohl(ch->carp_counter[0]); 871 tmp_counter = tmp_counter<<32; 872 tmp_counter += ntohl(ch->carp_counter[1]); 873 874 /* XXX Replay protection goes here */ 875 876 sc->sc_init_counter = false; 877 sc->sc_counter = tmp_counter; 878 879 sc_tv.tv_sec = sc->sc_advbase; 880 sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; 881 ch_tv.tv_sec = ch->carp_advbase; 882 ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; 883 884 switch (sc->sc_state) { 885 case INIT: 886 break; 887 case MASTER: 888 /* 889 * If we receive an advertisement from a master who's going to 890 * be more frequent than us, go into BACKUP state. 891 */ 892 if (timevalcmp(&sc_tv, &ch_tv, >) || 893 timevalcmp(&sc_tv, &ch_tv, ==)) { 894 callout_stop(&sc->sc_ad_tmo); 895 carp_set_state(sc, BACKUP, 896 "more frequent advertisement received"); 897 carp_setrun(sc, 0); 898 carp_delroute(sc); 899 } 900 break; 901 case BACKUP: 902 /* 903 * If we're pre-empting masters who advertise slower than us, 904 * and this one claims to be slower, treat him as down. 905 */ 906 if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { 907 carp_master_down_locked(sc, 908 "preempting a slower master"); 909 break; 910 } 911 912 /* 913 * If the master is going to advertise at such a low frequency 914 * that he's guaranteed to time out, we'd might as well just 915 * treat him as timed out now. 
916 */ 917 sc_tv.tv_sec = sc->sc_advbase * 3; 918 if (timevalcmp(&sc_tv, &ch_tv, <)) { 919 carp_master_down_locked(sc, "master will time out"); 920 break; 921 } 922 923 /* 924 * Otherwise, we reset the counter and wait for the next 925 * advertisement. 926 */ 927 carp_setrun(sc, af); 928 break; 929 } 930 931 out: 932 CARP_UNLOCK(sc); 933 m_freem(m); 934 } 935 936 static void 937 vrrp_input_c(struct mbuf *m, int off, sa_family_t af, int ttl, 938 int len, uint16_t phdrcksum) 939 { 940 struct vrrpv3_header *vh = mtodo(m, off); 941 struct ifnet *ifp = m->m_pkthdr.rcvif; 942 struct ifaddr *ifa; 943 struct carp_softc *sc; 944 945 NET_EPOCH_ASSERT(); 946 MPASS(vh->vrrp_version == CARP_VERSION_VRRPv3); 947 948 ifa = carp_find_ifa(m, af, vh->vrrp_vrtid); 949 if (ifa == NULL) { 950 m_freem(m); 951 return; 952 } 953 954 sc = ifa->ifa_carp; 955 CARP_LOCK(sc); 956 957 ifa_free(ifa); 958 959 /* verify the CARP version. */ 960 if (sc->sc_version != CARP_VERSION_VRRPv3) { 961 CARP_UNLOCK(sc); 962 963 CARPSTATS_INC(carps_badver); 964 CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), 965 vh->vrrp_version); 966 m_freem(m); 967 return; 968 } 969 970 /* verify that the IP TTL is 255. */ 971 if (ttl != CARP_DFLTTL) { 972 CARPSTATS_INC(carps_badttl); 973 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, 974 ttl, if_name(m->m_pkthdr.rcvif)); 975 goto out; 976 } 977 978 if (vrrp_checksum_verify(m, off, len, phdrcksum)) { 979 CARPSTATS_INC(carps_badsum); 980 CARP_DEBUG("%s: incorrect checksum for VRID %u@%s\n", __func__, 981 sc->sc_vhid, if_name(ifp)); 982 goto out; 983 } 984 985 /* RFC9568, 7.1 Receiving VRRP packets. */ 986 if (sc->sc_vrrp_prio == 255) { 987 CARP_DEBUG("%s: our priority is 255. Ignore peer announcement.\n", 988 __func__); 989 goto out; 990 } 991 992 /* XXX TODO Check IP address payload. 
*/ 993 994 sc->sc_vrrp_master_inter = ntohs(vh->vrrp_max_adver_int); 995 996 switch (sc->sc_state) { 997 case INIT: 998 break; 999 case MASTER: 1000 /* 1001 * If we receive an advertisement from a master who's going to 1002 * be more frequent than us, go into BACKUP state. 1003 * Same if the peer has a higher priority than us. 1004 */ 1005 if (ntohs(vh->vrrp_max_adver_int) < sc->sc_vrrp_adv_inter || 1006 vh->vrrp_priority > DEMOTE_VRRP_PRIO(sc)) { 1007 callout_stop(&sc->sc_ad_tmo); 1008 carp_set_state(sc, BACKUP, 1009 "more frequent advertisement received"); 1010 carp_setrun(sc, 0); 1011 carp_delroute(sc); 1012 } 1013 break; 1014 case BACKUP: 1015 /* 1016 * If we're pre-empting masters who advertise slower than us, 1017 * and this one claims to be slower, treat him as down. 1018 */ 1019 if (V_carp_preempt && (ntohs(vh->vrrp_max_adver_int) > sc->sc_vrrp_adv_inter 1020 || vh->vrrp_priority < DEMOTE_VRRP_PRIO(sc))) { 1021 carp_master_down_locked(sc, 1022 "preempting a slower master"); 1023 break; 1024 } 1025 1026 /* 1027 * Otherwise, we reset the counter and wait for the next 1028 * advertisement. 
1029 */ 1030 carp_setrun(sc, af); 1031 break; 1032 } 1033 1034 out: 1035 CARP_UNLOCK(sc); 1036 m_freem(m); 1037 } 1038 1039 static int 1040 carp_tag(struct carp_softc *sc, struct mbuf *m) 1041 { 1042 struct m_tag *mtag; 1043 1044 /* Tag packet for carp_output */ 1045 if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(sc->sc_vhid), 1046 M_NOWAIT)) == NULL) { 1047 m_freem(m); 1048 CARPSTATS_INC(carps_onomem); 1049 return (ENOMEM); 1050 } 1051 bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid)); 1052 m_tag_prepend(m, mtag); 1053 1054 return (0); 1055 } 1056 1057 static void 1058 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) 1059 { 1060 1061 MPASS(sc->sc_version == CARP_VERSION_CARP); 1062 1063 if (sc->sc_init_counter) { 1064 /* this could also be seconds since unix epoch */ 1065 sc->sc_counter = arc4random(); 1066 sc->sc_counter = sc->sc_counter << 32; 1067 sc->sc_counter += arc4random(); 1068 } else 1069 sc->sc_counter++; 1070 1071 ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); 1072 ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); 1073 1074 carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); 1075 } 1076 1077 static inline void 1078 send_ad_locked(struct carp_softc *sc) 1079 { 1080 switch (sc->sc_version) { 1081 case CARP_VERSION_CARP: 1082 carp_send_ad_locked(sc); 1083 break; 1084 case CARP_VERSION_VRRPv3: 1085 vrrp_send_ad_locked(sc); 1086 break; 1087 } 1088 } 1089 1090 /* 1091 * To avoid LORs and possible recursions this function shouldn't 1092 * be called directly, but scheduled via taskqueue. 
1093 */ 1094 static void 1095 carp_send_ad_all(void *ctx __unused, int pending __unused) 1096 { 1097 struct carp_softc *sc; 1098 struct epoch_tracker et; 1099 1100 NET_EPOCH_ENTER(et); 1101 mtx_lock(&carp_mtx); 1102 LIST_FOREACH(sc, &carp_list, sc_next) 1103 if (sc->sc_state == MASTER) { 1104 CARP_LOCK(sc); 1105 CURVNET_SET(sc->sc_carpdev->if_vnet); 1106 send_ad_locked(sc); 1107 CURVNET_RESTORE(); 1108 CARP_UNLOCK(sc); 1109 } 1110 mtx_unlock(&carp_mtx); 1111 NET_EPOCH_EXIT(et); 1112 } 1113 1114 /* Send a periodic advertisement, executed in callout context. */ 1115 static void 1116 carp_callout(void *v) 1117 { 1118 struct carp_softc *sc = v; 1119 struct epoch_tracker et; 1120 1121 NET_EPOCH_ENTER(et); 1122 CARP_LOCK_ASSERT(sc); 1123 CURVNET_SET(sc->sc_carpdev->if_vnet); 1124 send_ad_locked(sc); 1125 CURVNET_RESTORE(); 1126 CARP_UNLOCK(sc); 1127 NET_EPOCH_EXIT(et); 1128 } 1129 1130 static void 1131 carp_send_ad_error(struct carp_softc *sc, int error) 1132 { 1133 1134 /* 1135 * We track errors and successful sends with this logic: 1136 * - Any error resets success counter to 0. 1137 * - MAX_ERRORS triggers demotion. 1138 * - MIN_SUCCESS successes resets error counter to 0. 1139 * - MIN_SUCCESS reverts demotion, if it was triggered before. 
1140 */ 1141 if (error) { 1142 if (sc->sc_sendad_errors < INT_MAX) 1143 sc->sc_sendad_errors++; 1144 if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { 1145 static const char fmt[] = "send error %d on %s"; 1146 char msg[sizeof(fmt) + IFNAMSIZ]; 1147 1148 sprintf(msg, fmt, error, if_name(sc->sc_carpdev)); 1149 carp_demote_adj(V_carp_senderr_adj, msg); 1150 } 1151 sc->sc_sendad_success = 0; 1152 } else if (sc->sc_sendad_errors > 0) { 1153 if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { 1154 if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { 1155 static const char fmt[] = "send ok on %s"; 1156 char msg[sizeof(fmt) + IFNAMSIZ]; 1157 1158 sprintf(msg, fmt, if_name(sc->sc_carpdev)); 1159 carp_demote_adj(-V_carp_senderr_adj, msg); 1160 } 1161 sc->sc_sendad_errors = 0; 1162 } 1163 } 1164 } 1165 1166 /* 1167 * Pick the best ifaddr on the given ifp for sending CARP 1168 * advertisements. 1169 * 1170 * "Best" here is defined by ifa_preferred(). This function is much 1171 * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). 1172 * 1173 * (This could be simplified to return the actual address, except that 1174 * it has a different format in AF_INET and AF_INET6.) 
1175 */ 1176 static struct ifaddr * 1177 carp_best_ifa(int af, struct ifnet *ifp) 1178 { 1179 struct ifaddr *ifa, *best; 1180 1181 NET_EPOCH_ASSERT(); 1182 1183 if (af >= AF_MAX) 1184 return (NULL); 1185 best = NULL; 1186 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1187 if (ifa->ifa_addr->sa_family == af && 1188 (best == NULL || ifa_preferred(best, ifa))) 1189 best = ifa; 1190 } 1191 if (best != NULL) 1192 ifa_ref(best); 1193 return (best); 1194 } 1195 1196 static void 1197 carp_send_ad_locked(struct carp_softc *sc) 1198 { 1199 struct carp_header ch; 1200 struct timeval tv; 1201 struct ifaddr *ifa; 1202 struct carp_header *ch_ptr; 1203 struct mbuf *m; 1204 int len, advskew; 1205 1206 NET_EPOCH_ASSERT(); 1207 CARP_LOCK_ASSERT(sc); 1208 MPASS(sc->sc_version == CARP_VERSION_CARP); 1209 1210 advskew = DEMOTE_ADVSKEW(sc); 1211 tv.tv_sec = sc->sc_advbase; 1212 tv.tv_usec = advskew * 1000000 / 256; 1213 1214 ch.carp_version = CARP_VERSION_CARP; 1215 ch.carp_type = CARP_ADVERTISEMENT; 1216 ch.carp_vhid = sc->sc_vhid; 1217 ch.carp_advbase = sc->sc_advbase; 1218 ch.carp_advskew = advskew; 1219 ch.carp_authlen = 7; /* XXX DEFINE */ 1220 ch.carp_pad1 = 0; /* must be zero */ 1221 ch.carp_cksum = 0; 1222 1223 /* XXXGL: OpenBSD picks first ifaddr with needed family. 
*/ 1224 1225 #ifdef INET 1226 if (sc->sc_naddrs) { 1227 struct ip *ip; 1228 1229 m = m_gethdr(M_NOWAIT, MT_DATA); 1230 if (m == NULL) { 1231 CARPSTATS_INC(carps_onomem); 1232 goto resched; 1233 } 1234 len = sizeof(*ip) + sizeof(ch); 1235 m->m_pkthdr.len = len; 1236 m->m_pkthdr.rcvif = NULL; 1237 m->m_len = len; 1238 M_ALIGN(m, m->m_len); 1239 if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr))) 1240 m->m_flags |= M_MCAST; 1241 ip = mtod(m, struct ip *); 1242 ip->ip_v = IPVERSION; 1243 ip->ip_hl = sizeof(*ip) >> 2; 1244 ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; 1245 ip->ip_len = htons(len); 1246 ip->ip_off = htons(IP_DF); 1247 ip->ip_ttl = CARP_DFLTTL; 1248 ip->ip_p = IPPROTO_CARP; 1249 ip->ip_sum = 0; 1250 ip_fillid(ip, V_ip_random_id); 1251 1252 ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); 1253 if (ifa != NULL) { 1254 ip->ip_src.s_addr = 1255 ifatoia(ifa)->ia_addr.sin_addr.s_addr; 1256 ifa_free(ifa); 1257 } else 1258 ip->ip_src.s_addr = 0; 1259 ip->ip_dst = sc->sc_carpaddr; 1260 1261 ch_ptr = (struct carp_header *)(&ip[1]); 1262 bcopy(&ch, ch_ptr, sizeof(ch)); 1263 carp_prepare_ad(m, sc, ch_ptr); 1264 if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)) && 1265 carp_tag(sc, m) != 0) 1266 goto resched; 1267 1268 m->m_data += sizeof(*ip); 1269 ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); 1270 m->m_data -= sizeof(*ip); 1271 1272 CARPSTATS_INC(carps_opackets); 1273 1274 carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, 1275 &sc->sc_carpdev->if_carp->cif_imo, NULL)); 1276 } 1277 #endif /* INET */ 1278 #ifdef INET6 1279 if (sc->sc_naddrs6) { 1280 struct ip6_hdr *ip6; 1281 1282 m = m_gethdr(M_NOWAIT, MT_DATA); 1283 if (m == NULL) { 1284 CARPSTATS_INC(carps_onomem); 1285 goto resched; 1286 } 1287 len = sizeof(*ip6) + sizeof(ch); 1288 m->m_pkthdr.len = len; 1289 m->m_pkthdr.rcvif = NULL; 1290 m->m_len = len; 1291 M_ALIGN(m, m->m_len); 1292 ip6 = mtod(m, struct ip6_hdr *); 1293 bzero(ip6, sizeof(*ip6)); 1294 ip6->ip6_vfc |= IPV6_VERSION; 1295 /* Traffic 
class isn't defined in ip6 struct instead 1296 * it gets offset into flowid field */ 1297 ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + 1298 IPTOS_DSCP_OFFSET)); 1299 ip6->ip6_hlim = CARP_DFLTTL; 1300 ip6->ip6_nxt = IPPROTO_CARP; 1301 1302 /* set the source address */ 1303 ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); 1304 if (ifa != NULL) { 1305 bcopy(IFA_IN6(ifa), &ip6->ip6_src, 1306 sizeof(struct in6_addr)); 1307 ifa_free(ifa); 1308 } else 1309 /* This should never happen with IPv6. */ 1310 bzero(&ip6->ip6_src, sizeof(struct in6_addr)); 1311 1312 /* Set the multicast destination. */ 1313 memcpy(&ip6->ip6_dst, &sc->sc_carpaddr6, sizeof(ip6->ip6_dst)); 1314 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 1315 IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { 1316 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { 1317 m_freem(m); 1318 CARP_DEBUG("%s: in6_setscope failed\n", __func__); 1319 goto resched; 1320 } 1321 } 1322 1323 ch_ptr = (struct carp_header *)(&ip6[1]); 1324 bcopy(&ch, ch_ptr, sizeof(ch)); 1325 carp_prepare_ad(m, sc, ch_ptr); 1326 if (IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6) && 1327 carp_tag(sc, m) != 0) 1328 goto resched; 1329 1330 m->m_data += sizeof(*ip6); 1331 ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); 1332 m->m_data -= sizeof(*ip6); 1333 1334 CARPSTATS_INC(carps_opackets6); 1335 1336 carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, 1337 &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); 1338 } 1339 #endif /* INET6 */ 1340 1341 resched: 1342 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_callout, sc); 1343 } 1344 1345 static void 1346 vrrp_send_ad_locked(struct carp_softc *sc) 1347 { 1348 struct vrrpv3_header *vh_ptr; 1349 struct ifaddr *ifa; 1350 struct mbuf *m; 1351 int len; 1352 struct vrrpv3_header vh = { 1353 .vrrp_version = CARP_VERSION_VRRPv3, 1354 .vrrp_type = VRRP_TYPE_ADVERTISEMENT, 1355 .vrrp_vrtid = sc->sc_vhid, 1356 .vrrp_priority = DEMOTE_VRRP_PRIO(sc), 1357 .vrrp_count_addr = 0, 1358 .vrrp_max_adver_int = 
htons(sc->sc_vrrp_adv_inter), 1359 .vrrp_checksum = 0, 1360 }; 1361 1362 NET_EPOCH_ASSERT(); 1363 CARP_LOCK_ASSERT(sc); 1364 MPASS(sc->sc_version == CARP_VERSION_VRRPv3); 1365 1366 #ifdef INET 1367 if (sc->sc_naddrs) { 1368 struct ip *ip; 1369 1370 m = m_gethdr(M_NOWAIT, MT_DATA); 1371 if (m == NULL) { 1372 CARPSTATS_INC(carps_onomem); 1373 goto resched; 1374 } 1375 len = sizeof(*ip) + sizeof(vh); 1376 m->m_pkthdr.len = len; 1377 m->m_pkthdr.rcvif = NULL; 1378 m->m_len = len; 1379 M_ALIGN(m, m->m_len); 1380 m->m_flags |= M_MCAST; 1381 ip = mtod(m, struct ip *); 1382 ip->ip_v = IPVERSION; 1383 ip->ip_hl = sizeof(*ip) >> 2; 1384 ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; 1385 ip->ip_off = htons(IP_DF); 1386 ip->ip_ttl = CARP_DFLTTL; 1387 ip->ip_p = IPPROTO_CARP; 1388 ip->ip_sum = 0; 1389 ip_fillid(ip, V_ip_random_id); 1390 1391 ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); 1392 if (ifa != NULL) { 1393 ip->ip_src.s_addr = 1394 ifatoia(ifa)->ia_addr.sin_addr.s_addr; 1395 ifa_free(ifa); 1396 } else 1397 ip->ip_src.s_addr = 0; 1398 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); 1399 1400 /* Include the IP addresses in the announcement. 
*/ 1401 for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { 1402 struct sockaddr_in *in; 1403 1404 MPASS(sc->sc_ifas[i] != NULL); 1405 if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET) 1406 continue; 1407 1408 in = (struct sockaddr_in *)sc->sc_ifas[i]->ifa_addr; 1409 1410 if (m_append(m, sizeof(in->sin_addr), 1411 (caddr_t)&in->sin_addr) != 1) { 1412 m_freem(m); 1413 goto resched; 1414 } 1415 1416 vh.vrrp_count_addr++; 1417 len += sizeof(in->sin_addr); 1418 } 1419 ip->ip_len = htons(len); 1420 1421 vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip)); 1422 bcopy(&vh, vh_ptr, sizeof(vh)); 1423 1424 vh_ptr->vrrp_checksum = in_pseudo(ip->ip_src.s_addr, 1425 ip->ip_dst.s_addr, 1426 htonl((uint16_t)(len - sizeof(*ip)) + ip->ip_p)); 1427 vh_ptr->vrrp_checksum = in_cksum_skip(m, len, sizeof(*ip)); 1428 1429 if (carp_tag(sc, m)) 1430 goto resched; 1431 1432 CARPSTATS_INC(carps_opackets); 1433 1434 carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, 1435 &sc->sc_carpdev->if_carp->cif_imo, NULL)); 1436 } 1437 #endif 1438 #ifdef INET6 1439 if (sc->sc_naddrs6) { 1440 struct ip6_hdr *ip6; 1441 1442 m = m_gethdr(M_NOWAIT, MT_DATA); 1443 if (m == NULL) { 1444 CARPSTATS_INC(carps_onomem); 1445 goto resched; 1446 } 1447 len = sizeof(*ip6) + sizeof(vh); 1448 m->m_pkthdr.len = len; 1449 m->m_pkthdr.rcvif = NULL; 1450 m->m_len = len; 1451 M_ALIGN(m, m->m_len); 1452 m->m_flags |= M_MCAST; 1453 ip6 = mtod(m, struct ip6_hdr *); 1454 bzero(ip6, sizeof(*ip6)); 1455 ip6->ip6_vfc |= IPV6_VERSION; 1456 /* Traffic class isn't defined in ip6 struct instead 1457 * it gets offset into flowid field */ 1458 ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + 1459 IPTOS_DSCP_OFFSET)); 1460 ip6->ip6_hlim = CARP_DFLTTL; 1461 ip6->ip6_nxt = IPPROTO_CARP; 1462 1463 /* set the source address */ 1464 ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); 1465 if (ifa != NULL) { 1466 bcopy(IFA_IN6(ifa), &ip6->ip6_src, 1467 sizeof(struct in6_addr)); 1468 ifa_free(ifa); 1469 } else 
1470 /* This should never happen with IPv6. */ 1471 bzero(&ip6->ip6_src, sizeof(struct in6_addr)); 1472 1473 /* Set the multicast destination. */ 1474 bzero(&ip6->ip6_dst, sizeof(ip6->ip6_dst)); 1475 ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; 1476 ip6->ip6_dst.s6_addr8[15] = 0x12; 1477 1478 /* Include the IP addresses in the announcement. */ 1479 len = sizeof(vh); 1480 for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { 1481 struct sockaddr_in6 *in6; 1482 1483 MPASS(sc->sc_ifas[i] != NULL); 1484 if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET6) 1485 continue; 1486 1487 in6 = (struct sockaddr_in6 *)sc->sc_ifas[i]->ifa_addr; 1488 1489 if (m_append(m, sizeof(in6->sin6_addr), 1490 (char *)&in6->sin6_addr) != 1) { 1491 m_freem(m); 1492 goto resched; 1493 } 1494 1495 vh.vrrp_count_addr++; 1496 len += sizeof(in6->sin6_addr); 1497 } 1498 ip6->ip6_plen = htonl(len); 1499 1500 vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6)); 1501 bcopy(&vh, vh_ptr, sizeof(vh)); 1502 1503 vh_ptr->vrrp_checksum = in6_cksum_pseudo(ip6, len, ip6->ip6_nxt, 0); 1504 vh_ptr->vrrp_checksum = in_cksum_skip(m, len + sizeof(*ip6), sizeof(*ip6)); 1505 1506 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { 1507 m_freem(m); 1508 CARP_DEBUG("%s: in6_setscope failed\n", __func__); 1509 goto resched; 1510 } 1511 1512 if (carp_tag(sc, m)) 1513 goto resched; 1514 CARPSTATS_INC(carps_opackets6); 1515 1516 carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, 1517 &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); 1518 } 1519 #endif 1520 1521 resched: 1522 callout_reset(&sc->sc_ad_tmo, sc->sc_vrrp_adv_inter * hz / 100, 1523 carp_callout, sc); 1524 } 1525 1526 static void 1527 carp_addroute(struct carp_softc *sc) 1528 { 1529 struct ifaddr *ifa; 1530 1531 CARP_FOREACH_IFA(sc, ifa) 1532 carp_ifa_addroute(ifa); 1533 } 1534 1535 static void 1536 carp_ifa_addroute(struct ifaddr *ifa) 1537 { 1538 1539 switch (ifa->ifa_addr->sa_family) { 1540 #ifdef INET 1541 case AF_INET: 1542 
in_addprefix(ifatoia(ifa)); 1543 ifa_add_loopback_route(ifa, 1544 (struct sockaddr *)&ifatoia(ifa)->ia_addr); 1545 break; 1546 #endif 1547 #ifdef INET6 1548 case AF_INET6: 1549 ifa_add_loopback_route(ifa, 1550 (struct sockaddr *)&ifatoia6(ifa)->ia_addr); 1551 nd6_add_ifa_lle(ifatoia6(ifa)); 1552 break; 1553 #endif 1554 } 1555 } 1556 1557 static void 1558 carp_delroute(struct carp_softc *sc) 1559 { 1560 struct ifaddr *ifa; 1561 1562 CARP_FOREACH_IFA(sc, ifa) 1563 carp_ifa_delroute(ifa); 1564 } 1565 1566 static void 1567 carp_ifa_delroute(struct ifaddr *ifa) 1568 { 1569 1570 switch (ifa->ifa_addr->sa_family) { 1571 #ifdef INET 1572 case AF_INET: 1573 ifa_del_loopback_route(ifa, 1574 (struct sockaddr *)&ifatoia(ifa)->ia_addr); 1575 in_scrubprefix(ifatoia(ifa), LLE_STATIC); 1576 break; 1577 #endif 1578 #ifdef INET6 1579 case AF_INET6: 1580 ifa_del_loopback_route(ifa, 1581 (struct sockaddr *)&ifatoia6(ifa)->ia_addr); 1582 nd6_rem_ifa_lle(ifatoia6(ifa), 1); 1583 break; 1584 #endif 1585 } 1586 } 1587 1588 int 1589 carp_master(struct ifaddr *ifa) 1590 { 1591 struct carp_softc *sc = ifa->ifa_carp; 1592 1593 return (sc->sc_state == MASTER); 1594 } 1595 1596 #ifdef INET 1597 /* 1598 * Broadcast a gratuitous ARP request containing 1599 * the virtual router MAC address for each IP address 1600 * associated with the virtual router. 
1601 */ 1602 static void 1603 carp_send_arp(struct carp_softc *sc) 1604 { 1605 struct ifaddr *ifa; 1606 struct in_addr addr; 1607 1608 NET_EPOCH_ASSERT(); 1609 1610 CARP_FOREACH_IFA(sc, ifa) { 1611 if (ifa->ifa_addr->sa_family != AF_INET) 1612 continue; 1613 addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; 1614 arp_announce_ifaddr(sc->sc_carpdev, addr, sc->sc_addr); 1615 } 1616 } 1617 1618 int 1619 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) 1620 { 1621 struct carp_softc *sc = ifa->ifa_carp; 1622 1623 if (sc->sc_state == MASTER) { 1624 *enaddr = sc->sc_addr; 1625 return (1); 1626 } 1627 1628 return (0); 1629 } 1630 #endif 1631 1632 #ifdef INET6 1633 static void 1634 carp_send_na(struct carp_softc *sc) 1635 { 1636 struct ifaddr *ifa; 1637 int flags; 1638 1639 /* 1640 * Sending Unsolicited Neighbor Advertisements 1641 * 1642 * If the node is a router, we MUST set the Router flag to one. 1643 * We set Override flag to one and send link-layer address option, 1644 * thus neighboring nodes will install the new link-layer address. 1645 */ 1646 flags = ND_NA_FLAG_OVERRIDE; 1647 if (V_ip6_forwarding) 1648 flags |= ND_NA_FLAG_ROUTER; 1649 CARP_FOREACH_IFA(sc, ifa) { 1650 if (ifa->ifa_addr->sa_family != AF_INET6) 1651 continue; 1652 /* 1653 * We use unspecified address as destination here to avoid 1654 * scope initialization for each call. 1655 * nd6_na_output() will use all nodes multicast address if 1656 * destinaion address is unspecified. 1657 */ 1658 nd6_na_output(sc->sc_carpdev, &in6addr_any, IFA_IN6(ifa), 1659 flags, ND6_NA_OPT_LLA | ND6_NA_CARP_MASTER, NULL); 1660 DELAY(1000); /* RetransTimer */ 1661 } 1662 } 1663 1664 /* 1665 * Returns ifa in case it's a carp address and it is MASTER, or if the address 1666 * matches and is not a carp address. Returns NULL otherwise. 
 */
struct ifaddr *
carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
{
	struct ifaddr *ifa;

	NET_EPOCH_ASSERT();

	ifa = NULL;
	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
		if (ifa->ifa_addr->sa_family != AF_INET6)
			continue;
		if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
			continue;
		/*
		 * First address match decides the result: a carp address
		 * whose router is not MASTER yields NULL, anything else is
		 * returned with a reference taken for the caller.
		 */
		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
			ifa = NULL;
		else
			ifa_ref(ifa);
		break;
	}

	return (ifa);
}

/*
 * Look up 'taddr' among the interface addresses visited by
 * IFNET_FOREACH_IFA and, on the first match, tag the mbuf for
 * carp_output.
 *
 * Returns a pointer to a copy of the virtual router's link-level
 * address stored inside the mbuf tag, or NULL when no address matched
 * or the tag could not be allocated.
 *
 * NOTE(review): sc is dereferenced without a NULL check; presumably
 * IFNET_FOREACH_IFA only yields addresses with ifa_carp set -- confirm
 * against the macro definition.
 */
char *
carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
{
	struct ifaddr *ifa;
	char *mac = NULL;

	NET_EPOCH_ASSERT();

	IFNET_FOREACH_IFA(ifp, ifa)
		if (ifa->ifa_addr->sa_family == AF_INET6 &&
		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
			struct carp_softc *sc = ifa->ifa_carp;
			struct m_tag *mtag;

			/* Tag payload layout: sc_vhid followed by sc_addr. */
			mtag = m_tag_get(PACKET_TAG_CARP,
			    sizeof(sc->sc_vhid) + sizeof(sc->sc_addr),
			    M_NOWAIT);
			if (mtag == NULL) {
				CARPSTATS_INC(carps_onomem);
				break;
			}
			/* carp_output expects sc_vhid first. */
			bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid));
			/*
			 * Save sc_addr into mtag data after sc_vhid to avoid
			 * possible access to destroyed softc.
			 */
			mac = (char *)(mtag + 1) + sizeof(sc->sc_vhid);
			bcopy(sc->sc_addr, mac, sizeof(sc->sc_addr));

			m_tag_prepend(m, mtag);
			break;
		}

	return (mac);
}
#endif /* INET6 */

/*
 * Return 1 if 'dhost' is the link-level address of a virtual router on
 * 'ifp' that is currently MASTER, 0 otherwise.
 */
int
carp_forus(struct ifnet *ifp, u_char *dhost)
{
	struct carp_softc *sc;
	uint8_t *ena = dhost;

	/*
	 * Cheap reject: virtual router addresses have the form
	 * 00:00:5e:00:01:<vhid>, so anything else cannot match.
	 */
	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
		return (0);

	CIF_LOCK(ifp->if_carp);
	IFNET_FOREACH_CARP(ifp, sc) {
		/*
		 * CARP_LOCK() is not here, since would protect nothing, but
		 * cause deadlock with if_bridge, calling this under its lock.
		 */
		if (sc->sc_state == MASTER && !bcmp(dhost, sc->sc_addr,
		    ETHER_ADDR_LEN)) {
			CIF_UNLOCK(ifp->if_carp);
			return (1);
		}
	}
	CIF_UNLOCK(ifp->if_carp);

	return (0);
}

/*
 * Master down timeout event, executed in callout context.
 *
 * The softc lock is asserted on entry and released here before
 * returning (the callout is set up with CALLOUT_RETURNUNLOCKED).
 */
static void
carp_master_down(void *v)
{
	struct carp_softc *sc = v;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	CARP_LOCK_ASSERT(sc);

	CURVNET_SET(sc->sc_carpdev->if_vnet);
	/* Only a BACKUP may take over; other states ignore the timeout. */
	if (sc->sc_state == BACKUP) {
		carp_master_down_locked(sc, "master timed out");
	}
	CURVNET_RESTORE();

	CARP_UNLOCK(sc);
	NET_EPOCH_EXIT(et);
}

/*
 * Promote a BACKUP virtual router to MASTER: change state, send an
 * advertisement, announce the addresses via ARP/NA, re-arm the timers
 * and install the routes.  'reason' is passed on to carp_set_state().
 *
 * Calling this in INIT or MASTER state panics on INVARIANTS kernels
 * and does nothing otherwise.
 */
static void
carp_master_down_locked(struct carp_softc *sc, const char *reason)
{

	NET_EPOCH_ASSERT();
	CARP_LOCK_ASSERT(sc);

	switch (sc->sc_state) {
	case BACKUP:
		carp_set_state(sc, MASTER, reason);
		send_ad_locked(sc);
#ifdef INET
		carp_send_arp(sc);
#endif
#ifdef INET6
		carp_send_na(sc);
#endif
		carp_setrun(sc, 0);
		carp_addroute(sc);
		break;
	case INIT:
	case MASTER:
#ifdef INVARIANTS
		panic("carp: VHID %u@%s: master_down event in %s state\n",
		    sc->sc_vhid,
		    if_name(sc->sc_carpdev),
		    sc->sc_state ? "MASTER" : "INIT");
#endif
		break;
	}
}

/*
 * When in backup state, af indicates whether to reset the master down timer
 * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1810 */ 1811 static void 1812 carp_setrun(struct carp_softc *sc, sa_family_t af) 1813 { 1814 struct timeval tv; 1815 int timeout; 1816 1817 CARP_LOCK_ASSERT(sc); 1818 1819 if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || 1820 sc->sc_carpdev->if_link_state != LINK_STATE_UP || 1821 (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || 1822 !V_carp_allow) 1823 return; 1824 1825 switch (sc->sc_state) { 1826 case INIT: 1827 carp_set_state(sc, BACKUP, "initialization complete"); 1828 carp_setrun(sc, 0); 1829 break; 1830 case BACKUP: 1831 callout_stop(&sc->sc_ad_tmo); 1832 1833 switch (sc->sc_version) { 1834 case CARP_VERSION_CARP: 1835 tv.tv_sec = 3 * sc->sc_advbase; 1836 tv.tv_usec = sc->sc_advskew * 1000000 / 256; 1837 timeout = tvtohz(&tv); 1838 break; 1839 case CARP_VERSION_VRRPv3: 1840 /* skew time */ 1841 timeout = (256 - sc->sc_vrrp_prio) * 1842 sc->sc_vrrp_master_inter / 256; 1843 timeout += (3 * sc->sc_vrrp_master_inter); 1844 timeout *= hz; 1845 timeout /= 100; /* master interval is in centiseconds */ 1846 break; 1847 } 1848 switch (af) { 1849 #ifdef INET 1850 case AF_INET: 1851 callout_reset(&sc->sc_md_tmo, timeout, 1852 carp_master_down, sc); 1853 break; 1854 #endif 1855 #ifdef INET6 1856 case AF_INET6: 1857 callout_reset(&sc->sc_md6_tmo, timeout, 1858 carp_master_down, sc); 1859 break; 1860 #endif 1861 default: 1862 #ifdef INET 1863 if (sc->sc_naddrs) 1864 callout_reset(&sc->sc_md_tmo, timeout, 1865 carp_master_down, sc); 1866 #endif 1867 #ifdef INET6 1868 if (sc->sc_naddrs6) 1869 callout_reset(&sc->sc_md6_tmo, timeout, 1870 carp_master_down, sc); 1871 #endif 1872 break; 1873 } 1874 break; 1875 case MASTER: 1876 switch (sc->sc_version) { 1877 case CARP_VERSION_CARP: 1878 tv.tv_sec = sc->sc_advbase; 1879 tv.tv_usec = sc->sc_advskew * 1000000 / 256; 1880 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), 1881 carp_callout, sc); 1882 break; 1883 case CARP_VERSION_VRRPv3: 1884 callout_reset(&sc->sc_ad_tmo, 1885 sc->sc_vrrp_adv_inter * hz / 100, 1886 carp_callout, sc); 1887 
break; 1888 } 1889 break; 1890 } 1891 } 1892 1893 /* 1894 * Setup multicast structures. 1895 */ 1896 static int 1897 carp_multicast_setup(struct carp_if *cif, sa_family_t sa) 1898 { 1899 struct ifnet *ifp = cif->cif_ifp; 1900 int error = 0; 1901 1902 switch (sa) { 1903 #ifdef INET 1904 case AF_INET: 1905 { 1906 struct ip_moptions *imo = &cif->cif_imo; 1907 struct in_mfilter *imf; 1908 struct in_addr addr; 1909 1910 if (ip_mfilter_first(&imo->imo_head) != NULL) 1911 return (0); 1912 1913 imf = ip_mfilter_alloc(M_WAITOK, 0, 0); 1914 ip_mfilter_init(&imo->imo_head); 1915 imo->imo_multicast_vif = -1; 1916 1917 addr.s_addr = htonl(INADDR_CARP_GROUP); 1918 if ((error = in_joingroup(ifp, &addr, NULL, 1919 &imf->imf_inm)) != 0) { 1920 ip_mfilter_free(imf); 1921 break; 1922 } 1923 1924 ip_mfilter_insert(&imo->imo_head, imf); 1925 imo->imo_multicast_ifp = ifp; 1926 imo->imo_multicast_ttl = CARP_DFLTTL; 1927 imo->imo_multicast_loop = 0; 1928 break; 1929 } 1930 #endif 1931 #ifdef INET6 1932 case AF_INET6: 1933 { 1934 struct ip6_moptions *im6o = &cif->cif_im6o; 1935 struct in6_mfilter *im6f[2]; 1936 struct in6_addr in6; 1937 1938 if (ip6_mfilter_first(&im6o->im6o_head)) 1939 return (0); 1940 1941 im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); 1942 im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); 1943 1944 ip6_mfilter_init(&im6o->im6o_head); 1945 im6o->im6o_multicast_hlim = CARP_DFLTTL; 1946 im6o->im6o_multicast_ifp = ifp; 1947 1948 /* Join IPv6 CARP multicast group. */ 1949 bzero(&in6, sizeof(in6)); 1950 in6.s6_addr16[0] = htons(0xff02); 1951 in6.s6_addr8[15] = 0x12; 1952 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { 1953 ip6_mfilter_free(im6f[0]); 1954 ip6_mfilter_free(im6f[1]); 1955 break; 1956 } 1957 if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { 1958 ip6_mfilter_free(im6f[0]); 1959 ip6_mfilter_free(im6f[1]); 1960 break; 1961 } 1962 1963 /* Join solicited multicast address. 
*/ 1964 bzero(&in6, sizeof(in6)); 1965 in6.s6_addr16[0] = htons(0xff02); 1966 in6.s6_addr32[1] = 0; 1967 in6.s6_addr32[2] = htonl(1); 1968 in6.s6_addr32[3] = 0; 1969 in6.s6_addr8[12] = 0xff; 1970 1971 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { 1972 ip6_mfilter_free(im6f[0]); 1973 ip6_mfilter_free(im6f[1]); 1974 break; 1975 } 1976 1977 if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { 1978 in6_leavegroup(im6f[0]->im6f_in6m, NULL); 1979 ip6_mfilter_free(im6f[0]); 1980 ip6_mfilter_free(im6f[1]); 1981 break; 1982 } 1983 ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); 1984 ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); 1985 break; 1986 } 1987 #endif 1988 } 1989 1990 return (error); 1991 } 1992 1993 /* 1994 * Free multicast structures. 1995 */ 1996 static void 1997 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) 1998 { 1999 #ifdef INET 2000 struct ip_moptions *imo = &cif->cif_imo; 2001 struct in_mfilter *imf; 2002 #endif 2003 #ifdef INET6 2004 struct ip6_moptions *im6o = &cif->cif_im6o; 2005 struct in6_mfilter *im6f; 2006 #endif 2007 sx_assert(&carp_sx, SA_XLOCKED); 2008 2009 switch (sa) { 2010 #ifdef INET 2011 case AF_INET: 2012 if (cif->cif_naddrs != 0) 2013 break; 2014 2015 while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { 2016 ip_mfilter_remove(&imo->imo_head, imf); 2017 in_leavegroup(imf->imf_inm, NULL); 2018 ip_mfilter_free(imf); 2019 } 2020 break; 2021 #endif 2022 #ifdef INET6 2023 case AF_INET6: 2024 if (cif->cif_naddrs6 != 0) 2025 break; 2026 2027 while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { 2028 ip6_mfilter_remove(&im6o->im6o_head, im6f); 2029 in6_leavegroup(im6f->im6f_in6m, NULL); 2030 ip6_mfilter_free(im6f); 2031 } 2032 break; 2033 #endif 2034 } 2035 } 2036 2037 int 2038 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) 2039 { 2040 struct m_tag *mtag; 2041 int vhid; 2042 2043 if (!sa) 2044 return (0); 2045 2046 switch (sa->sa_family) { 2047 #ifdef INET 2048 case 
AF_INET: 2049 break; 2050 #endif 2051 #ifdef INET6 2052 case AF_INET6: 2053 break; 2054 #endif 2055 default: 2056 return (0); 2057 } 2058 2059 mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); 2060 if (mtag == NULL) 2061 return (0); 2062 2063 bcopy(mtag + 1, &vhid, sizeof(vhid)); 2064 2065 /* Set the source MAC address to the Virtual Router MAC Address. */ 2066 switch (ifp->if_type) { 2067 case IFT_ETHER: 2068 case IFT_BRIDGE: 2069 case IFT_L2VLAN: { 2070 struct ether_header *eh; 2071 2072 eh = mtod(m, struct ether_header *); 2073 eh->ether_shost[0] = 0; 2074 eh->ether_shost[1] = 0; 2075 eh->ether_shost[2] = 0x5e; 2076 eh->ether_shost[3] = 0; 2077 eh->ether_shost[4] = 1; 2078 eh->ether_shost[5] = vhid; 2079 } 2080 break; 2081 default: 2082 printf("%s: carp is not supported for the %d interface type\n", 2083 if_name(ifp), ifp->if_type); 2084 return (EOPNOTSUPP); 2085 } 2086 2087 return (0); 2088 } 2089 2090 static struct carp_softc* 2091 carp_alloc(struct ifnet *ifp, carp_version_t version, int vhid) 2092 { 2093 struct carp_softc *sc; 2094 struct carp_if *cif; 2095 2096 sx_assert(&carp_sx, SA_XLOCKED); 2097 2098 if ((cif = ifp->if_carp) == NULL) 2099 cif = carp_alloc_if(ifp); 2100 2101 sc = malloc(sizeof(*sc), M_CARP, M_WAITOK); 2102 *sc = (struct carp_softc ){ 2103 .sc_vhid = vhid, 2104 .sc_version = version, 2105 .sc_state = INIT, 2106 .sc_carpdev = ifp, 2107 .sc_ifasiz = sizeof(struct ifaddr *), 2108 .sc_addr = { 0, 0, 0x5e, 0, 1, vhid }, 2109 }; 2110 sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); 2111 2112 switch (version) { 2113 case CARP_VERSION_CARP: 2114 sc->sc_advbase = CARP_DFLTINTV; 2115 sc->sc_init_counter = true; 2116 sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP); 2117 sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; 2118 sc->sc_carpaddr6.s6_addr8[15] = 0x12; 2119 break; 2120 case CARP_VERSION_VRRPv3: 2121 sc->sc_vrrp_adv_inter = 100; 2122 sc->sc_vrrp_master_inter = sc->sc_vrrp_adv_inter; 2123 sc->sc_vrrp_prio = 100; 2124 break; 2125 } 
2126 2127 CARP_LOCK_INIT(sc); 2128 #ifdef INET 2129 callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); 2130 #endif 2131 #ifdef INET6 2132 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); 2133 #endif 2134 callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); 2135 2136 CIF_LOCK(cif); 2137 TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); 2138 CIF_UNLOCK(cif); 2139 2140 mtx_lock(&carp_mtx); 2141 LIST_INSERT_HEAD(&carp_list, sc, sc_next); 2142 mtx_unlock(&carp_mtx); 2143 2144 return (sc); 2145 } 2146 2147 static void 2148 carp_grow_ifas(struct carp_softc *sc) 2149 { 2150 struct ifaddr **new; 2151 2152 new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); 2153 CARP_LOCK(sc); 2154 bcopy(sc->sc_ifas, new, sc->sc_ifasiz); 2155 free(sc->sc_ifas, M_CARP); 2156 sc->sc_ifas = new; 2157 sc->sc_ifasiz *= 2; 2158 CARP_UNLOCK(sc); 2159 } 2160 2161 static void 2162 carp_destroy(struct carp_softc *sc) 2163 { 2164 struct ifnet *ifp = sc->sc_carpdev; 2165 struct carp_if *cif = ifp->if_carp; 2166 2167 sx_assert(&carp_sx, SA_XLOCKED); 2168 2169 if (sc->sc_suppress) 2170 carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); 2171 CARP_UNLOCK(sc); 2172 2173 CIF_LOCK(cif); 2174 TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); 2175 CIF_UNLOCK(cif); 2176 2177 mtx_lock(&carp_mtx); 2178 LIST_REMOVE(sc, sc_next); 2179 mtx_unlock(&carp_mtx); 2180 2181 callout_drain(&sc->sc_ad_tmo); 2182 #ifdef INET 2183 callout_drain(&sc->sc_md_tmo); 2184 #endif 2185 #ifdef INET6 2186 callout_drain(&sc->sc_md6_tmo); 2187 #endif 2188 CARP_LOCK_DESTROY(sc); 2189 2190 free(sc->sc_ifas, M_CARP); 2191 free(sc, M_CARP); 2192 } 2193 2194 static struct carp_if* 2195 carp_alloc_if(struct ifnet *ifp) 2196 { 2197 struct carp_if *cif; 2198 int error; 2199 2200 cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); 2201 2202 if ((error = ifpromisc(ifp, 1)) != 0) 2203 printf("%s: ifpromisc(%s) failed: %d\n", 2204 __func__, if_name(ifp), error); 2205 else 2206 cif->cif_flags 
|= CIF_PROMISC; 2207 2208 CIF_LOCK_INIT(cif); 2209 cif->cif_ifp = ifp; 2210 TAILQ_INIT(&cif->cif_vrs); 2211 2212 IF_ADDR_WLOCK(ifp); 2213 ifp->if_carp = cif; 2214 if_ref(ifp); 2215 IF_ADDR_WUNLOCK(ifp); 2216 2217 return (cif); 2218 } 2219 2220 static void 2221 carp_free_if(struct carp_if *cif) 2222 { 2223 struct ifnet *ifp = cif->cif_ifp; 2224 2225 CIF_LOCK_ASSERT(cif); 2226 KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", 2227 __func__)); 2228 2229 IF_ADDR_WLOCK(ifp); 2230 ifp->if_carp = NULL; 2231 IF_ADDR_WUNLOCK(ifp); 2232 2233 CIF_LOCK_DESTROY(cif); 2234 2235 if (cif->cif_flags & CIF_PROMISC) 2236 ifpromisc(ifp, 0); 2237 if_rele(ifp); 2238 2239 free(cif, M_CARP); 2240 } 2241 2242 static int 2243 carp_get_vhid(struct ifaddr *ifa) 2244 { 2245 2246 if (ifa == NULL || ifa->ifa_carp == NULL) 2247 return (0); 2248 2249 return (ifa->ifa_carp->sc_vhid); 2250 } 2251 2252 int 2253 carp_attach(struct ifaddr *ifa, int vhid) 2254 { 2255 struct ifnet *ifp = ifa->ifa_ifp; 2256 struct carp_if *cif = ifp->if_carp; 2257 struct carp_softc *sc; 2258 int index, error; 2259 2260 KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); 2261 2262 switch (ifa->ifa_addr->sa_family) { 2263 #ifdef INET 2264 case AF_INET: 2265 #endif 2266 #ifdef INET6 2267 case AF_INET6: 2268 #endif 2269 break; 2270 default: 2271 return (EPROTOTYPE); 2272 } 2273 2274 sx_xlock(&carp_sx); 2275 if (ifp->if_carp == NULL) { 2276 sx_xunlock(&carp_sx); 2277 return (ENOPROTOOPT); 2278 } 2279 2280 IFNET_FOREACH_CARP(ifp, sc) 2281 if (sc->sc_vhid == vhid) 2282 break; 2283 if (sc == NULL) { 2284 sx_xunlock(&carp_sx); 2285 return (ENOENT); 2286 } 2287 2288 error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); 2289 if (error) { 2290 CIF_FREE(cif); 2291 sx_xunlock(&carp_sx); 2292 return (error); 2293 } 2294 2295 index = sc->sc_naddrs + sc->sc_naddrs6 + 1; 2296 if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) 2297 carp_grow_ifas(sc); 2298 2299 switch (ifa->ifa_addr->sa_family) { 
2300 #ifdef INET 2301 case AF_INET: 2302 cif->cif_naddrs++; 2303 sc->sc_naddrs++; 2304 break; 2305 #endif 2306 #ifdef INET6 2307 case AF_INET6: 2308 cif->cif_naddrs6++; 2309 sc->sc_naddrs6++; 2310 break; 2311 #endif 2312 } 2313 2314 ifa_ref(ifa); 2315 2316 CARP_LOCK(sc); 2317 sc->sc_ifas[index - 1] = ifa; 2318 ifa->ifa_carp = sc; 2319 if (sc->sc_version == CARP_VERSION_CARP) 2320 carp_hmac_prepare(sc); 2321 carp_sc_state(sc); 2322 CARP_UNLOCK(sc); 2323 2324 sx_xunlock(&carp_sx); 2325 2326 return (0); 2327 } 2328 2329 void 2330 carp_detach(struct ifaddr *ifa, bool keep_cif) 2331 { 2332 struct ifnet *ifp = ifa->ifa_ifp; 2333 struct carp_if *cif = ifp->if_carp; 2334 struct carp_softc *sc = ifa->ifa_carp; 2335 int i, index; 2336 2337 KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); 2338 2339 sx_xlock(&carp_sx); 2340 2341 CARP_LOCK(sc); 2342 /* Shift array. */ 2343 index = sc->sc_naddrs + sc->sc_naddrs6; 2344 for (i = 0; i < index; i++) 2345 if (sc->sc_ifas[i] == ifa) 2346 break; 2347 KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); 2348 for (; i < index - 1; i++) 2349 sc->sc_ifas[i] = sc->sc_ifas[i+1]; 2350 sc->sc_ifas[index - 1] = NULL; 2351 2352 switch (ifa->ifa_addr->sa_family) { 2353 #ifdef INET 2354 case AF_INET: 2355 cif->cif_naddrs--; 2356 sc->sc_naddrs--; 2357 break; 2358 #endif 2359 #ifdef INET6 2360 case AF_INET6: 2361 cif->cif_naddrs6--; 2362 sc->sc_naddrs6--; 2363 break; 2364 #endif 2365 } 2366 2367 carp_ifa_delroute(ifa); 2368 carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); 2369 2370 ifa->ifa_carp = NULL; 2371 ifa_free(ifa); 2372 2373 if (sc->sc_version == CARP_VERSION_CARP) 2374 carp_hmac_prepare(sc); 2375 carp_sc_state(sc); 2376 2377 if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) 2378 carp_destroy(sc); 2379 else 2380 CARP_UNLOCK(sc); 2381 2382 if (!keep_cif) 2383 CIF_FREE(cif); 2384 2385 sx_xunlock(&carp_sx); 2386 } 2387 2388 static void 2389 carp_set_state(struct carp_softc *sc, int state, const char *reason) 
2390 { 2391 2392 CARP_LOCK_ASSERT(sc); 2393 2394 if (sc->sc_state != state) { 2395 const char *carp_states[] = { CARP_STATES }; 2396 char subsys[IFNAMSIZ+5]; 2397 2398 snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, 2399 if_name(sc->sc_carpdev)); 2400 2401 CARP_LOG("%s: %s -> %s (%s)\n", subsys, 2402 carp_states[sc->sc_state], carp_states[state], reason); 2403 2404 sc->sc_state = state; 2405 2406 devctl_notify("CARP", subsys, carp_states[state], NULL); 2407 } 2408 } 2409 2410 static void 2411 carp_linkstate(struct ifnet *ifp) 2412 { 2413 struct carp_softc *sc; 2414 2415 CIF_LOCK(ifp->if_carp); 2416 IFNET_FOREACH_CARP(ifp, sc) { 2417 CARP_LOCK(sc); 2418 carp_sc_state(sc); 2419 CARP_UNLOCK(sc); 2420 } 2421 CIF_UNLOCK(ifp->if_carp); 2422 } 2423 2424 static void 2425 carp_sc_state(struct carp_softc *sc) 2426 { 2427 2428 CARP_LOCK_ASSERT(sc); 2429 2430 if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || 2431 !(sc->sc_carpdev->if_flags & IFF_UP) || 2432 !V_carp_allow) { 2433 callout_stop(&sc->sc_ad_tmo); 2434 #ifdef INET 2435 callout_stop(&sc->sc_md_tmo); 2436 #endif 2437 #ifdef INET6 2438 callout_stop(&sc->sc_md6_tmo); 2439 #endif 2440 carp_set_state(sc, INIT, "hardware interface down"); 2441 carp_setrun(sc, 0); 2442 carp_delroute(sc); 2443 if (!sc->sc_suppress) 2444 carp_demote_adj(V_carp_ifdown_adj, "interface down"); 2445 sc->sc_suppress = 1; 2446 } else { 2447 carp_set_state(sc, INIT, "hardware interface up"); 2448 carp_setrun(sc, 0); 2449 if (sc->sc_suppress) 2450 carp_demote_adj(-V_carp_ifdown_adj, "interface up"); 2451 sc->sc_suppress = 0; 2452 } 2453 } 2454 2455 static void 2456 carp_demote_adj(int adj, char *reason) 2457 { 2458 atomic_add_int(&V_carp_demotion, adj); 2459 CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); 2460 taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); 2461 } 2462 2463 static int 2464 carp_allow_sysctl(SYSCTL_HANDLER_ARGS) 2465 { 2466 int new, error; 2467 struct carp_softc *sc; 2468 2469 new = V_carp_allow; 
2470 error = sysctl_handle_int(oidp, &new, 0, req); 2471 if (error || !req->newptr) 2472 return (error); 2473 2474 if (V_carp_allow != new) { 2475 V_carp_allow = new; 2476 2477 mtx_lock(&carp_mtx); 2478 LIST_FOREACH(sc, &carp_list, sc_next) { 2479 CARP_LOCK(sc); 2480 if (curvnet == sc->sc_carpdev->if_vnet) 2481 carp_sc_state(sc); 2482 CARP_UNLOCK(sc); 2483 } 2484 mtx_unlock(&carp_mtx); 2485 } 2486 2487 return (0); 2488 } 2489 2490 static int 2491 carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) 2492 { 2493 int new, error; 2494 2495 new = V_carp_dscp; 2496 error = sysctl_handle_int(oidp, &new, 0, req); 2497 if (error || !req->newptr) 2498 return (error); 2499 2500 if (new < 0 || new > 63) 2501 return (EINVAL); 2502 2503 V_carp_dscp = new; 2504 2505 return (0); 2506 } 2507 2508 static int 2509 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) 2510 { 2511 int new, error; 2512 2513 new = V_carp_demotion; 2514 error = sysctl_handle_int(oidp, &new, 0, req); 2515 if (error || !req->newptr) 2516 return (error); 2517 2518 carp_demote_adj(new, "sysctl"); 2519 2520 return (0); 2521 } 2522 2523 static int 2524 nlattr_get_carp_key(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 2525 { 2526 if (__predict_false(NLA_DATA_LEN(nla) > CARP_KEY_LEN)) 2527 return (EINVAL); 2528 2529 memcpy(target, NLA_DATA_CONST(nla), NLA_DATA_LEN(nla)); 2530 return (0); 2531 } 2532 2533 struct nl_carp_parsed { 2534 unsigned int ifindex; 2535 char *ifname; 2536 uint32_t state; 2537 uint32_t vhid; 2538 int32_t advbase; 2539 int32_t advskew; 2540 char key[CARP_KEY_LEN]; 2541 struct in_addr addr; 2542 struct in6_addr addr6; 2543 carp_version_t version; 2544 uint8_t vrrp_prio; 2545 uint16_t vrrp_adv_inter; 2546 }; 2547 2548 #define _OUT(_field) offsetof(struct nl_carp_parsed, _field) 2549 static const struct nlattr_parser nla_p_set[] = { 2550 { .type = CARP_NL_VHID, .off = _OUT(vhid), .cb = nlattr_get_uint32 }, 2551 { .type = CARP_NL_STATE, .off = _OUT(state), .cb = nlattr_get_uint32 }, 2552 { 
.type = CARP_NL_ADVBASE, .off = _OUT(advbase), .cb = nlattr_get_uint32 }, 2553 { .type = CARP_NL_ADVSKEW, .off = _OUT(advskew), .cb = nlattr_get_uint32 }, 2554 { .type = CARP_NL_KEY, .off = _OUT(key), .cb = nlattr_get_carp_key }, 2555 { .type = CARP_NL_IFINDEX, .off = _OUT(ifindex), .cb = nlattr_get_uint32 }, 2556 { .type = CARP_NL_ADDR, .off = _OUT(addr), .cb = nlattr_get_in_addr }, 2557 { .type = CARP_NL_ADDR6, .off = _OUT(addr6), .cb = nlattr_get_in6_addr }, 2558 { .type = CARP_NL_IFNAME, .off = _OUT(ifname), .cb = nlattr_get_string }, 2559 { .type = CARP_NL_VERSION, .off = _OUT(version), .cb = nlattr_get_uint8 }, 2560 { .type = CARP_NL_VRRP_PRIORITY, .off = _OUT(vrrp_prio), .cb = nlattr_get_uint8 }, 2561 { .type = CARP_NL_VRRP_ADV_INTER, .off = _OUT(vrrp_adv_inter), .cb = nlattr_get_uint16 }, 2562 }; 2563 NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_empty, nla_p_set); 2564 #undef _OUT 2565 2566 2567 static int 2568 carp_nl_get(struct nlmsghdr *hdr, struct nl_pstate *npt) 2569 { 2570 struct nl_carp_parsed attrs = { }; 2571 struct epoch_tracker et; 2572 struct nl_writer *nw = npt->nw; 2573 struct carp_softc *sc; 2574 if_t ifp = NULL; 2575 int error; 2576 bool privileged; 2577 2578 error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); 2579 if (error != 0) 2580 return (error); 2581 2582 if (attrs.vhid < 0 || attrs.vhid > CARP_MAXVHID) 2583 return (EINVAL); 2584 2585 NET_EPOCH_ENTER(et); 2586 if (attrs.ifname != NULL) 2587 ifp = ifunit_ref(attrs.ifname); 2588 else if (attrs.ifindex != 0) 2589 ifp = ifnet_byindex_ref(attrs.ifindex); 2590 NET_EPOCH_EXIT(et); 2591 2592 if ((error = carp_is_supported_if(ifp)) != 0) 2593 goto out; 2594 2595 if (ifp->if_carp == NULL) { 2596 error = ENOENT; 2597 goto out; 2598 } 2599 2600 hdr->nlmsg_flags |= NLM_F_MULTI; 2601 privileged = (priv_check_cred(nlp_get_cred(npt->nlp), 2602 PRIV_NETINET_CARP) == 0); 2603 2604 sx_xlock(&carp_sx); 2605 IFNET_FOREACH_CARP(ifp, sc) { 2606 struct genlmsghdr *ghdr_new; 2607 2608 if 
(attrs.vhid != 0 && attrs.vhid != sc->sc_vhid) 2609 continue; 2610 2611 if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { 2612 nlmsg_abort(nw); 2613 error = ENOMEM; 2614 break; 2615 } 2616 2617 ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); 2618 if (ghdr_new == NULL) { 2619 nlmsg_abort(nw); 2620 error = ENOMEM; 2621 break; 2622 } 2623 2624 ghdr_new->cmd = CARP_NL_CMD_GET; 2625 ghdr_new->version = 0; 2626 ghdr_new->reserved = 0; 2627 2628 CARP_LOCK(sc); 2629 nlattr_add_u32(nw, CARP_NL_VHID, sc->sc_vhid); 2630 nlattr_add_u32(nw, CARP_NL_STATE, sc->sc_state); 2631 nlattr_add_u8(nw, CARP_NL_VERSION, sc->sc_version); 2632 switch (sc->sc_version) { 2633 case CARP_VERSION_CARP: 2634 nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase); 2635 nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew); 2636 nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr); 2637 nlattr_add_in6_addr(nw, CARP_NL_ADDR6, 2638 &sc->sc_carpaddr6); 2639 if (privileged) 2640 nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key), 2641 sc->sc_key); 2642 break; 2643 case CARP_VERSION_VRRPv3: 2644 nlattr_add_u8(nw, CARP_NL_VRRP_PRIORITY, 2645 sc->sc_vrrp_prio); 2646 nlattr_add_u16(nw, CARP_NL_VRRP_ADV_INTER, 2647 sc->sc_vrrp_adv_inter); 2648 break; 2649 } 2650 CARP_UNLOCK(sc); 2651 2652 if (! nlmsg_end(nw)) { 2653 nlmsg_abort(nw); 2654 error = ENOMEM; 2655 break; 2656 } 2657 } 2658 sx_xunlock(&carp_sx); 2659 2660 if (! 
nlmsg_end_dump(nw, error, hdr)) 2661 error = ENOMEM; 2662 2663 out: 2664 if (ifp != NULL) 2665 if_rele(ifp); 2666 2667 return (error); 2668 } 2669 2670 static int 2671 carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) 2672 { 2673 struct nl_carp_parsed attrs = { }; 2674 struct epoch_tracker et; 2675 struct carp_softc *sc; 2676 if_t ifp = NULL; 2677 int error; 2678 2679 error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); 2680 if (error != 0) 2681 return (error); 2682 2683 if (attrs.vhid <= 0 || attrs.vhid > CARP_MAXVHID) 2684 return (EINVAL); 2685 if (attrs.state > CARP_MAXSTATE) 2686 return (EINVAL); 2687 if (attrs.version == 0) /* compat with pre-VRRPv3 */ 2688 attrs.version = CARP_VERSION_CARP; 2689 switch (attrs.version) { 2690 case CARP_VERSION_CARP: 2691 if (attrs.advbase < 0 || attrs.advskew < 0) 2692 return (EINVAL); 2693 if (attrs.advbase > 255) 2694 return (EINVAL); 2695 if (attrs.advskew >= 255) 2696 return (EINVAL); 2697 break; 2698 case CARP_VERSION_VRRPv3: 2699 if (attrs.vrrp_adv_inter > VRRP_MAX_INTERVAL) 2700 return (EINVAL); 2701 break; 2702 default: 2703 return (EINVAL); 2704 } 2705 2706 NET_EPOCH_ENTER(et); 2707 if (attrs.ifname != NULL) 2708 ifp = ifunit_ref(attrs.ifname); 2709 else if (attrs.ifindex != 0) 2710 ifp = ifnet_byindex_ref(attrs.ifindex); 2711 NET_EPOCH_EXIT(et); 2712 2713 if ((error = carp_is_supported_if(ifp)) != 0) 2714 goto out; 2715 2716 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 2717 error = EADDRNOTAVAIL; 2718 goto out; 2719 } 2720 2721 sx_xlock(&carp_sx); 2722 if (ifp->if_carp) { 2723 IFNET_FOREACH_CARP(ifp, sc) 2724 if (sc->sc_vhid == attrs.vhid) 2725 break; 2726 } else 2727 sc = NULL; 2728 if (sc == NULL) 2729 sc = carp_alloc(ifp, attrs.version, attrs.vhid); 2730 else if (sc->sc_version != attrs.version) { 2731 sx_xunlock(&carp_sx); 2732 error = EINVAL; 2733 goto out; 2734 } 2735 2736 CARP_LOCK(sc); 2737 switch (sc->sc_version) { 2738 case CARP_VERSION_CARP: 2739 if (attrs.advbase != 0) 2740 sc->sc_advbase = 
attrs.advbase; 2741 sc->sc_advskew = attrs.advskew; 2742 if (attrs.addr.s_addr != INADDR_ANY) 2743 sc->sc_carpaddr = attrs.addr; 2744 if (!IN6_IS_ADDR_UNSPECIFIED(&attrs.addr6)) { 2745 memcpy(&sc->sc_carpaddr6, &attrs.addr6, 2746 sizeof(sc->sc_carpaddr6)); 2747 } 2748 if (attrs.key[0] != '\0') { 2749 bcopy(attrs.key, sc->sc_key, sizeof(sc->sc_key)); 2750 carp_hmac_prepare(sc); 2751 } 2752 break; 2753 case CARP_VERSION_VRRPv3: 2754 if (attrs.vrrp_prio != 0) 2755 sc->sc_vrrp_prio = attrs.vrrp_prio; 2756 if (attrs.vrrp_adv_inter) 2757 sc->sc_vrrp_adv_inter = attrs.vrrp_adv_inter; 2758 break; 2759 } 2760 2761 if (sc->sc_state != INIT && sc->sc_state != attrs.state) { 2762 switch (attrs.state) { 2763 case BACKUP: 2764 callout_stop(&sc->sc_ad_tmo); 2765 carp_set_state(sc, BACKUP, 2766 "user requested via ifconfig"); 2767 carp_setrun(sc, 0); 2768 carp_delroute(sc); 2769 break; 2770 case MASTER: 2771 NET_EPOCH_ENTER(et); 2772 carp_master_down_locked(sc, 2773 "user requested via ifconfig"); 2774 NET_EPOCH_EXIT(et); 2775 break; 2776 default: 2777 break; 2778 } 2779 } 2780 CARP_UNLOCK(sc); 2781 sx_xunlock(&carp_sx); 2782 2783 out: 2784 if (ifp != NULL) 2785 if_rele(ifp); 2786 2787 return (error); 2788 } 2789 2790 static const struct nlhdr_parser *all_parsers[] = { 2791 &carp_parser 2792 }; 2793 2794 static const struct genl_cmd carp_cmds[] = { 2795 { 2796 .cmd_num = CARP_NL_CMD_GET, 2797 .cmd_name = "SIOCGVH", 2798 .cmd_cb = carp_nl_get, 2799 .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | 2800 GENL_CMD_CAP_HASPOL, 2801 }, 2802 { 2803 .cmd_num = CARP_NL_CMD_SET, 2804 .cmd_name = "SIOCSVH", 2805 .cmd_cb = carp_nl_set, 2806 .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, 2807 .cmd_priv = PRIV_NETINET_CARP, 2808 }, 2809 }; 2810 2811 static uint16_t carp_family_id; 2812 static void 2813 carp_nl_register(void) 2814 { 2815 bool ret __diagused; 2816 2817 NL_VERIFY_PARSERS(all_parsers); 2818 carp_family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, 2819 
CARP_NL_CMD_MAX); 2820 MPASS(carp_family_id != 0); 2821 2822 ret = genl_register_cmds(carp_family_id, carp_cmds, nitems(carp_cmds)); 2823 MPASS(ret); 2824 } 2825 2826 static void 2827 carp_nl_unregister(void) 2828 { 2829 genl_unregister_family(carp_family_id); 2830 } 2831 2832 static void 2833 carp_mod_cleanup(void) 2834 { 2835 2836 carp_nl_unregister(); 2837 2838 #ifdef INET 2839 (void)ipproto_unregister(IPPROTO_CARP); 2840 carp_iamatch_p = NULL; 2841 #endif 2842 #ifdef INET6 2843 (void)ip6proto_unregister(IPPROTO_CARP); 2844 carp_iamatch6_p = NULL; 2845 carp_macmatch6_p = NULL; 2846 #endif 2847 carp_attach_p = NULL; 2848 carp_detach_p = NULL; 2849 carp_get_vhid_p = NULL; 2850 carp_linkstate_p = NULL; 2851 carp_forus_p = NULL; 2852 carp_output_p = NULL; 2853 carp_demote_adj_p = NULL; 2854 carp_master_p = NULL; 2855 mtx_unlock(&carp_mtx); 2856 taskqueue_drain(taskqueue_swi, &carp_sendall_task); 2857 mtx_destroy(&carp_mtx); 2858 sx_destroy(&carp_sx); 2859 } 2860 2861 static void 2862 ipcarp_sysinit(void) 2863 { 2864 2865 /* Load allow as tunable so to postpone carp start after module load */ 2866 TUNABLE_INT_FETCH("net.inet.carp.allow", &V_carp_allow); 2867 } 2868 VNET_SYSINIT(ip_carp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipcarp_sysinit, NULL); 2869 2870 static int 2871 carp_mod_load(void) 2872 { 2873 int err; 2874 2875 mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); 2876 sx_init(&carp_sx, "carp_sx"); 2877 carp_get_vhid_p = carp_get_vhid; 2878 carp_forus_p = carp_forus; 2879 carp_output_p = carp_output; 2880 carp_linkstate_p = carp_linkstate; 2881 carp_attach_p = carp_attach; 2882 carp_detach_p = carp_detach; 2883 carp_demote_adj_p = carp_demote_adj; 2884 carp_master_p = carp_master; 2885 #ifdef INET6 2886 carp_iamatch6_p = carp_iamatch6; 2887 carp_macmatch6_p = carp_macmatch6; 2888 err = ip6proto_register(IPPROTO_CARP, carp6_input, NULL); 2889 if (err) { 2890 printf("carp: error %d registering with INET6\n", err); 2891 carp_mod_cleanup(); 2892 return (err); 2893 } 
2894 #endif 2895 #ifdef INET 2896 carp_iamatch_p = carp_iamatch; 2897 err = ipproto_register(IPPROTO_CARP, carp_input, NULL); 2898 if (err) { 2899 printf("carp: error %d registering with INET\n", err); 2900 carp_mod_cleanup(); 2901 return (err); 2902 } 2903 #endif 2904 2905 carp_nl_register(); 2906 2907 return (0); 2908 } 2909 2910 static int 2911 carp_modevent(module_t mod, int type, void *data) 2912 { 2913 switch (type) { 2914 case MOD_LOAD: 2915 return carp_mod_load(); 2916 /* NOTREACHED */ 2917 case MOD_UNLOAD: 2918 mtx_lock(&carp_mtx); 2919 if (LIST_EMPTY(&carp_list)) 2920 carp_mod_cleanup(); 2921 else { 2922 mtx_unlock(&carp_mtx); 2923 return (EBUSY); 2924 } 2925 break; 2926 2927 default: 2928 return (EINVAL); 2929 } 2930 2931 return (0); 2932 } 2933 2934 static moduledata_t carp_mod = { 2935 "carp", 2936 carp_modevent, 2937 0 2938 }; 2939 2940 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 2941