1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/malloc.h> 37 #include <sys/mbuf.h> 38 #include <sys/socket.h> 39 #include <sys/sysctl.h> 40 #include <sys/syslog.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/rmlock.h> 44 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/if_dl.h> 48 #include <net/vnet.h> 49 #include <net/route.h> 50 #include <net/route/route_ctl.h> 51 #include <net/route/route_var.h> 52 #include <net/route/nhop_utils.h> 53 #include <net/route/nhop.h> 54 #include <net/route/nhop_var.h> 55 #include <netinet/in.h> 56 57 #ifdef RADIX_MPATH 58 #include <net/radix_mpath.h> 59 #endif 60 61 #include <vm/uma.h> 62 63 /* 64 * This file contains control plane routing tables functions. 65 * 66 * All functions assumes they are called in net epoch. 67 */ 68 69 struct rib_subscription { 70 CK_STAILQ_ENTRY(rib_subscription) next; 71 rib_subscription_cb_t *func; 72 void *arg; 73 enum rib_subscription_type type; 74 struct epoch_context epoch_ctx; 75 }; 76 77 static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, 78 struct rib_cmd_info *rc); 79 static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt, 80 struct rt_addrinfo *info, struct route_nhop_data *rnd, 81 struct rib_cmd_info *rc); 82 static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, 83 struct rib_cmd_info *rc); 84 static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, 85 struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); 86 87 static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, 88 struct rib_cmd_info *rc); 89 90 static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, 91 struct rib_cmd_info *rc); 92 93 static void destroy_subscription_epoch(epoch_context_t ctx); 94 #ifdef ROUTE_MPATH 95 static bool rib_can_multipath(struct rib_head *rh); 96 #endif 97 98 /* Per-vnet multipath routing configuration */ 99 SYSCTL_DECL(_net_route); 100 #define V_rib_route_multipath VNET(rib_route_multipath) 101 #ifdef ROUTE_MPATH 102 #define _MP_FLAGS CTLFLAG_RW 103 #else 104 #define _MP_FLAGS CTLFLAG_RD 105 #endif 106 VNET_DEFINE(u_int, rib_route_multipath) = 0; 107 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, 108 &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); 109 #undef _MP_FLAGS 110 111 /* Routing table UMA zone */ 112 VNET_DEFINE_STATIC(uma_zone_t, rtzone); 113 #define V_rtzone VNET(rtzone) 114 115 void 116 vnet_rtzone_init() 117 { 118 119 V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), 120 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 121 } 122 123 #ifdef VIMAGE 124 void 125 vnet_rtzone_destroy() 126 { 127 128 uma_zdestroy(V_rtzone); 129 } 130 #endif 131 132 static void 133 destroy_rtentry(struct rtentry *rt) 134 { 135 136 /* 137 * At this moment rnh, nh_control may be already freed. 138 * nhop interface may have been migrated to a different vnet. 139 * Use vnet stored in the nexthop to delete the entry. 140 */ 141 CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); 142 143 /* Unreference nexthop */ 144 nhop_free_any(rt->rt_nhop); 145 146 uma_zfree(V_rtzone, rt); 147 148 CURVNET_RESTORE(); 149 } 150 151 /* 152 * Epoch callback indicating rtentry is safe to destroy 153 */ 154 static void 155 destroy_rtentry_epoch(epoch_context_t ctx) 156 { 157 struct rtentry *rt; 158 159 rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); 160 161 destroy_rtentry(rt); 162 } 163 164 /* 165 * Schedule rtentry deletion 166 */ 167 static void 168 rtfree(struct rtentry *rt) 169 { 170 171 KASSERT(rt != NULL, ("%s: NULL rt", __func__)); 172 173 epoch_call(net_epoch_preempt, destroy_rtentry_epoch, 174 &rt->rt_epoch_ctx); 175 } 176 177 static struct rib_head * 178 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) 179 { 180 struct rib_head *rnh; 181 struct sockaddr *dst; 182 183 KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum")); 184 185 dst = info->rti_info[RTAX_DST]; 186 rnh = rt_tables_get_rnh(fibnum, dst->sa_family); 187 188 return (rnh); 189 } 190 191 #ifdef ROUTE_MPATH 192 static bool 193 rib_can_multipath(struct rib_head *rh) 194 { 195 int result; 196 197 CURVNET_SET(rh->rib_vnet); 198 result = !!V_rib_route_multipath; 199 CURVNET_RESTORE(); 200 201 return (result); 202 } 203 204 /* 205 * Check is nhop is multipath-eligible. 206 * Avoid nhops without gateways and redirects. 207 * 208 * Returns 1 for multipath-eligible nexthop, 209 * 0 otherwise. 210 */ 211 bool 212 nhop_can_multipath(const struct nhop_object *nh) 213 { 214 215 if ((nh->nh_flags & NHF_MULTIPATH) != 0) 216 return (1); 217 if ((nh->nh_flags & NHF_GATEWAY) == 0) 218 return (0); 219 if ((nh->nh_flags & NHF_REDIRECT) != 0) 220 return (0); 221 222 return (1); 223 } 224 #endif 225 226 static int 227 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) 228 { 229 uint32_t weight; 230 231 if (info->rti_mflags & RTV_WEIGHT) 232 weight = info->rti_rmx->rmx_weight; 233 else 234 weight = default_weight; 235 /* Keep upper 1 byte for adm distance purposes */ 236 if (weight > RT_MAX_WEIGHT) 237 weight = RT_MAX_WEIGHT; 238 239 return (weight); 240 } 241 242 static void 243 rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) 244 { 245 246 /* Kernel -> userland timebase conversion. */ 247 if (info->rti_mflags & RTV_EXPIRE) 248 rt->rt_expire = info->rti_rmx->rmx_expire ? 249 info->rti_rmx->rmx_expire - time_second + time_uptime : 0; 250 } 251 252 /* 253 * Check if specified @gw matches gw data in the nexthop @nh. 254 * 255 * Returns true if matches, false otherwise. 256 */ 257 bool 258 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) 259 { 260 261 if (nh->gw_sa.sa_family != gw->sa_family) 262 return (false); 263 264 switch (gw->sa_family) { 265 case AF_INET: 266 return (nh->gw4_sa.sin_addr.s_addr == 267 ((const struct sockaddr_in *)gw)->sin_addr.s_addr); 268 case AF_INET6: 269 { 270 const struct sockaddr_in6 *gw6; 271 gw6 = (const struct sockaddr_in6 *)gw; 272 273 /* 274 * Currently (2020-09) IPv6 gws in kernel have their 275 * scope embedded. Once this becomes false, this code 276 * has to be revisited. 277 */ 278 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, 279 &gw6->sin6_addr)) 280 return (true); 281 return (false); 282 } 283 case AF_LINK: 284 { 285 const struct sockaddr_dl *sdl; 286 sdl = (const struct sockaddr_dl *)gw; 287 return (nh->gwl_sa.sdl_index == sdl->sdl_index); 288 } 289 default: 290 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); 291 } 292 293 /* NOTREACHED */ 294 return (false); 295 } 296 297 /* 298 * Checks if data in @info matches nexhop @nh. 299 * 300 * Returns 0 on success, 301 * ESRCH if not matched, 302 * ENOENT if filter function returned false 303 */ 304 int 305 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, 306 const struct nhop_object *nh) 307 { 308 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; 309 310 if (info->rti_filter != NULL) { 311 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) 312 return (ENOENT); 313 else 314 return (0); 315 } 316 if ((gw != NULL) && !match_nhop_gw(nh, gw)) 317 return (ESRCH); 318 319 return (0); 320 } 321 322 /* 323 * Checks if nexhop @nh can be rewritten by data in @info because 324 * of higher "priority". Currently the only case for such scenario 325 * is kernel installing interface routes, marked by RTF_PINNED flag. 326 * 327 * Returns: 328 * 1 if @info data has higher priority 329 * 0 if priority is the same 330 * -1 if priority is lower 331 */ 332 int 333 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh) 334 { 335 336 if (info->rti_flags & RTF_PINNED) { 337 return (NH_IS_PINNED(nh)) ? 0 : 1; 338 } else { 339 return (NH_IS_PINNED(nh)) ? -1 : 0; 340 } 341 } 342 343 /* 344 * Runs exact prefix match based on @dst and @netmask. 345 * Returns matched @rtentry if found or NULL. 346 * If rtentry was found, saves nexthop / weight value into @rnd. 347 */ 348 static struct rtentry * 349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, 350 const struct sockaddr *netmask, struct route_nhop_data *rnd) 351 { 352 struct rtentry *rt; 353 354 RIB_LOCK_ASSERT(rnh); 355 356 rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst), 357 __DECONST(void *, netmask), &rnh->head); 358 if (rt != NULL) { 359 rnd->rnd_nhop = rt->rt_nhop; 360 rnd->rnd_weight = rt->rt_weight; 361 } else { 362 rnd->rnd_nhop = NULL; 363 rnd->rnd_weight = 0; 364 } 365 366 return (rt); 367 } 368 369 /* 370 * Runs exact prefix match based on dst/netmask from @info. 371 * Assumes RIB lock is held. 372 * Returns matched @rtentry if found or NULL. 373 * If rtentry was found, saves nexthop / weight value into @rnd. 374 */ 375 struct rtentry * 376 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, 377 struct route_nhop_data *rnd) 378 { 379 struct rtentry *rt; 380 381 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST], 382 info->rti_info[RTAX_NETMASK], rnd); 383 384 return (rt); 385 } 386 387 /* 388 * Adds route defined by @info into the kernel table specified by @fibnum and 389 * sa_family in @info->rti_info[RTAX_DST]. 390 * 391 * Returns 0 on success and fills in operation metadata into @rc. 392 */ 393 int 394 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, 395 struct rib_cmd_info *rc) 396 { 397 struct rib_head *rnh; 398 int error; 399 400 NET_EPOCH_ASSERT(); 401 402 rnh = get_rnh(fibnum, info); 403 if (rnh == NULL) 404 return (EAFNOSUPPORT); 405 406 /* 407 * Check consistency between RTF_HOST flag and netmask 408 * existence. 409 */ 410 if (info->rti_flags & RTF_HOST) 411 info->rti_info[RTAX_NETMASK] = NULL; 412 else if (info->rti_info[RTAX_NETMASK] == NULL) 413 return (EINVAL); 414 415 bzero(rc, sizeof(struct rib_cmd_info)); 416 rc->rc_cmd = RTM_ADD; 417 418 error = add_route(rnh, info, rc); 419 if (error == 0) 420 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 421 422 return (error); 423 } 424 425 /* 426 * Creates rtentry and nexthop based on @info data. 427 * Return 0 and fills in rtentry into @prt on success, 428 * return errno otherwise. 429 */ 430 static int 431 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, 432 struct rtentry **prt) 433 { 434 struct sockaddr *dst, *ndst, *gateway, *netmask; 435 struct rtentry *rt; 436 struct nhop_object *nh; 437 struct ifaddr *ifa; 438 int error, flags; 439 440 dst = info->rti_info[RTAX_DST]; 441 gateway = info->rti_info[RTAX_GATEWAY]; 442 netmask = info->rti_info[RTAX_NETMASK]; 443 flags = info->rti_flags; 444 445 if ((flags & RTF_GATEWAY) && !gateway) 446 return (EINVAL); 447 if (dst && gateway && (dst->sa_family != gateway->sa_family) && 448 (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) 449 return (EINVAL); 450 451 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) 452 return (EINVAL); 453 454 if (info->rti_ifa == NULL) { 455 error = rt_getifa_fib(info, rnh->rib_fibnum); 456 if (error) 457 return (error); 458 } else { 459 ifa_ref(info->rti_ifa); 460 } 461 462 error = nhop_create_from_info(rnh, info, &nh); 463 if (error != 0) { 464 ifa_free(info->rti_ifa); 465 return (error); 466 } 467 468 rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); 469 if (rt == NULL) { 470 ifa_free(info->rti_ifa); 471 nhop_free(nh); 472 return (ENOBUFS); 473 } 474 rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; 475 rt->rt_nhop = nh; 476 477 /* Fill in dst */ 478 memcpy(&rt->rt_dst, dst, dst->sa_len); 479 rt_key(rt) = &rt->rt_dst; 480 481 /* 482 * point to the (possibly newly malloc'd) dest address. 483 */ 484 ndst = (struct sockaddr *)rt_key(rt); 485 486 /* 487 * make sure it contains the value we want (masked if needed). 488 */ 489 if (netmask) { 490 rt_maskedcopy(dst, ndst, netmask); 491 } else 492 bcopy(dst, ndst, dst->sa_len); 493 494 /* 495 * We use the ifa reference returned by rt_getifa_fib(). 496 * This moved from below so that rnh->rnh_addaddr() can 497 * examine the ifa and ifa->ifa_ifp if it so desires. 498 */ 499 ifa = info->rti_ifa; 500 rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); 501 rt_set_expire_info(rt, info); 502 503 *prt = rt; 504 return (0); 505 } 506 507 static int 508 add_route(struct rib_head *rnh, struct rt_addrinfo *info, 509 struct rib_cmd_info *rc) 510 { 511 struct nhop_object *nh_orig; 512 struct route_nhop_data rnd_orig, rnd_add; 513 struct nhop_object *nh; 514 struct rtentry *rt, *rt_orig; 515 int error; 516 517 error = create_rtentry(rnh, info, &rt); 518 if (error != 0) 519 return (error); 520 521 rnd_add.rnd_nhop = rt->rt_nhop; 522 rnd_add.rnd_weight = rt->rt_weight; 523 nh = rt->rt_nhop; 524 525 RIB_WLOCK(rnh); 526 error = add_route_nhop(rnh, rt, info, &rnd_add, rc); 527 if (error == 0) { 528 RIB_WUNLOCK(rnh); 529 return (0); 530 } 531 532 /* addition failed. Lookup prefix in the rib to determine the cause */ 533 rt_orig = lookup_prefix(rnh, info, &rnd_orig); 534 if (rt_orig == NULL) { 535 /* No prefix -> rnh_addaddr() failed to allocate memory */ 536 RIB_WUNLOCK(rnh); 537 nhop_free(nh); 538 uma_zfree(V_rtzone, rt); 539 return (ENOMEM); 540 } 541 542 /* We have existing route in the RIB. */ 543 nh_orig = rnd_orig.rnd_nhop; 544 /* Check if new route has higher preference */ 545 if (can_override_nhop(info, nh_orig) > 0) { 546 /* Update nexthop to the new route */ 547 change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); 548 RIB_WUNLOCK(rnh); 549 uma_zfree(V_rtzone, rt); 550 nhop_free(nh_orig); 551 return (0); 552 } 553 554 RIB_WUNLOCK(rnh); 555 556 #ifdef ROUTE_MPATH 557 if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && 558 nhop_can_multipath(rnd_orig.rnd_nhop)) 559 error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); 560 else 561 #endif 562 /* Unable to add - another route with the same preference exists */ 563 error = EEXIST; 564 565 /* 566 * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. 567 * ROUTE_MPATH enabled: original nhop reference is unused in any case, 568 * free rt only if not _adding_ new route to rib (e.g. the case 569 * when initial lookup returned existing route, but then it got 570 * deleted prior to multipath group insertion, leading to a simple 571 * non-multipath add as a result). 572 */ 573 nhop_free(nh); 574 if ((error != 0) || rc->rc_cmd != RTM_ADD) 575 uma_zfree(V_rtzone, rt); 576 577 return (error); 578 } 579 580 /* 581 * Removes route defined by @info from the kernel table specified by @fibnum and 582 * sa_family in @info->rti_info[RTAX_DST]. 583 * 584 * Returns 0 on success and fills in operation metadata into @rc. 585 */ 586 int 587 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) 588 { 589 struct rib_head *rnh; 590 struct sockaddr *dst_orig, *netmask; 591 struct sockaddr_storage mdst; 592 int error; 593 594 NET_EPOCH_ASSERT(); 595 596 rnh = get_rnh(fibnum, info); 597 if (rnh == NULL) 598 return (EAFNOSUPPORT); 599 600 bzero(rc, sizeof(struct rib_cmd_info)); 601 rc->rc_cmd = RTM_DELETE; 602 603 dst_orig = info->rti_info[RTAX_DST]; 604 netmask = info->rti_info[RTAX_NETMASK]; 605 606 if (netmask != NULL) { 607 /* Ensure @dst is always properly masked */ 608 if (dst_orig->sa_len > sizeof(mdst)) 609 return (EINVAL); 610 rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask); 611 info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst; 612 } 613 error = del_route(rnh, info, rc); 614 info->rti_info[RTAX_DST] = dst_orig; 615 616 return (error); 617 } 618 619 /* 620 * Conditionally unlinks rtentry matching data inside @info from @rnh. 621 * Returns 0 on success with operation result stored in @rc. 622 * On error, returns: 623 * ESRCH - if prefix was not found, 624 * EADDRINUSE - if trying to delete higher priority route. 625 * ENOENT - if supplied filter function returned 0 (not matched). 626 */ 627 static int 628 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) 629 { 630 struct rtentry *rt; 631 struct nhop_object *nh; 632 struct radix_node *rn; 633 struct route_nhop_data rnd; 634 int error; 635 636 rt = lookup_prefix(rnh, info, &rnd); 637 if (rt == NULL) 638 return (ESRCH); 639 640 nh = rt->rt_nhop; 641 #ifdef ROUTE_MPATH 642 if (NH_IS_NHGRP(nh)) { 643 error = del_route_mpath(rnh, info, rt, 644 (struct nhgrp_object *)nh, rc); 645 return (error); 646 } 647 #endif 648 error = check_info_match_nhop(info, rt, nh); 649 if (error != 0) 650 return (error); 651 652 if (can_override_nhop(info, nh) < 0) 653 return (EADDRINUSE); 654 655 /* 656 * Remove the item from the tree and return it. 657 * Complain if it is not there and do no more processing. 658 */ 659 rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], 660 info->rti_info[RTAX_NETMASK], &rnh->head); 661 if (rn == NULL) 662 return (ESRCH); 663 664 if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) 665 panic ("rtrequest delete"); 666 667 rt = RNTORT(rn); 668 rt->rte_flags &= ~RTF_UP; 669 670 /* Finalize notification */ 671 rnh->rnh_gen++; 672 rc->rc_cmd = RTM_DELETE; 673 rc->rc_rt = rt; 674 rc->rc_nh_old = rt->rt_nhop; 675 rc->rc_nh_weight = rt->rt_weight; 676 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 677 678 return (0); 679 } 680 681 static int 682 del_route(struct rib_head *rnh, struct rt_addrinfo *info, 683 struct rib_cmd_info *rc) 684 { 685 int error; 686 687 RIB_WLOCK(rnh); 688 error = rt_unlinkrte(rnh, info, rc); 689 RIB_WUNLOCK(rnh); 690 if (error != 0) 691 return (error); 692 693 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 694 695 /* 696 * If the caller wants it, then it can have it, 697 * the entry will be deleted after the end of the current epoch. 698 */ 699 if (rc->rc_cmd == RTM_DELETE) 700 rtfree(rc->rc_rt); 701 #ifdef ROUTE_MPATH 702 else { 703 /* 704 * Deleting 1 path may result in RTM_CHANGE to 705 * a different mpath group/nhop. 706 * Free old mpath group. 707 */ 708 nhop_free_any(rc->rc_nh_old); 709 } 710 #endif 711 712 return (0); 713 } 714 715 int 716 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, 717 struct rib_cmd_info *rc) 718 { 719 RIB_RLOCK_TRACKER; 720 struct route_nhop_data rnd_orig; 721 struct rib_head *rnh; 722 struct rtentry *rt; 723 int error; 724 725 NET_EPOCH_ASSERT(); 726 727 rnh = get_rnh(fibnum, info); 728 if (rnh == NULL) 729 return (EAFNOSUPPORT); 730 731 bzero(rc, sizeof(struct rib_cmd_info)); 732 rc->rc_cmd = RTM_CHANGE; 733 734 /* Check if updated gateway exists */ 735 if ((info->rti_flags & RTF_GATEWAY) && 736 (info->rti_info[RTAX_GATEWAY] == NULL)) 737 return (EINVAL); 738 739 /* 740 * route change is done in multiple steps, with dropping and 741 * reacquiring lock. In the situations with multiple processes 742 * changes the same route in can lead to the case when route 743 * is changed between the steps. Address it by retrying the operation 744 * multiple times before failing. 745 */ 746 747 RIB_RLOCK(rnh); 748 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], 749 info->rti_info[RTAX_NETMASK], &rnh->head); 750 751 if (rt == NULL) { 752 RIB_RUNLOCK(rnh); 753 return (ESRCH); 754 } 755 756 rnd_orig.rnd_nhop = rt->rt_nhop; 757 rnd_orig.rnd_weight = rt->rt_weight; 758 759 RIB_RUNLOCK(rnh); 760 761 for (int i = 0; i < RIB_MAX_RETRIES; i++) { 762 error = change_route(rnh, info, &rnd_orig, rc); 763 if (error != EAGAIN) 764 break; 765 } 766 767 return (error); 768 } 769 770 static int 771 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, 772 struct nhop_object *nh_orig, struct nhop_object **nh_new) 773 { 774 int free_ifa = 0; 775 int error; 776 777 /* 778 * New gateway could require new ifaddr, ifp; 779 * flags may also be different; ifp may be specified 780 * by ll sockaddr when protocol address is ambiguous 781 */ 782 if (((nh_orig->nh_flags & NHF_GATEWAY) && 783 info->rti_info[RTAX_GATEWAY] != NULL) || 784 info->rti_info[RTAX_IFP] != NULL || 785 (info->rti_info[RTAX_IFA] != NULL && 786 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) { 787 error = rt_getifa_fib(info, rnh->rib_fibnum); 788 if (info->rti_ifa != NULL) 789 free_ifa = 1; 790 791 if (error != 0) { 792 if (free_ifa) { 793 ifa_free(info->rti_ifa); 794 info->rti_ifa = NULL; 795 } 796 797 return (error); 798 } 799 } 800 801 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); 802 if (free_ifa) { 803 ifa_free(info->rti_ifa); 804 info->rti_ifa = NULL; 805 } 806 807 return (error); 808 } 809 810 #ifdef ROUTE_MPATH 811 static int 812 change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, 813 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 814 { 815 int error = 0; 816 struct nhop_object *nh, *nh_orig, *nh_new; 817 struct route_nhop_data rnd_new; 818 819 nh = NULL; 820 nh_orig = rnd_orig->rnd_nhop; 821 822 struct weightened_nhop *wn = NULL, *wn_new; 823 uint32_t num_nhops; 824 825 wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); 826 nh_orig = NULL; 827 for (int i = 0; i < num_nhops; i++) { 828 if (check_info_match_nhop(info, NULL, wn[i].nh)) { 829 nh_orig = wn[i].nh; 830 break; 831 } 832 } 833 834 if (nh_orig == NULL) 835 return (ESRCH); 836 837 error = change_nhop(rnh, info, nh_orig, &nh_new); 838 if (error != 0) 839 return (error); 840 841 wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), 842 M_TEMP, M_NOWAIT | M_ZERO); 843 if (wn_new == NULL) { 844 nhop_free(nh_new); 845 return (EAGAIN); 846 } 847 848 memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); 849 for (int i = 0; i < num_nhops; i++) { 850 if (wn[i].nh == nh_orig) { 851 wn[i].nh = nh_new; 852 wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); 853 break; 854 } 855 } 856 857 error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); 858 nhop_free(nh_new); 859 free(wn_new, M_TEMP); 860 861 if (error != 0) 862 return (error); 863 864 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 865 866 return (error); 867 } 868 #endif 869 870 static int 871 change_route(struct rib_head *rnh, struct rt_addrinfo *info, 872 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 873 { 874 int error = 0; 875 struct nhop_object *nh, *nh_orig; 876 struct route_nhop_data rnd_new; 877 878 nh = NULL; 879 nh_orig = rnd_orig->rnd_nhop; 880 if (nh_orig == NULL) 881 return (ESRCH); 882 883 #ifdef ROUTE_MPATH 884 if (NH_IS_NHGRP(nh_orig)) 885 return (change_mpath_route(rnh, info, rnd_orig, rc)); 886 #endif 887 888 rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); 889 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); 890 if (error != 0) 891 return (error); 892 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 893 894 return (error); 895 } 896 897 /* 898 * Insert @rt with nhop data from @rnd_new to @rnh. 899 * Returns 0 on success and stores operation results in @rc. 900 */ 901 static int 902 add_route_nhop(struct rib_head *rnh, struct rtentry *rt, 903 struct rt_addrinfo *info, struct route_nhop_data *rnd, 904 struct rib_cmd_info *rc) 905 { 906 struct sockaddr *ndst, *netmask; 907 struct radix_node *rn; 908 int error = 0; 909 910 RIB_WLOCK_ASSERT(rnh); 911 912 ndst = (struct sockaddr *)rt_key(rt); 913 netmask = info->rti_info[RTAX_NETMASK]; 914 915 rt->rt_nhop = rnd->rnd_nhop; 916 rt->rt_weight = rnd->rnd_weight; 917 rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); 918 919 if (rn != NULL) { 920 if (rt->rt_expire > 0) 921 tmproutes_update(rnh, rt); 922 923 /* Finalize notification */ 924 rnh->rnh_gen++; 925 926 rc->rc_cmd = RTM_ADD; 927 rc->rc_rt = rt; 928 rc->rc_nh_old = NULL; 929 rc->rc_nh_new = rnd->rnd_nhop; 930 rc->rc_nh_weight = rnd->rnd_weight; 931 932 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 933 } else { 934 /* Existing route or memory allocation failure */ 935 error = EEXIST; 936 } 937 938 return (error); 939 } 940 941 /* 942 * Switch @rt nhop/weigh to the ones specified in @rnd. 943 * Conditionally set rt_expire if set in @info. 944 * Returns 0 on success. 945 */ 946 int 947 change_route_nhop(struct rib_head *rnh, struct rtentry *rt, 948 struct rt_addrinfo *info, struct route_nhop_data *rnd, 949 struct rib_cmd_info *rc) 950 { 951 struct nhop_object *nh_orig; 952 953 RIB_WLOCK_ASSERT(rnh); 954 955 nh_orig = rt->rt_nhop; 956 957 if (rnd->rnd_nhop != NULL) { 958 /* Changing expiration & nexthop & weight to a new one */ 959 rt_set_expire_info(rt, info); 960 rt->rt_nhop = rnd->rnd_nhop; 961 rt->rt_weight = rnd->rnd_weight; 962 if (rt->rt_expire > 0) 963 tmproutes_update(rnh, rt); 964 } else { 965 /* Route deletion requested. */ 966 struct sockaddr *ndst, *netmask; 967 struct radix_node *rn; 968 969 ndst = (struct sockaddr *)rt_key(rt); 970 netmask = info->rti_info[RTAX_NETMASK]; 971 rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); 972 if (rn == NULL) 973 return (ESRCH); 974 rt = RNTORT(rn); 975 rt->rte_flags &= ~RTF_UP; 976 } 977 978 /* Finalize notification */ 979 rnh->rnh_gen++; 980 981 rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE; 982 rc->rc_rt = rt; 983 rc->rc_nh_old = nh_orig; 984 rc->rc_nh_new = rnd->rnd_nhop; 985 rc->rc_nh_weight = rnd->rnd_weight; 986 987 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 988 989 return (0); 990 } 991 992 /* 993 * Conditionally update route nhop/weight IFF data in @nhd_orig is 994 * consistent with the current route data. 995 * Nexthop in @nhd_new is consumed. 996 */ 997 int 998 change_route_conditional(struct rib_head *rnh, struct rtentry *rt, 999 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, 1000 struct route_nhop_data *rnd_new, struct rib_cmd_info *rc) 1001 { 1002 struct rtentry *rt_new; 1003 int error = 0; 1004 1005 RIB_WLOCK(rnh); 1006 1007 rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], 1008 info->rti_info[RTAX_NETMASK], &rnh->head); 1009 1010 if (rt_new == NULL) { 1011 if (rnd_orig->rnd_nhop == NULL) 1012 error = add_route_nhop(rnh, rt, info, rnd_new, rc); 1013 else { 1014 /* 1015 * Prefix does not exist, which was not our assumption. 1016 * Update @rnd_orig with the new data and return 1017 */ 1018 rnd_orig->rnd_nhop = NULL; 1019 rnd_orig->rnd_weight = 0; 1020 error = EAGAIN; 1021 } 1022 } else { 1023 /* Prefix exists, try to update */ 1024 if (rnd_orig->rnd_nhop == rt_new->rt_nhop) { 1025 /* 1026 * Nhop/mpath group hasn't changed. Flip 1027 * to the new precalculated one and return 1028 */ 1029 error = change_route_nhop(rnh, rt_new, info, rnd_new, rc); 1030 } else { 1031 /* Update and retry */ 1032 rnd_orig->rnd_nhop = rt_new->rt_nhop; 1033 rnd_orig->rnd_weight = rt_new->rt_weight; 1034 error = EAGAIN; 1035 } 1036 } 1037 1038 RIB_WUNLOCK(rnh); 1039 1040 if (error == 0) { 1041 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 1042 1043 if (rnd_orig->rnd_nhop != NULL) 1044 nhop_free_any(rnd_orig->rnd_nhop); 1045 1046 } else { 1047 if (rnd_new->rnd_nhop != NULL) 1048 nhop_free_any(rnd_new->rnd_nhop); 1049 } 1050 1051 return (error); 1052 } 1053 1054 /* 1055 * Performs modification of routing table specificed by @action. 1056 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST]. 1057 * Needs to be run in network epoch. 1058 * 1059 * Returns 0 on success and fills in @rc with action result. 1060 */ 1061 int 1062 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, 1063 struct rib_cmd_info *rc) 1064 { 1065 int error; 1066 1067 switch (action) { 1068 case RTM_ADD: 1069 error = rib_add_route(fibnum, info, rc); 1070 break; 1071 case RTM_DELETE: 1072 error = rib_del_route(fibnum, info, rc); 1073 break; 1074 case RTM_CHANGE: 1075 error = rib_change_route(fibnum, info, rc); 1076 break; 1077 default: 1078 error = ENOTSUP; 1079 } 1080 1081 return (error); 1082 } 1083 1084 struct rt_delinfo 1085 { 1086 struct rt_addrinfo info; 1087 struct rib_head *rnh; 1088 struct rtentry *head; 1089 struct rib_cmd_info rc; 1090 }; 1091 1092 /* 1093 * Conditionally unlinks @rn from radix tree based 1094 * on info data passed in @arg. 1095 */ 1096 static int 1097 rt_checkdelroute(struct radix_node *rn, void *arg) 1098 { 1099 struct rt_delinfo *di; 1100 struct rt_addrinfo *info; 1101 struct rtentry *rt; 1102 int error; 1103 1104 di = (struct rt_delinfo *)arg; 1105 rt = (struct rtentry *)rn; 1106 info = &di->info; 1107 1108 info->rti_info[RTAX_DST] = rt_key(rt); 1109 info->rti_info[RTAX_NETMASK] = rt_mask(rt); 1110 1111 error = rt_unlinkrte(di->rnh, info, &di->rc); 1112 1113 /* 1114 * Add deleted rtentries to the list to GC them 1115 * after dropping the lock. 1116 * 1117 * XXX: Delayed notifications not implemented 1118 * for nexthop updates. 1119 */ 1120 if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { 1121 /* Add to the list and return */ 1122 rt->rt_chain = di->head; 1123 di->head = rt; 1124 } 1125 1126 return (0); 1127 } 1128 1129 /* 1130 * Iterates over a routing table specified by @fibnum and @family and 1131 * deletes elements marked by @filter_f. 1132 * @fibnum: rtable id 1133 * @family: AF_ address family 1134 * @filter_f: function returning non-zero value for items to delete 1135 * @arg: data to pass to the @filter_f function 1136 * @report: true if rtsock notification is needed. 1137 */ 1138 void 1139 rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report) 1140 { 1141 struct rib_head *rnh; 1142 struct rt_delinfo di; 1143 struct rtentry *rt; 1144 struct nhop_object *nh; 1145 struct epoch_tracker et; 1146 1147 rnh = rt_tables_get_rnh(fibnum, family); 1148 if (rnh == NULL) 1149 return; 1150 1151 bzero(&di, sizeof(di)); 1152 di.info.rti_filter = filter_f; 1153 di.info.rti_filterdata = arg; 1154 di.rnh = rnh; 1155 di.rc.rc_cmd = RTM_DELETE; 1156 1157 NET_EPOCH_ENTER(et); 1158 1159 RIB_WLOCK(rnh); 1160 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); 1161 RIB_WUNLOCK(rnh); 1162 1163 /* We might have something to reclaim. */ 1164 bzero(&di.rc, sizeof(di.rc)); 1165 di.rc.rc_cmd = RTM_DELETE; 1166 while (di.head != NULL) { 1167 rt = di.head; 1168 di.head = rt->rt_chain; 1169 rt->rt_chain = NULL; 1170 nh = rt->rt_nhop; 1171 1172 di.rc.rc_rt = rt; 1173 di.rc.rc_nh_old = nh; 1174 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); 1175 1176 /* TODO std rt -> rt_addrinfo export */ 1177 di.info.rti_info[RTAX_DST] = rt_key(rt); 1178 di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); 1179 1180 if (report) { 1181 #ifdef ROUTE_MPATH 1182 struct nhgrp_object *nhg; 1183 struct weightened_nhop *wn; 1184 uint32_t num_nhops; 1185 if (NH_IS_NHGRP(nh)) { 1186 nhg = (struct nhgrp_object *)nh; 1187 wn = nhgrp_get_nhops(nhg, &num_nhops); 1188 for (int i = 0; i < num_nhops; i++) 1189 rt_routemsg(RTM_DELETE, rt, 1190 wn[i].nh->nh_ifp, 0, fibnum); 1191 } else 1192 #endif 1193 rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum); 1194 } 1195 rtfree(rt); 1196 } 1197 1198 NET_EPOCH_EXIT(et); 1199 } 1200 1201 static void 1202 rib_notify(struct rib_head *rnh, enum rib_subscription_type type, 1203 struct rib_cmd_info *rc) 1204 { 1205 struct rib_subscription *rs; 1206 1207 CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { 1208 if (rs->type == type) 1209 rs->func(rnh, rc, rs->arg); 1210 } 1211 } 1212 1213 static struct rib_subscription * 1214 allocate_subscription(rib_subscription_cb_t *f, void *arg, 1215 enum rib_subscription_type type, bool waitok) 1216 { 1217 struct rib_subscription *rs; 1218 int flags = M_ZERO | (waitok ? M_WAITOK : 0); 1219 1220 rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); 1221 if (rs == NULL) 1222 return (NULL); 1223 1224 rs->func = f; 1225 rs->arg = arg; 1226 rs->type = type; 1227 1228 return (rs); 1229 } 1230 1231 /* 1232 * Subscribe for the changes in the routing table specified by @fibnum and 1233 * @family. 1234 * 1235 * Returns pointer to the subscription structure on success. 1236 */ 1237 struct rib_subscription * 1238 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, 1239 enum rib_subscription_type type, bool waitok) 1240 { 1241 struct rib_head *rnh; 1242 struct rib_subscription *rs; 1243 struct epoch_tracker et; 1244 1245 if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) 1246 return (NULL); 1247 1248 NET_EPOCH_ENTER(et); 1249 KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); 1250 rnh = rt_tables_get_rnh(fibnum, family); 1251 1252 RIB_WLOCK(rnh); 1253 CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); 1254 RIB_WUNLOCK(rnh); 1255 NET_EPOCH_EXIT(et); 1256 1257 return (rs); 1258 } 1259 1260 struct rib_subscription * 1261 rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, 1262 enum rib_subscription_type type, bool waitok) 1263 { 1264 struct rib_subscription *rs; 1265 struct epoch_tracker et; 1266 1267 if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) 1268 return (NULL); 1269 1270 NET_EPOCH_ENTER(et); 1271 RIB_WLOCK(rnh); 1272 CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); 1273 RIB_WUNLOCK(rnh); 1274 NET_EPOCH_EXIT(et); 1275 1276 return (rs); 1277 } 1278 1279 /* 1280 * Remove rtable subscription @rs from the table specified by @fibnum 1281 * and @family. 1282 * Needs to be run in network epoch. 1283 * 1284 * Returns 0 on success. 1285 */ 1286 int 1287 rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs) 1288 { 1289 struct rib_head *rnh; 1290 1291 NET_EPOCH_ASSERT(); 1292 KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); 1293 rnh = rt_tables_get_rnh(fibnum, family); 1294 1295 if (rnh == NULL) 1296 return (ENOENT); 1297 1298 RIB_WLOCK(rnh); 1299 CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); 1300 RIB_WUNLOCK(rnh); 1301 1302 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1303 &rs->epoch_ctx); 1304 1305 return (0); 1306 } 1307 1308 /* 1309 * Epoch callback indicating subscription is safe to destroy 1310 */ 1311 static void 1312 destroy_subscription_epoch(epoch_context_t ctx) 1313 { 1314 struct rib_subscription *rs; 1315 1316 rs = __containerof(ctx, struct rib_subscription, epoch_ctx); 1317 1318 free(rs, M_RTABLE); 1319 } 1320 1321 void 1322 rib_init_subscriptions(struct rib_head *rnh) 1323 { 1324 1325 CK_STAILQ_INIT(&rnh->rnh_subscribers); 1326 } 1327 1328 void 1329 rib_destroy_subscriptions(struct rib_head *rnh) 1330 { 1331 struct rib_subscription *rs; 1332 struct epoch_tracker et; 1333 1334 NET_EPOCH_ENTER(et); 1335 RIB_WLOCK(rnh); 1336 while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) { 1337 CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next); 1338 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1339 &rs->epoch_ctx); 1340 } 1341 RIB_WUNLOCK(rnh); 1342 NET_EPOCH_EXIT(et); 1343 } 1344