1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/malloc.h> 37 #include <sys/mbuf.h> 38 #include <sys/socket.h> 39 #include <sys/sysctl.h> 40 #include <sys/syslog.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/rmlock.h> 44 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/if_dl.h> 48 #include <net/vnet.h> 49 #include <net/route.h> 50 #include <net/route/route_ctl.h> 51 #include <net/route/route_var.h> 52 #include <net/route/nhop_utils.h> 53 #include <net/route/nhop.h> 54 #include <net/route/nhop_var.h> 55 #include <netinet/in.h> 56 57 #ifdef RADIX_MPATH 58 #include <net/radix_mpath.h> 59 #endif 60 61 #include <vm/uma.h> 62 63 /* 64 * This file contains control plane routing tables functions. 65 * 66 * All functions assumes they are called in net epoch. 67 */ 68 69 struct rib_subscription { 70 CK_STAILQ_ENTRY(rib_subscription) next; 71 rib_subscription_cb_t *func; 72 void *arg; 73 enum rib_subscription_type type; 74 struct epoch_context epoch_ctx; 75 }; 76 77 static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, 78 struct rib_cmd_info *rc); 79 static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt, 80 struct rt_addrinfo *info, struct route_nhop_data *rnd, 81 struct rib_cmd_info *rc); 82 static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, 83 struct rib_cmd_info *rc); 84 static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, 85 struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); 86 87 static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, 88 struct rib_cmd_info *rc); 89 90 static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, 91 struct rib_cmd_info *rc); 92 93 static void destroy_subscription_epoch(epoch_context_t ctx); 94 #ifdef ROUTE_MPATH 95 static bool rib_can_multipath(struct rib_head *rh); 96 #endif 97 98 /* Per-vnet multipath routing configuration */ 99 SYSCTL_DECL(_net_route); 100 #define V_rib_route_multipath VNET(rib_route_multipath) 101 #ifdef ROUTE_MPATH 102 #define _MP_FLAGS CTLFLAG_RW 103 #else 104 #define _MP_FLAGS CTLFLAG_RD 105 #endif 106 VNET_DEFINE(u_int, rib_route_multipath) = 0; 107 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, 108 &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); 109 #undef _MP_FLAGS 110 111 /* Routing table UMA zone */ 112 VNET_DEFINE_STATIC(uma_zone_t, rtzone); 113 #define V_rtzone VNET(rtzone) 114 115 void 116 vnet_rtzone_init() 117 { 118 119 V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), 120 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 121 } 122 123 #ifdef VIMAGE 124 void 125 vnet_rtzone_destroy() 126 { 127 128 uma_zdestroy(V_rtzone); 129 } 130 #endif 131 132 static void 133 destroy_rtentry(struct rtentry *rt) 134 { 135 136 /* 137 * At this moment rnh, nh_control may be already freed. 138 * nhop interface may have been migrated to a different vnet. 139 * Use vnet stored in the nexthop to delete the entry. 140 */ 141 CURVNET_SET(nhop_get_vnet(rt->rt_nhop)); 142 143 /* Unreference nexthop */ 144 nhop_free_any(rt->rt_nhop); 145 146 uma_zfree(V_rtzone, rt); 147 148 CURVNET_RESTORE(); 149 } 150 151 /* 152 * Epoch callback indicating rtentry is safe to destroy 153 */ 154 static void 155 destroy_rtentry_epoch(epoch_context_t ctx) 156 { 157 struct rtentry *rt; 158 159 rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); 160 161 destroy_rtentry(rt); 162 } 163 164 /* 165 * Schedule rtentry deletion 166 */ 167 static void 168 rtfree(struct rtentry *rt) 169 { 170 171 KASSERT(rt != NULL, ("%s: NULL rt", __func__)); 172 173 epoch_call(net_epoch_preempt, destroy_rtentry_epoch, 174 &rt->rt_epoch_ctx); 175 } 176 177 static struct rib_head * 178 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) 179 { 180 struct rib_head *rnh; 181 struct sockaddr *dst; 182 183 KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum")); 184 185 dst = info->rti_info[RTAX_DST]; 186 rnh = rt_tables_get_rnh(fibnum, dst->sa_family); 187 188 return (rnh); 189 } 190 191 #ifdef ROUTE_MPATH 192 static bool 193 rib_can_multipath(struct rib_head *rh) 194 { 195 int result; 196 197 CURVNET_SET(rh->rib_vnet); 198 result = !!V_rib_route_multipath; 199 CURVNET_RESTORE(); 200 201 return (result); 202 } 203 204 /* 205 * Check is nhop is multipath-eligible. 206 * Avoid nhops without gateways and redirects. 207 * 208 * Returns 1 for multipath-eligible nexthop, 209 * 0 otherwise. 210 */ 211 bool 212 nhop_can_multipath(const struct nhop_object *nh) 213 { 214 215 if ((nh->nh_flags & NHF_MULTIPATH) != 0) 216 return (1); 217 if ((nh->nh_flags & NHF_GATEWAY) == 0) 218 return (0); 219 if ((nh->nh_flags & NHF_REDIRECT) != 0) 220 return (0); 221 222 return (1); 223 } 224 #endif 225 226 static int 227 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) 228 { 229 uint32_t weight; 230 231 if (info->rti_mflags & RTV_WEIGHT) 232 weight = info->rti_rmx->rmx_weight; 233 else 234 weight = default_weight; 235 /* Keep upper 1 byte for adm distance purposes */ 236 if (weight > RT_MAX_WEIGHT) 237 weight = RT_MAX_WEIGHT; 238 239 return (weight); 240 } 241 242 static void 243 rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) 244 { 245 246 /* Kernel -> userland timebase conversion. */ 247 if (info->rti_mflags & RTV_EXPIRE) 248 rt->rt_expire = info->rti_rmx->rmx_expire ? 249 info->rti_rmx->rmx_expire - time_second + time_uptime : 0; 250 } 251 252 /* 253 * Check if specified @gw matches gw data in the nexthop @nh. 254 * 255 * Returns true if matches, false otherwise. 256 */ 257 bool 258 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) 259 { 260 261 if (nh->gw_sa.sa_family != gw->sa_family) 262 return (false); 263 264 switch (gw->sa_family) { 265 case AF_INET: 266 return (nh->gw4_sa.sin_addr.s_addr == 267 ((const struct sockaddr_in *)gw)->sin_addr.s_addr); 268 case AF_INET6: 269 { 270 const struct sockaddr_in6 *gw6; 271 gw6 = (const struct sockaddr_in6 *)gw; 272 273 /* 274 * Currently (2020-09) IPv6 gws in kernel have their 275 * scope embedded. Once this becomes false, this code 276 * has to be revisited. 277 */ 278 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, 279 &gw6->sin6_addr)) 280 return (true); 281 return (false); 282 } 283 case AF_LINK: 284 { 285 const struct sockaddr_dl *sdl; 286 sdl = (const struct sockaddr_dl *)gw; 287 return (nh->gwl_sa.sdl_index == sdl->sdl_index); 288 } 289 default: 290 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); 291 } 292 293 /* NOTREACHED */ 294 return (false); 295 } 296 297 /* 298 * Checks if data in @info matches nexhop @nh. 299 * 300 * Returns 0 on success, 301 * ESRCH if not matched, 302 * ENOENT if filter function returned false 303 */ 304 int 305 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, 306 const struct nhop_object *nh) 307 { 308 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; 309 310 if (info->rti_filter != NULL) { 311 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) 312 return (ENOENT); 313 else 314 return (0); 315 } 316 if ((gw != NULL) && !match_nhop_gw(nh, gw)) 317 return (ESRCH); 318 319 return (0); 320 } 321 322 /* 323 * Checks if nexhop @nh can be rewritten by data in @info because 324 * of higher "priority". Currently the only case for such scenario 325 * is kernel installing interface routes, marked by RTF_PINNED flag. 326 * 327 * Returns: 328 * 1 if @info data has higher priority 329 * 0 if priority is the same 330 * -1 if priority is lower 331 */ 332 int 333 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh) 334 { 335 336 if (info->rti_flags & RTF_PINNED) { 337 return (NH_IS_PINNED(nh)) ? 0 : 1; 338 } else { 339 return (NH_IS_PINNED(nh)) ? -1 : 0; 340 } 341 } 342 343 /* 344 * Runs exact prefix match based on @dst and @netmask. 345 * Returns matched @rtentry if found or NULL. 346 * If rtentry was found, saves nexthop / weight value into @rnd. 347 */ 348 static struct rtentry * 349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, 350 const struct sockaddr *netmask, struct route_nhop_data *rnd) 351 { 352 struct rtentry *rt; 353 354 RIB_LOCK_ASSERT(rnh); 355 356 rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst), 357 __DECONST(void *, netmask), &rnh->head); 358 if (rt != NULL) { 359 rnd->rnd_nhop = rt->rt_nhop; 360 rnd->rnd_weight = rt->rt_weight; 361 } else { 362 rnd->rnd_nhop = NULL; 363 rnd->rnd_weight = 0; 364 } 365 366 return (rt); 367 } 368 369 /* 370 * Runs exact prefix match based on dst/netmask from @info. 371 * Assumes RIB lock is held. 372 * Returns matched @rtentry if found or NULL. 373 * If rtentry was found, saves nexthop / weight value into @rnd. 374 */ 375 struct rtentry * 376 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, 377 struct route_nhop_data *rnd) 378 { 379 struct rtentry *rt; 380 381 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST], 382 info->rti_info[RTAX_NETMASK], rnd); 383 384 return (rt); 385 } 386 387 /* 388 * Adds route defined by @info into the kernel table specified by @fibnum and 389 * sa_family in @info->rti_info[RTAX_DST]. 390 * 391 * Returns 0 on success and fills in operation metadata into @rc. 392 */ 393 int 394 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, 395 struct rib_cmd_info *rc) 396 { 397 struct rib_head *rnh; 398 int error; 399 400 NET_EPOCH_ASSERT(); 401 402 rnh = get_rnh(fibnum, info); 403 if (rnh == NULL) 404 return (EAFNOSUPPORT); 405 406 /* 407 * Check consistency between RTF_HOST flag and netmask 408 * existence. 409 */ 410 if (info->rti_flags & RTF_HOST) 411 info->rti_info[RTAX_NETMASK] = NULL; 412 else if (info->rti_info[RTAX_NETMASK] == NULL) 413 return (EINVAL); 414 415 bzero(rc, sizeof(struct rib_cmd_info)); 416 rc->rc_cmd = RTM_ADD; 417 418 error = add_route(rnh, info, rc); 419 if (error == 0) 420 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 421 422 return (error); 423 } 424 425 /* 426 * Creates rtentry and nexthop based on @info data. 427 * Return 0 and fills in rtentry into @prt on success, 428 * return errno otherwise. 429 */ 430 static int 431 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, 432 struct rtentry **prt) 433 { 434 struct sockaddr *dst, *ndst, *gateway, *netmask; 435 struct rtentry *rt; 436 struct nhop_object *nh; 437 struct ifaddr *ifa; 438 int error, flags; 439 440 dst = info->rti_info[RTAX_DST]; 441 gateway = info->rti_info[RTAX_GATEWAY]; 442 netmask = info->rti_info[RTAX_NETMASK]; 443 flags = info->rti_flags; 444 445 if ((flags & RTF_GATEWAY) && !gateway) 446 return (EINVAL); 447 if (dst && gateway && (dst->sa_family != gateway->sa_family) && 448 (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) 449 return (EINVAL); 450 451 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) 452 return (EINVAL); 453 454 if (info->rti_ifa == NULL) { 455 error = rt_getifa_fib(info, rnh->rib_fibnum); 456 if (error) 457 return (error); 458 } else { 459 ifa_ref(info->rti_ifa); 460 } 461 462 error = nhop_create_from_info(rnh, info, &nh); 463 if (error != 0) { 464 ifa_free(info->rti_ifa); 465 return (error); 466 } 467 468 rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); 469 if (rt == NULL) { 470 ifa_free(info->rti_ifa); 471 nhop_free(nh); 472 return (ENOBUFS); 473 } 474 rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; 475 rt->rt_nhop = nh; 476 477 /* Fill in dst */ 478 memcpy(&rt->rt_dst, dst, dst->sa_len); 479 rt_key(rt) = &rt->rt_dst; 480 481 /* 482 * point to the (possibly newly malloc'd) dest address. 483 */ 484 ndst = (struct sockaddr *)rt_key(rt); 485 486 /* 487 * make sure it contains the value we want (masked if needed). 488 */ 489 if (netmask) { 490 rt_maskedcopy(dst, ndst, netmask); 491 } else 492 bcopy(dst, ndst, dst->sa_len); 493 494 /* 495 * We use the ifa reference returned by rt_getifa_fib(). 496 * This moved from below so that rnh->rnh_addaddr() can 497 * examine the ifa and ifa->ifa_ifp if it so desires. 498 */ 499 ifa = info->rti_ifa; 500 rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); 501 rt_set_expire_info(rt, info); 502 503 *prt = rt; 504 return (0); 505 } 506 507 static int 508 add_route(struct rib_head *rnh, struct rt_addrinfo *info, 509 struct rib_cmd_info *rc) 510 { 511 struct nhop_object *nh_orig; 512 struct route_nhop_data rnd_orig, rnd_add; 513 struct nhop_object *nh; 514 struct rtentry *rt, *rt_orig; 515 int error; 516 517 error = create_rtentry(rnh, info, &rt); 518 if (error != 0) 519 return (error); 520 521 rnd_add.rnd_nhop = rt->rt_nhop; 522 rnd_add.rnd_weight = rt->rt_weight; 523 nh = rt->rt_nhop; 524 525 RIB_WLOCK(rnh); 526 error = add_route_nhop(rnh, rt, info, &rnd_add, rc); 527 if (error == 0) { 528 RIB_WUNLOCK(rnh); 529 return (0); 530 } 531 532 /* addition failed. Lookup prefix in the rib to determine the cause */ 533 rt_orig = lookup_prefix(rnh, info, &rnd_orig); 534 if (rt_orig == NULL) { 535 /* No prefix -> rnh_addaddr() failed to allocate memory */ 536 RIB_WUNLOCK(rnh); 537 nhop_free(nh); 538 uma_zfree(V_rtzone, rt); 539 return (ENOMEM); 540 } 541 542 /* We have existing route in the RIB. */ 543 nh_orig = rnd_orig.rnd_nhop; 544 /* Check if new route has higher preference */ 545 if (can_override_nhop(info, nh_orig) > 0) { 546 /* Update nexthop to the new route */ 547 change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); 548 RIB_WUNLOCK(rnh); 549 uma_zfree(V_rtzone, rt); 550 nhop_free(nh_orig); 551 return (0); 552 } 553 554 RIB_WUNLOCK(rnh); 555 556 #ifdef ROUTE_MPATH 557 if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && 558 nhop_can_multipath(rnd_orig.rnd_nhop)) 559 error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); 560 else 561 #endif 562 /* Unable to add - another route with the same preference exists */ 563 error = EEXIST; 564 565 /* 566 * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. 567 * ROUTE_MPATH enabled: original nhop reference is unused in any case, 568 * free rt only if not _adding_ new route to rib (e.g. the case 569 * when initial lookup returned existing route, but then it got 570 * deleted prior to multipath group insertion, leading to a simple 571 * non-multipath add as a result). 572 */ 573 nhop_free(nh); 574 if ((error != 0) || rc->rc_cmd != RTM_ADD) 575 uma_zfree(V_rtzone, rt); 576 577 return (error); 578 } 579 580 /* 581 * Removes route defined by @info from the kernel table specified by @fibnum and 582 * sa_family in @info->rti_info[RTAX_DST]. 583 * 584 * Returns 0 on success and fills in operation metadata into @rc. 585 */ 586 int 587 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) 588 { 589 struct rib_head *rnh; 590 struct sockaddr *dst_orig, *netmask; 591 struct sockaddr_storage mdst; 592 int error; 593 594 NET_EPOCH_ASSERT(); 595 596 rnh = get_rnh(fibnum, info); 597 if (rnh == NULL) 598 return (EAFNOSUPPORT); 599 600 bzero(rc, sizeof(struct rib_cmd_info)); 601 rc->rc_cmd = RTM_DELETE; 602 603 dst_orig = info->rti_info[RTAX_DST]; 604 netmask = info->rti_info[RTAX_NETMASK]; 605 606 if (netmask != NULL) { 607 /* Ensure @dst is always properly masked */ 608 if (dst_orig->sa_len > sizeof(mdst)) 609 return (EINVAL); 610 rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask); 611 info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst; 612 } 613 error = del_route(rnh, info, rc); 614 info->rti_info[RTAX_DST] = dst_orig; 615 616 return (error); 617 } 618 619 /* 620 * Conditionally unlinks rtentry matching data inside @info from @rnh. 621 * Returns 0 on success with operation result stored in @rc. 622 * On error, returns: 623 * ESRCH - if prefix was not found, 624 * EADDRINUSE - if trying to delete higher priority route. 625 * ENOENT - if supplied filter function returned 0 (not matched). 626 */ 627 static int 628 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) 629 { 630 struct rtentry *rt; 631 struct nhop_object *nh; 632 struct radix_node *rn; 633 struct route_nhop_data rnd; 634 int error; 635 636 rt = lookup_prefix(rnh, info, &rnd); 637 if (rt == NULL) 638 return (ESRCH); 639 640 nh = rt->rt_nhop; 641 #ifdef ROUTE_MPATH 642 if (NH_IS_NHGRP(nh)) { 643 error = del_route_mpath(rnh, info, rt, 644 (struct nhgrp_object *)nh, rc); 645 return (error); 646 } 647 #endif 648 error = check_info_match_nhop(info, rt, nh); 649 if (error != 0) 650 return (error); 651 652 if (can_override_nhop(info, nh) < 0) 653 return (EADDRINUSE); 654 655 /* 656 * Remove the item from the tree and return it. 657 * Complain if it is not there and do no more processing. 658 */ 659 rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], 660 info->rti_info[RTAX_NETMASK], &rnh->head); 661 if (rn == NULL) 662 return (ESRCH); 663 664 if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) 665 panic ("rtrequest delete"); 666 667 rt = RNTORT(rn); 668 rt->rte_flags &= ~RTF_UP; 669 670 /* Finalize notification */ 671 rnh->rnh_gen++; 672 rc->rc_cmd = RTM_DELETE; 673 rc->rc_rt = rt; 674 rc->rc_nh_old = rt->rt_nhop; 675 rc->rc_nh_weight = rt->rt_weight; 676 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 677 678 return (0); 679 } 680 681 static int 682 del_route(struct rib_head *rnh, struct rt_addrinfo *info, 683 struct rib_cmd_info *rc) 684 { 685 int error; 686 687 RIB_WLOCK(rnh); 688 error = rt_unlinkrte(rnh, info, rc); 689 RIB_WUNLOCK(rnh); 690 if (error != 0) 691 return (error); 692 693 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 694 695 /* 696 * If the caller wants it, then it can have it, 697 * the entry will be deleted after the end of the current epoch. 698 */ 699 if (rc->rc_cmd == RTM_DELETE) 700 rtfree(rc->rc_rt); 701 #ifdef ROUTE_MPATH 702 else { 703 /* 704 * Deleting 1 path may result in RTM_CHANGE to 705 * a different mpath group/nhop. 706 * Free old mpath group. 707 */ 708 nhop_free_any(rc->rc_nh_old); 709 } 710 #endif 711 712 return (0); 713 } 714 715 int 716 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, 717 struct rib_cmd_info *rc) 718 { 719 RIB_RLOCK_TRACKER; 720 struct route_nhop_data rnd_orig; 721 struct rib_head *rnh; 722 struct rtentry *rt; 723 int error; 724 725 NET_EPOCH_ASSERT(); 726 727 rnh = get_rnh(fibnum, info); 728 if (rnh == NULL) 729 return (EAFNOSUPPORT); 730 731 bzero(rc, sizeof(struct rib_cmd_info)); 732 rc->rc_cmd = RTM_CHANGE; 733 734 /* Check if updated gateway exists */ 735 if ((info->rti_flags & RTF_GATEWAY) && 736 (info->rti_info[RTAX_GATEWAY] == NULL)) { 737 738 /* 739 * route(8) adds RTF_GATEWAY flag if -interface is not set. 740 * Remove RTF_GATEWAY to enforce consistency and maintain 741 * compatibility.. 742 */ 743 info->rti_flags &= ~RTF_GATEWAY; 744 } 745 746 /* 747 * route change is done in multiple steps, with dropping and 748 * reacquiring lock. In the situations with multiple processes 749 * changes the same route in can lead to the case when route 750 * is changed between the steps. Address it by retrying the operation 751 * multiple times before failing. 752 */ 753 754 RIB_RLOCK(rnh); 755 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], 756 info->rti_info[RTAX_NETMASK], &rnh->head); 757 758 if (rt == NULL) { 759 RIB_RUNLOCK(rnh); 760 return (ESRCH); 761 } 762 763 rnd_orig.rnd_nhop = rt->rt_nhop; 764 rnd_orig.rnd_weight = rt->rt_weight; 765 766 RIB_RUNLOCK(rnh); 767 768 for (int i = 0; i < RIB_MAX_RETRIES; i++) { 769 error = change_route(rnh, info, &rnd_orig, rc); 770 if (error != EAGAIN) 771 break; 772 } 773 774 return (error); 775 } 776 777 static int 778 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, 779 struct nhop_object *nh_orig, struct nhop_object **nh_new) 780 { 781 int free_ifa = 0; 782 int error; 783 784 /* 785 * New gateway could require new ifaddr, ifp; 786 * flags may also be different; ifp may be specified 787 * by ll sockaddr when protocol address is ambiguous 788 */ 789 if (((nh_orig->nh_flags & NHF_GATEWAY) && 790 info->rti_info[RTAX_GATEWAY] != NULL) || 791 info->rti_info[RTAX_IFP] != NULL || 792 (info->rti_info[RTAX_IFA] != NULL && 793 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) { 794 error = rt_getifa_fib(info, rnh->rib_fibnum); 795 if (info->rti_ifa != NULL) 796 free_ifa = 1; 797 798 if (error != 0) { 799 if (free_ifa) { 800 ifa_free(info->rti_ifa); 801 info->rti_ifa = NULL; 802 } 803 804 return (error); 805 } 806 } 807 808 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); 809 if (free_ifa) { 810 ifa_free(info->rti_ifa); 811 info->rti_ifa = NULL; 812 } 813 814 return (error); 815 } 816 817 #ifdef ROUTE_MPATH 818 static int 819 change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, 820 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 821 { 822 int error = 0; 823 struct nhop_object *nh, *nh_orig, *nh_new; 824 struct route_nhop_data rnd_new; 825 826 nh = NULL; 827 nh_orig = rnd_orig->rnd_nhop; 828 829 struct weightened_nhop *wn = NULL, *wn_new; 830 uint32_t num_nhops; 831 832 wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); 833 nh_orig = NULL; 834 for (int i = 0; i < num_nhops; i++) { 835 if (check_info_match_nhop(info, NULL, wn[i].nh)) { 836 nh_orig = wn[i].nh; 837 break; 838 } 839 } 840 841 if (nh_orig == NULL) 842 return (ESRCH); 843 844 error = change_nhop(rnh, info, nh_orig, &nh_new); 845 if (error != 0) 846 return (error); 847 848 wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), 849 M_TEMP, M_NOWAIT | M_ZERO); 850 if (wn_new == NULL) { 851 nhop_free(nh_new); 852 return (EAGAIN); 853 } 854 855 memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); 856 for (int i = 0; i < num_nhops; i++) { 857 if (wn[i].nh == nh_orig) { 858 wn[i].nh = nh_new; 859 wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); 860 break; 861 } 862 } 863 864 error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); 865 nhop_free(nh_new); 866 free(wn_new, M_TEMP); 867 868 if (error != 0) 869 return (error); 870 871 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 872 873 return (error); 874 } 875 #endif 876 877 static int 878 change_route(struct rib_head *rnh, struct rt_addrinfo *info, 879 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 880 { 881 int error = 0; 882 struct nhop_object *nh, *nh_orig; 883 struct route_nhop_data rnd_new; 884 885 nh = NULL; 886 nh_orig = rnd_orig->rnd_nhop; 887 if (nh_orig == NULL) 888 return (ESRCH); 889 890 #ifdef ROUTE_MPATH 891 if (NH_IS_NHGRP(nh_orig)) 892 return (change_mpath_route(rnh, info, rnd_orig, rc)); 893 #endif 894 895 rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); 896 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); 897 if (error != 0) 898 return (error); 899 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 900 901 return (error); 902 } 903 904 /* 905 * Insert @rt with nhop data from @rnd_new to @rnh. 906 * Returns 0 on success and stores operation results in @rc. 907 */ 908 static int 909 add_route_nhop(struct rib_head *rnh, struct rtentry *rt, 910 struct rt_addrinfo *info, struct route_nhop_data *rnd, 911 struct rib_cmd_info *rc) 912 { 913 struct sockaddr *ndst, *netmask; 914 struct radix_node *rn; 915 int error = 0; 916 917 RIB_WLOCK_ASSERT(rnh); 918 919 ndst = (struct sockaddr *)rt_key(rt); 920 netmask = info->rti_info[RTAX_NETMASK]; 921 922 rt->rt_nhop = rnd->rnd_nhop; 923 rt->rt_weight = rnd->rnd_weight; 924 rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); 925 926 if (rn != NULL) { 927 if (rt->rt_expire > 0) 928 tmproutes_update(rnh, rt); 929 930 /* Finalize notification */ 931 rnh->rnh_gen++; 932 933 rc->rc_cmd = RTM_ADD; 934 rc->rc_rt = rt; 935 rc->rc_nh_old = NULL; 936 rc->rc_nh_new = rnd->rnd_nhop; 937 rc->rc_nh_weight = rnd->rnd_weight; 938 939 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 940 } else { 941 /* Existing route or memory allocation failure */ 942 error = EEXIST; 943 } 944 945 return (error); 946 } 947 948 /* 949 * Switch @rt nhop/weigh to the ones specified in @rnd. 950 * Conditionally set rt_expire if set in @info. 951 * Returns 0 on success. 952 */ 953 int 954 change_route_nhop(struct rib_head *rnh, struct rtentry *rt, 955 struct rt_addrinfo *info, struct route_nhop_data *rnd, 956 struct rib_cmd_info *rc) 957 { 958 struct nhop_object *nh_orig; 959 960 RIB_WLOCK_ASSERT(rnh); 961 962 nh_orig = rt->rt_nhop; 963 964 if (rnd->rnd_nhop != NULL) { 965 /* Changing expiration & nexthop & weight to a new one */ 966 rt_set_expire_info(rt, info); 967 rt->rt_nhop = rnd->rnd_nhop; 968 rt->rt_weight = rnd->rnd_weight; 969 if (rt->rt_expire > 0) 970 tmproutes_update(rnh, rt); 971 } else { 972 /* Route deletion requested. */ 973 struct sockaddr *ndst, *netmask; 974 struct radix_node *rn; 975 976 ndst = (struct sockaddr *)rt_key(rt); 977 netmask = info->rti_info[RTAX_NETMASK]; 978 rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); 979 if (rn == NULL) 980 return (ESRCH); 981 rt = RNTORT(rn); 982 rt->rte_flags &= ~RTF_UP; 983 } 984 985 /* Finalize notification */ 986 rnh->rnh_gen++; 987 988 rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE; 989 rc->rc_rt = rt; 990 rc->rc_nh_old = nh_orig; 991 rc->rc_nh_new = rnd->rnd_nhop; 992 rc->rc_nh_weight = rnd->rnd_weight; 993 994 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 995 996 return (0); 997 } 998 999 /* 1000 * Conditionally update route nhop/weight IFF data in @nhd_orig is 1001 * consistent with the current route data. 1002 * Nexthop in @nhd_new is consumed. 1003 */ 1004 int 1005 change_route_conditional(struct rib_head *rnh, struct rtentry *rt, 1006 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, 1007 struct route_nhop_data *rnd_new, struct rib_cmd_info *rc) 1008 { 1009 struct rtentry *rt_new; 1010 int error = 0; 1011 1012 RIB_WLOCK(rnh); 1013 1014 rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], 1015 info->rti_info[RTAX_NETMASK], &rnh->head); 1016 1017 if (rt_new == NULL) { 1018 if (rnd_orig->rnd_nhop == NULL) 1019 error = add_route_nhop(rnh, rt, info, rnd_new, rc); 1020 else { 1021 /* 1022 * Prefix does not exist, which was not our assumption. 1023 * Update @rnd_orig with the new data and return 1024 */ 1025 rnd_orig->rnd_nhop = NULL; 1026 rnd_orig->rnd_weight = 0; 1027 error = EAGAIN; 1028 } 1029 } else { 1030 /* Prefix exists, try to update */ 1031 if (rnd_orig->rnd_nhop == rt_new->rt_nhop) { 1032 /* 1033 * Nhop/mpath group hasn't changed. Flip 1034 * to the new precalculated one and return 1035 */ 1036 error = change_route_nhop(rnh, rt_new, info, rnd_new, rc); 1037 } else { 1038 /* Update and retry */ 1039 rnd_orig->rnd_nhop = rt_new->rt_nhop; 1040 rnd_orig->rnd_weight = rt_new->rt_weight; 1041 error = EAGAIN; 1042 } 1043 } 1044 1045 RIB_WUNLOCK(rnh); 1046 1047 if (error == 0) { 1048 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 1049 1050 if (rnd_orig->rnd_nhop != NULL) 1051 nhop_free_any(rnd_orig->rnd_nhop); 1052 1053 } else { 1054 if (rnd_new->rnd_nhop != NULL) 1055 nhop_free_any(rnd_new->rnd_nhop); 1056 } 1057 1058 return (error); 1059 } 1060 1061 /* 1062 * Performs modification of routing table specificed by @action. 1063 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST]. 1064 * Needs to be run in network epoch. 1065 * 1066 * Returns 0 on success and fills in @rc with action result. 1067 */ 1068 int 1069 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, 1070 struct rib_cmd_info *rc) 1071 { 1072 int error; 1073 1074 switch (action) { 1075 case RTM_ADD: 1076 error = rib_add_route(fibnum, info, rc); 1077 break; 1078 case RTM_DELETE: 1079 error = rib_del_route(fibnum, info, rc); 1080 break; 1081 case RTM_CHANGE: 1082 error = rib_change_route(fibnum, info, rc); 1083 break; 1084 default: 1085 error = ENOTSUP; 1086 } 1087 1088 return (error); 1089 } 1090 1091 struct rt_delinfo 1092 { 1093 struct rt_addrinfo info; 1094 struct rib_head *rnh; 1095 struct rtentry *head; 1096 struct rib_cmd_info rc; 1097 }; 1098 1099 /* 1100 * Conditionally unlinks @rn from radix tree based 1101 * on info data passed in @arg. 1102 */ 1103 static int 1104 rt_checkdelroute(struct radix_node *rn, void *arg) 1105 { 1106 struct rt_delinfo *di; 1107 struct rt_addrinfo *info; 1108 struct rtentry *rt; 1109 int error; 1110 1111 di = (struct rt_delinfo *)arg; 1112 rt = (struct rtentry *)rn; 1113 info = &di->info; 1114 1115 info->rti_info[RTAX_DST] = rt_key(rt); 1116 info->rti_info[RTAX_NETMASK] = rt_mask(rt); 1117 1118 error = rt_unlinkrte(di->rnh, info, &di->rc); 1119 1120 /* 1121 * Add deleted rtentries to the list to GC them 1122 * after dropping the lock. 1123 * 1124 * XXX: Delayed notifications not implemented 1125 * for nexthop updates. 1126 */ 1127 if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { 1128 /* Add to the list and return */ 1129 rt->rt_chain = di->head; 1130 di->head = rt; 1131 } 1132 1133 return (0); 1134 } 1135 1136 /* 1137 * Iterates over a routing table specified by @fibnum and @family and 1138 * deletes elements marked by @filter_f. 1139 * @fibnum: rtable id 1140 * @family: AF_ address family 1141 * @filter_f: function returning non-zero value for items to delete 1142 * @arg: data to pass to the @filter_f function 1143 * @report: true if rtsock notification is needed. 1144 */ 1145 void 1146 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report) 1147 { 1148 struct rib_head *rnh; 1149 struct rt_delinfo di; 1150 struct rtentry *rt; 1151 struct nhop_object *nh; 1152 struct epoch_tracker et; 1153 1154 rnh = rt_tables_get_rnh(fibnum, family); 1155 if (rnh == NULL) 1156 return; 1157 1158 bzero(&di, sizeof(di)); 1159 di.info.rti_filter = filter_f; 1160 di.info.rti_filterdata = arg; 1161 di.rnh = rnh; 1162 di.rc.rc_cmd = RTM_DELETE; 1163 1164 NET_EPOCH_ENTER(et); 1165 1166 RIB_WLOCK(rnh); 1167 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); 1168 RIB_WUNLOCK(rnh); 1169 1170 /* We might have something to reclaim. */ 1171 bzero(&di.rc, sizeof(di.rc)); 1172 di.rc.rc_cmd = RTM_DELETE; 1173 while (di.head != NULL) { 1174 rt = di.head; 1175 di.head = rt->rt_chain; 1176 rt->rt_chain = NULL; 1177 nh = rt->rt_nhop; 1178 1179 di.rc.rc_rt = rt; 1180 di.rc.rc_nh_old = nh; 1181 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); 1182 1183 /* TODO std rt -> rt_addrinfo export */ 1184 di.info.rti_info[RTAX_DST] = rt_key(rt); 1185 di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); 1186 1187 if (report) { 1188 #ifdef ROUTE_MPATH 1189 struct nhgrp_object *nhg; 1190 struct weightened_nhop *wn; 1191 uint32_t num_nhops; 1192 if (NH_IS_NHGRP(nh)) { 1193 nhg = (struct nhgrp_object *)nh; 1194 wn = nhgrp_get_nhops(nhg, &num_nhops); 1195 for (int i = 0; i < num_nhops; i++) 1196 rt_routemsg(RTM_DELETE, rt, 1197 wn[i].nh->nh_ifp, 0, fibnum); 1198 } else 1199 #endif 1200 rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum); 1201 } 1202 rtfree(rt); 1203 } 1204 1205 NET_EPOCH_EXIT(et); 1206 } 1207 1208 static void 1209 rib_notify(struct rib_head *rnh, enum rib_subscription_type type, 1210 struct rib_cmd_info *rc) 1211 { 1212 struct rib_subscription *rs; 1213 1214 CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { 1215 if (rs->type == type) 1216 rs->func(rnh, rc, rs->arg); 1217 } 1218 } 1219 1220 static struct rib_subscription * 1221 allocate_subscription(rib_subscription_cb_t *f, void *arg, 1222 enum rib_subscription_type type, bool waitok) 1223 { 1224 struct rib_subscription *rs; 1225 int flags = M_ZERO | (waitok ? M_WAITOK : 0); 1226 1227 rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); 1228 if (rs == NULL) 1229 return (NULL); 1230 1231 rs->func = f; 1232 rs->arg = arg; 1233 rs->type = type; 1234 1235 return (rs); 1236 } 1237 1238 /* 1239 * Subscribe for the changes in the routing table specified by @fibnum and 1240 * @family. 1241 * 1242 * Returns pointer to the subscription structure on success. 1243 */ 1244 struct rib_subscription * 1245 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, 1246 enum rib_subscription_type type, bool waitok) 1247 { 1248 struct rib_head *rnh; 1249 struct rib_subscription *rs; 1250 struct epoch_tracker et; 1251 1252 if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) 1253 return (NULL); 1254 1255 NET_EPOCH_ENTER(et); 1256 KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); 1257 rnh = rt_tables_get_rnh(fibnum, family); 1258 1259 RIB_WLOCK(rnh); 1260 CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); 1261 RIB_WUNLOCK(rnh); 1262 NET_EPOCH_EXIT(et); 1263 1264 return (rs); 1265 } 1266 1267 struct rib_subscription * 1268 rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, 1269 enum rib_subscription_type type, bool waitok) 1270 { 1271 struct rib_subscription *rs; 1272 struct epoch_tracker et; 1273 1274 if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) 1275 return (NULL); 1276 1277 NET_EPOCH_ENTER(et); 1278 RIB_WLOCK(rnh); 1279 CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); 1280 RIB_WUNLOCK(rnh); 1281 NET_EPOCH_EXIT(et); 1282 1283 return (rs); 1284 } 1285 1286 /* 1287 * Remove rtable subscription @rs from the table specified by @fibnum 1288 * and @family. 1289 * Needs to be run in network epoch. 1290 * 1291 * Returns 0 on success. 1292 */ 1293 int 1294 rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs) 1295 { 1296 struct rib_head *rnh; 1297 1298 NET_EPOCH_ASSERT(); 1299 KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); 1300 rnh = rt_tables_get_rnh(fibnum, family); 1301 1302 if (rnh == NULL) 1303 return (ENOENT); 1304 1305 RIB_WLOCK(rnh); 1306 CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); 1307 RIB_WUNLOCK(rnh); 1308 1309 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1310 &rs->epoch_ctx); 1311 1312 return (0); 1313 } 1314 1315 /* 1316 * Epoch callback indicating subscription is safe to destroy 1317 */ 1318 static void 1319 destroy_subscription_epoch(epoch_context_t ctx) 1320 { 1321 struct rib_subscription *rs; 1322 1323 rs = __containerof(ctx, struct rib_subscription, epoch_ctx); 1324 1325 free(rs, M_RTABLE); 1326 } 1327 1328 void 1329 rib_init_subscriptions(struct rib_head *rnh) 1330 { 1331 1332 CK_STAILQ_INIT(&rnh->rnh_subscribers); 1333 } 1334 1335 void 1336 rib_destroy_subscriptions(struct rib_head *rnh) 1337 { 1338 struct rib_subscription *rs; 1339 struct epoch_tracker et; 1340 1341 NET_EPOCH_ENTER(et); 1342 RIB_WLOCK(rnh); 1343 while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) { 1344 CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next); 1345 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1346 &rs->epoch_ctx); 1347 } 1348 RIB_WUNLOCK(rnh); 1349 NET_EPOCH_EXIT(et); 1350 } 1351