1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/vnet.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop_utils.h>
#include <net/route/nhop.h>
#include <net/route/nhop_var.h>
#include <netinet/in.h>

#include <vm/uma.h>

/*
 * This file contains control plane routing tables functions.
 *
 * All functions assumes they are called in net epoch.
 */

/*
 * Registered routing-table change subscriber.
 * Unlinking is deferred via epoch_ctx, so callbacks already running in the
 * current net epoch can finish before the structure is freed.
 */
struct rib_subscription {
	CK_STAILQ_ENTRY(rib_subscription)	next;
	rib_subscription_cb_t			*func;	/* user callback */
	void					*arg;	/* opaque callback arg */
	struct rib_head				*rnh;	/* owning table */
	enum rib_subscription_type		type;
	struct epoch_context			epoch_ctx;
};

static int add_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc);
static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int change_route(struct rib_head *rnh, struct rt_addrinfo *info,
    struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc);

static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);

static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
    struct rib_cmd_info *rc);

static void destroy_subscription_epoch(epoch_context_t ctx);
#ifdef ROUTE_MPATH
static bool rib_can_multipath(struct rib_head *rh);
#endif

/* Per-vnet multipath routing configuration */
SYSCTL_DECL(_net_route);
#define	V_rib_route_multipath	VNET(rib_route_multipath)
/* Sysctl is read-only when the kernel lacks ROUTE_MPATH support. */
#ifdef ROUTE_MPATH
#define _MP_FLAGS	CTLFLAG_RW
#else
#define _MP_FLAGS	CTLFLAG_RD
#endif
VNET_DEFINE(u_int, rib_route_multipath) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
#undef _MP_FLAGS

/* Routing table UMA zone */
VNET_DEFINE_STATIC(uma_zone_t, rtzone);
#define	V_rtzone	VNET(rtzone)

/*
 * Creates the per-vnet UMA zone backing struct rtentry allocations.
 */
void
vnet_rtzone_init()
{

	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
}

#ifdef VIMAGE
/*
 * Destroys the per-vnet rtentry zone on vnet teardown.
 */
void
vnet_rtzone_destroy()
{

	uma_zdestroy(V_rtzone);
}
#endif

/*
 * Final rtentry teardown; invoked from the epoch callback once no readers
 * can reference @rt anymore.
 */
static void
destroy_rtentry(struct rtentry *rt)
{

	/*
	 * At this moment rnh, nh_control may be already freed.
	 * nhop interface may have been migrated to a different vnet.
	 * Use vnet stored in the nexthop to delete the entry.
	 */
	CURVNET_SET(nhop_get_vnet(rt->rt_nhop));

	/* Unreference nexthop */
	nhop_free_any(rt->rt_nhop);

	uma_zfree(V_rtzone, rt);

	CURVNET_RESTORE();
}

/*
 * Epoch callback indicating rtentry is safe to destroy
 */
static void
destroy_rtentry_epoch(epoch_context_t ctx)
{
	struct rtentry *rt;

	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);

	destroy_rtentry(rt);
}

/*
 * Schedule rtentry deletion
 */
static void
rtfree(struct rtentry *rt)
{

	KASSERT(rt != NULL, ("%s: NULL rt", __func__));

	/* Defer the free until the current net epoch grace period ends. */
	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
	    &rt->rt_epoch_ctx);
}

/*
 * Resolves routing table head for @fibnum and the address family of
 * info's RTAX_DST sockaddr. Returns NULL for unsupported families.
 *
 * NOTE(review): the KASSERT message names "rib_add_route", but this helper
 * is shared by the add/del/change paths — consider using __func__.
 */
static struct rib_head *
get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
{
	struct rib_head *rnh;
	struct sockaddr *dst;

	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));

	dst = info->rti_info[RTAX_DST];
	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);

	return (rnh);
}

#ifdef ROUTE_MPATH
/*
 * Returns true if multipath routing is enabled in the vnet owning @rh.
 * Switches to that vnet because the sysctl variable is virtualized.
 */
static bool
rib_can_multipath(struct rib_head *rh)
{
	int result;

	CURVNET_SET(rh->rib_vnet);
	result = !!V_rib_route_multipath;
	CURVNET_RESTORE();

	return (result);
}

/*
 * Check if nhop is multipath-eligible.
 * Avoid nhops without gateways and redirects.
 *
 * Returns 1 for multipath-eligible nexthop,
 * 0 otherwise.
207 */ 208 bool 209 nhop_can_multipath(const struct nhop_object *nh) 210 { 211 212 if ((nh->nh_flags & NHF_MULTIPATH) != 0) 213 return (1); 214 if ((nh->nh_flags & NHF_GATEWAY) == 0) 215 return (0); 216 if ((nh->nh_flags & NHF_REDIRECT) != 0) 217 return (0); 218 219 return (1); 220 } 221 #endif 222 223 static int 224 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) 225 { 226 uint32_t weight; 227 228 if (info->rti_mflags & RTV_WEIGHT) 229 weight = info->rti_rmx->rmx_weight; 230 else 231 weight = default_weight; 232 /* Keep upper 1 byte for adm distance purposes */ 233 if (weight > RT_MAX_WEIGHT) 234 weight = RT_MAX_WEIGHT; 235 236 return (weight); 237 } 238 239 static void 240 rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) 241 { 242 243 /* Kernel -> userland timebase conversion. */ 244 if (info->rti_mflags & RTV_EXPIRE) 245 rt->rt_expire = info->rti_rmx->rmx_expire ? 246 info->rti_rmx->rmx_expire - time_second + time_uptime : 0; 247 } 248 249 /* 250 * Check if specified @gw matches gw data in the nexthop @nh. 251 * 252 * Returns true if matches, false otherwise. 253 */ 254 bool 255 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) 256 { 257 258 if (nh->gw_sa.sa_family != gw->sa_family) 259 return (false); 260 261 switch (gw->sa_family) { 262 case AF_INET: 263 return (nh->gw4_sa.sin_addr.s_addr == 264 ((const struct sockaddr_in *)gw)->sin_addr.s_addr); 265 case AF_INET6: 266 { 267 const struct sockaddr_in6 *gw6; 268 gw6 = (const struct sockaddr_in6 *)gw; 269 270 /* 271 * Currently (2020-09) IPv6 gws in kernel have their 272 * scope embedded. Once this becomes false, this code 273 * has to be revisited. 
274 */ 275 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, 276 &gw6->sin6_addr)) 277 return (true); 278 return (false); 279 } 280 case AF_LINK: 281 { 282 const struct sockaddr_dl *sdl; 283 sdl = (const struct sockaddr_dl *)gw; 284 return (nh->gwl_sa.sdl_index == sdl->sdl_index); 285 } 286 default: 287 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); 288 } 289 290 /* NOTREACHED */ 291 return (false); 292 } 293 294 /* 295 * Checks if data in @info matches nexhop @nh. 296 * 297 * Returns 0 on success, 298 * ESRCH if not matched, 299 * ENOENT if filter function returned false 300 */ 301 int 302 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, 303 const struct nhop_object *nh) 304 { 305 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; 306 307 if (info->rti_filter != NULL) { 308 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) 309 return (ENOENT); 310 else 311 return (0); 312 } 313 if ((gw != NULL) && !match_nhop_gw(nh, gw)) 314 return (ESRCH); 315 316 return (0); 317 } 318 319 /* 320 * Checks if nexhop @nh can be rewritten by data in @info because 321 * of higher "priority". Currently the only case for such scenario 322 * is kernel installing interface routes, marked by RTF_PINNED flag. 323 * 324 * Returns: 325 * 1 if @info data has higher priority 326 * 0 if priority is the same 327 * -1 if priority is lower 328 */ 329 int 330 can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh) 331 { 332 333 if (info->rti_flags & RTF_PINNED) { 334 return (NH_IS_PINNED(nh)) ? 0 : 1; 335 } else { 336 return (NH_IS_PINNED(nh)) ? -1 : 0; 337 } 338 } 339 340 /* 341 * Runs exact prefix match based on @dst and @netmask. 342 * Returns matched @rtentry if found or NULL. 343 * If rtentry was found, saves nexthop / weight value into @rnd. 
344 */ 345 static struct rtentry * 346 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, 347 const struct sockaddr *netmask, struct route_nhop_data *rnd) 348 { 349 struct rtentry *rt; 350 351 RIB_LOCK_ASSERT(rnh); 352 353 rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst), 354 __DECONST(void *, netmask), &rnh->head); 355 if (rt != NULL) { 356 rnd->rnd_nhop = rt->rt_nhop; 357 rnd->rnd_weight = rt->rt_weight; 358 } else { 359 rnd->rnd_nhop = NULL; 360 rnd->rnd_weight = 0; 361 } 362 363 return (rt); 364 } 365 366 /* 367 * Runs exact prefix match based on dst/netmask from @info. 368 * Assumes RIB lock is held. 369 * Returns matched @rtentry if found or NULL. 370 * If rtentry was found, saves nexthop / weight value into @rnd. 371 */ 372 struct rtentry * 373 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, 374 struct route_nhop_data *rnd) 375 { 376 struct rtentry *rt; 377 378 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST], 379 info->rti_info[RTAX_NETMASK], rnd); 380 381 return (rt); 382 } 383 384 /* 385 * Adds route defined by @info into the kernel table specified by @fibnum and 386 * sa_family in @info->rti_info[RTAX_DST]. 387 * 388 * Returns 0 on success and fills in operation metadata into @rc. 389 */ 390 int 391 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, 392 struct rib_cmd_info *rc) 393 { 394 struct rib_head *rnh; 395 int error; 396 397 NET_EPOCH_ASSERT(); 398 399 rnh = get_rnh(fibnum, info); 400 if (rnh == NULL) 401 return (EAFNOSUPPORT); 402 403 /* 404 * Check consistency between RTF_HOST flag and netmask 405 * existence. 
406 */ 407 if (info->rti_flags & RTF_HOST) 408 info->rti_info[RTAX_NETMASK] = NULL; 409 else if (info->rti_info[RTAX_NETMASK] == NULL) 410 return (EINVAL); 411 412 bzero(rc, sizeof(struct rib_cmd_info)); 413 rc->rc_cmd = RTM_ADD; 414 415 error = add_route(rnh, info, rc); 416 if (error == 0) 417 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 418 419 return (error); 420 } 421 422 /* 423 * Creates rtentry and nexthop based on @info data. 424 * Return 0 and fills in rtentry into @prt on success, 425 * return errno otherwise. 426 */ 427 static int 428 create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, 429 struct rtentry **prt) 430 { 431 struct sockaddr *dst, *ndst, *gateway, *netmask; 432 struct rtentry *rt; 433 struct nhop_object *nh; 434 struct ifaddr *ifa; 435 int error, flags; 436 437 dst = info->rti_info[RTAX_DST]; 438 gateway = info->rti_info[RTAX_GATEWAY]; 439 netmask = info->rti_info[RTAX_NETMASK]; 440 flags = info->rti_flags; 441 442 if ((flags & RTF_GATEWAY) && !gateway) 443 return (EINVAL); 444 if (dst && gateway && (dst->sa_family != gateway->sa_family) && 445 (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) 446 return (EINVAL); 447 448 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) 449 return (EINVAL); 450 451 if (info->rti_ifa == NULL) { 452 error = rt_getifa_fib(info, rnh->rib_fibnum); 453 if (error) 454 return (error); 455 } else { 456 ifa_ref(info->rti_ifa); 457 } 458 459 error = nhop_create_from_info(rnh, info, &nh); 460 if (error != 0) { 461 ifa_free(info->rti_ifa); 462 return (error); 463 } 464 465 rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); 466 if (rt == NULL) { 467 ifa_free(info->rti_ifa); 468 nhop_free(nh); 469 return (ENOBUFS); 470 } 471 rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; 472 rt->rt_nhop = nh; 473 474 /* Fill in dst */ 475 memcpy(&rt->rt_dst, dst, dst->sa_len); 476 rt_key(rt) = &rt->rt_dst; 477 478 /* 479 * point to the (possibly newly malloc'd) dest address. 
480 */ 481 ndst = (struct sockaddr *)rt_key(rt); 482 483 /* 484 * make sure it contains the value we want (masked if needed). 485 */ 486 if (netmask) { 487 rt_maskedcopy(dst, ndst, netmask); 488 } else 489 bcopy(dst, ndst, dst->sa_len); 490 491 /* 492 * We use the ifa reference returned by rt_getifa_fib(). 493 * This moved from below so that rnh->rnh_addaddr() can 494 * examine the ifa and ifa->ifa_ifp if it so desires. 495 */ 496 ifa = info->rti_ifa; 497 rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); 498 rt_set_expire_info(rt, info); 499 500 *prt = rt; 501 return (0); 502 } 503 504 static int 505 add_route(struct rib_head *rnh, struct rt_addrinfo *info, 506 struct rib_cmd_info *rc) 507 { 508 struct nhop_object *nh_orig; 509 struct route_nhop_data rnd_orig, rnd_add; 510 struct nhop_object *nh; 511 struct rtentry *rt, *rt_orig; 512 int error; 513 514 error = create_rtentry(rnh, info, &rt); 515 if (error != 0) 516 return (error); 517 518 rnd_add.rnd_nhop = rt->rt_nhop; 519 rnd_add.rnd_weight = rt->rt_weight; 520 nh = rt->rt_nhop; 521 522 RIB_WLOCK(rnh); 523 error = add_route_nhop(rnh, rt, info, &rnd_add, rc); 524 if (error == 0) { 525 RIB_WUNLOCK(rnh); 526 return (0); 527 } 528 529 /* addition failed. Lookup prefix in the rib to determine the cause */ 530 rt_orig = lookup_prefix(rnh, info, &rnd_orig); 531 if (rt_orig == NULL) { 532 /* No prefix -> rnh_addaddr() failed to allocate memory */ 533 RIB_WUNLOCK(rnh); 534 nhop_free(nh); 535 uma_zfree(V_rtzone, rt); 536 return (ENOMEM); 537 } 538 539 /* We have existing route in the RIB. 
*/ 540 nh_orig = rnd_orig.rnd_nhop; 541 /* Check if new route has higher preference */ 542 if (can_override_nhop(info, nh_orig) > 0) { 543 /* Update nexthop to the new route */ 544 change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); 545 RIB_WUNLOCK(rnh); 546 uma_zfree(V_rtzone, rt); 547 nhop_free(nh_orig); 548 return (0); 549 } 550 551 RIB_WUNLOCK(rnh); 552 553 #ifdef ROUTE_MPATH 554 if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && 555 nhop_can_multipath(rnd_orig.rnd_nhop)) 556 error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); 557 else 558 #endif 559 /* Unable to add - another route with the same preference exists */ 560 error = EEXIST; 561 562 /* 563 * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. 564 * ROUTE_MPATH enabled: original nhop reference is unused in any case, 565 * free rt only if not _adding_ new route to rib (e.g. the case 566 * when initial lookup returned existing route, but then it got 567 * deleted prior to multipath group insertion, leading to a simple 568 * non-multipath add as a result). 569 */ 570 nhop_free(nh); 571 if ((error != 0) || rc->rc_cmd != RTM_ADD) 572 uma_zfree(V_rtzone, rt); 573 574 return (error); 575 } 576 577 /* 578 * Removes route defined by @info from the kernel table specified by @fibnum and 579 * sa_family in @info->rti_info[RTAX_DST]. 580 * 581 * Returns 0 on success and fills in operation metadata into @rc. 
582 */ 583 int 584 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) 585 { 586 struct rib_head *rnh; 587 struct sockaddr *dst_orig, *netmask; 588 struct sockaddr_storage mdst; 589 int error; 590 591 NET_EPOCH_ASSERT(); 592 593 rnh = get_rnh(fibnum, info); 594 if (rnh == NULL) 595 return (EAFNOSUPPORT); 596 597 bzero(rc, sizeof(struct rib_cmd_info)); 598 rc->rc_cmd = RTM_DELETE; 599 600 dst_orig = info->rti_info[RTAX_DST]; 601 netmask = info->rti_info[RTAX_NETMASK]; 602 603 if (netmask != NULL) { 604 /* Ensure @dst is always properly masked */ 605 if (dst_orig->sa_len > sizeof(mdst)) 606 return (EINVAL); 607 rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask); 608 info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst; 609 } 610 error = del_route(rnh, info, rc); 611 info->rti_info[RTAX_DST] = dst_orig; 612 613 return (error); 614 } 615 616 /* 617 * Conditionally unlinks rtentry matching data inside @info from @rnh. 618 * Returns 0 on success with operation result stored in @rc. 619 * On error, returns: 620 * ESRCH - if prefix was not found, 621 * EADDRINUSE - if trying to delete higher priority route. 622 * ENOENT - if supplied filter function returned 0 (not matched). 623 */ 624 static int 625 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) 626 { 627 struct rtentry *rt; 628 struct nhop_object *nh; 629 struct radix_node *rn; 630 struct route_nhop_data rnd; 631 int error; 632 633 rt = lookup_prefix(rnh, info, &rnd); 634 if (rt == NULL) 635 return (ESRCH); 636 637 nh = rt->rt_nhop; 638 #ifdef ROUTE_MPATH 639 if (NH_IS_NHGRP(nh)) { 640 error = del_route_mpath(rnh, info, rt, 641 (struct nhgrp_object *)nh, rc); 642 return (error); 643 } 644 #endif 645 error = check_info_match_nhop(info, rt, nh); 646 if (error != 0) 647 return (error); 648 649 if (can_override_nhop(info, nh) < 0) 650 return (EADDRINUSE); 651 652 /* 653 * Remove the item from the tree and return it. 
654 * Complain if it is not there and do no more processing. 655 */ 656 rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], 657 info->rti_info[RTAX_NETMASK], &rnh->head); 658 if (rn == NULL) 659 return (ESRCH); 660 661 if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) 662 panic ("rtrequest delete"); 663 664 rt = RNTORT(rn); 665 rt->rte_flags &= ~RTF_UP; 666 667 /* Finalize notification */ 668 rnh->rnh_gen++; 669 rnh->rnh_prefixes--; 670 671 rc->rc_cmd = RTM_DELETE; 672 rc->rc_rt = rt; 673 rc->rc_nh_old = rt->rt_nhop; 674 rc->rc_nh_weight = rt->rt_weight; 675 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 676 677 return (0); 678 } 679 680 static int 681 del_route(struct rib_head *rnh, struct rt_addrinfo *info, 682 struct rib_cmd_info *rc) 683 { 684 int error; 685 686 RIB_WLOCK(rnh); 687 error = rt_unlinkrte(rnh, info, rc); 688 RIB_WUNLOCK(rnh); 689 if (error != 0) 690 return (error); 691 692 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); 693 694 /* 695 * If the caller wants it, then it can have it, 696 * the entry will be deleted after the end of the current epoch. 697 */ 698 if (rc->rc_cmd == RTM_DELETE) 699 rtfree(rc->rc_rt); 700 #ifdef ROUTE_MPATH 701 else { 702 /* 703 * Deleting 1 path may result in RTM_CHANGE to 704 * a different mpath group/nhop. 705 * Free old mpath group. 706 */ 707 nhop_free_any(rc->rc_nh_old); 708 } 709 #endif 710 711 return (0); 712 } 713 714 int 715 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, 716 struct rib_cmd_info *rc) 717 { 718 RIB_RLOCK_TRACKER; 719 struct route_nhop_data rnd_orig; 720 struct rib_head *rnh; 721 struct rtentry *rt; 722 int error; 723 724 NET_EPOCH_ASSERT(); 725 726 rnh = get_rnh(fibnum, info); 727 if (rnh == NULL) 728 return (EAFNOSUPPORT); 729 730 bzero(rc, sizeof(struct rib_cmd_info)); 731 rc->rc_cmd = RTM_CHANGE; 732 733 /* Check if updated gateway exists */ 734 if ((info->rti_flags & RTF_GATEWAY) && 735 (info->rti_info[RTAX_GATEWAY] == NULL)) { 736 737 /* 738 * route(8) adds RTF_GATEWAY flag if -interface is not set. 
739 * Remove RTF_GATEWAY to enforce consistency and maintain 740 * compatibility.. 741 */ 742 info->rti_flags &= ~RTF_GATEWAY; 743 } 744 745 /* 746 * route change is done in multiple steps, with dropping and 747 * reacquiring lock. In the situations with multiple processes 748 * changes the same route in can lead to the case when route 749 * is changed between the steps. Address it by retrying the operation 750 * multiple times before failing. 751 */ 752 753 RIB_RLOCK(rnh); 754 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], 755 info->rti_info[RTAX_NETMASK], &rnh->head); 756 757 if (rt == NULL) { 758 RIB_RUNLOCK(rnh); 759 return (ESRCH); 760 } 761 762 rnd_orig.rnd_nhop = rt->rt_nhop; 763 rnd_orig.rnd_weight = rt->rt_weight; 764 765 RIB_RUNLOCK(rnh); 766 767 for (int i = 0; i < RIB_MAX_RETRIES; i++) { 768 error = change_route(rnh, info, &rnd_orig, rc); 769 if (error != EAGAIN) 770 break; 771 } 772 773 return (error); 774 } 775 776 static int 777 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, 778 struct nhop_object *nh_orig, struct nhop_object **nh_new) 779 { 780 int free_ifa = 0; 781 int error; 782 783 /* 784 * New gateway could require new ifaddr, ifp; 785 * flags may also be different; ifp may be specified 786 * by ll sockaddr when protocol address is ambiguous 787 */ 788 if (((nh_orig->nh_flags & NHF_GATEWAY) && 789 info->rti_info[RTAX_GATEWAY] != NULL) || 790 info->rti_info[RTAX_IFP] != NULL || 791 (info->rti_info[RTAX_IFA] != NULL && 792 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) { 793 error = rt_getifa_fib(info, rnh->rib_fibnum); 794 if (info->rti_ifa != NULL) 795 free_ifa = 1; 796 797 if (error != 0) { 798 if (free_ifa) { 799 ifa_free(info->rti_ifa); 800 info->rti_ifa = NULL; 801 } 802 803 return (error); 804 } 805 } 806 807 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); 808 if (free_ifa) { 809 ifa_free(info->rti_ifa); 810 info->rti_ifa = NULL; 811 } 812 813 return (error); 814 } 815 816 #ifdef 
ROUTE_MPATH 817 static int 818 change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, 819 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 820 { 821 int error = 0; 822 struct nhop_object *nh, *nh_orig, *nh_new; 823 struct route_nhop_data rnd_new; 824 825 nh = NULL; 826 nh_orig = rnd_orig->rnd_nhop; 827 828 struct weightened_nhop *wn = NULL, *wn_new; 829 uint32_t num_nhops; 830 831 wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); 832 nh_orig = NULL; 833 for (int i = 0; i < num_nhops; i++) { 834 if (check_info_match_nhop(info, NULL, wn[i].nh)) { 835 nh_orig = wn[i].nh; 836 break; 837 } 838 } 839 840 if (nh_orig == NULL) 841 return (ESRCH); 842 843 error = change_nhop(rnh, info, nh_orig, &nh_new); 844 if (error != 0) 845 return (error); 846 847 wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), 848 M_TEMP, M_NOWAIT | M_ZERO); 849 if (wn_new == NULL) { 850 nhop_free(nh_new); 851 return (EAGAIN); 852 } 853 854 memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); 855 for (int i = 0; i < num_nhops; i++) { 856 if (wn[i].nh == nh_orig) { 857 wn[i].nh = nh_new; 858 wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); 859 break; 860 } 861 } 862 863 error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); 864 nhop_free(nh_new); 865 free(wn_new, M_TEMP); 866 867 if (error != 0) 868 return (error); 869 870 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 871 872 return (error); 873 } 874 #endif 875 876 static int 877 change_route(struct rib_head *rnh, struct rt_addrinfo *info, 878 struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) 879 { 880 int error = 0; 881 struct nhop_object *nh, *nh_orig; 882 struct route_nhop_data rnd_new; 883 884 nh = NULL; 885 nh_orig = rnd_orig->rnd_nhop; 886 if (nh_orig == NULL) 887 return (ESRCH); 888 889 #ifdef ROUTE_MPATH 890 if (NH_IS_NHGRP(nh_orig)) 891 return (change_mpath_route(rnh, info, rnd_orig, rc)); 892 #endif 893 894 rnd_new.rnd_weight 
= get_info_weight(info, rnd_orig->rnd_weight); 895 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); 896 if (error != 0) 897 return (error); 898 error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); 899 900 return (error); 901 } 902 903 /* 904 * Insert @rt with nhop data from @rnd_new to @rnh. 905 * Returns 0 on success and stores operation results in @rc. 906 */ 907 static int 908 add_route_nhop(struct rib_head *rnh, struct rtentry *rt, 909 struct rt_addrinfo *info, struct route_nhop_data *rnd, 910 struct rib_cmd_info *rc) 911 { 912 struct sockaddr *ndst, *netmask; 913 struct radix_node *rn; 914 int error = 0; 915 916 RIB_WLOCK_ASSERT(rnh); 917 918 ndst = (struct sockaddr *)rt_key(rt); 919 netmask = info->rti_info[RTAX_NETMASK]; 920 921 rt->rt_nhop = rnd->rnd_nhop; 922 rt->rt_weight = rnd->rnd_weight; 923 rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); 924 925 if (rn != NULL) { 926 if (rt->rt_expire > 0) 927 tmproutes_update(rnh, rt); 928 929 /* Finalize notification */ 930 rnh->rnh_gen++; 931 rnh->rnh_prefixes++; 932 933 rc->rc_cmd = RTM_ADD; 934 rc->rc_rt = rt; 935 rc->rc_nh_old = NULL; 936 rc->rc_nh_new = rnd->rnd_nhop; 937 rc->rc_nh_weight = rnd->rnd_weight; 938 939 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); 940 } else { 941 /* Existing route or memory allocation failure */ 942 error = EEXIST; 943 } 944 945 return (error); 946 } 947 948 /* 949 * Switch @rt nhop/weigh to the ones specified in @rnd. 950 * Conditionally set rt_expire if set in @info. 951 * Returns 0 on success. 
 */
int
change_route_nhop(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd,
    struct rib_cmd_info *rc)
{
	struct nhop_object *nh_orig;

	RIB_WLOCK_ASSERT(rnh);

	nh_orig = rt->rt_nhop;

	if (rnd->rnd_nhop != NULL) {
		/* Changing expiration & nexthop & weight to a new one */
		rt_set_expire_info(rt, info);
		rt->rt_nhop = rnd->rnd_nhop;
		rt->rt_weight = rnd->rnd_weight;
		if (rt->rt_expire > 0)
			tmproutes_update(rnh, rt);
	} else {
		/* Route deletion requested. */
		struct sockaddr *ndst, *netmask;
		struct radix_node *rn;

		ndst = (struct sockaddr *)rt_key(rt);
		netmask = info->rti_info[RTAX_NETMASK];
		rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
		if (rn == NULL)
			return (ESRCH);
		/* Unlinked entry; mark it down for readers still holding it. */
		rt = RNTORT(rn);
		rt->rte_flags &= ~RTF_UP;
	}

	/* Finalize notification */
	rnh->rnh_gen++;
	if (rnd->rnd_nhop == NULL)
		rnh->rnh_prefixes--;

	rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
	rc->rc_rt = rt;
	rc->rc_nh_old = nh_orig;
	rc->rc_nh_new = rnd->rnd_nhop;
	rc->rc_nh_weight = rnd->rnd_weight;

	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}

/*
 * Conditionally update route nhop/weight IFF data in @nhd_orig is
 * consistent with the current route data.
 * Nexthop in @nhd_new is consumed.
 */
int
change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct route_nhop_data *rnd_new, struct rib_cmd_info *rc)
{
	struct rtentry *rt_new;
	int error = 0;

	RIB_WLOCK(rnh);

	/* Re-lookup: the route may have changed since @rnd_orig was read. */
	rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);

	if (rt_new == NULL) {
		if (rnd_orig->rnd_nhop == NULL)
			error = add_route_nhop(rnh, rt, info, rnd_new, rc);
		else {
			/*
			 * Prefix does not exist, which was not our assumption.
			 * Update @rnd_orig with the new data and return
			 */
			rnd_orig->rnd_nhop = NULL;
			rnd_orig->rnd_weight = 0;
			error = EAGAIN;
		}
	} else {
		/* Prefix exists, try to update */
		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
			/*
			 * Nhop/mpath group hasn't changed. Flip
			 * to the new precalculated one and return
			 */
			error = change_route_nhop(rnh, rt_new, info, rnd_new, rc);
		} else {
			/* Update and retry */
			rnd_orig->rnd_nhop = rt_new->rt_nhop;
			rnd_orig->rnd_weight = rt_new->rt_weight;
			error = EAGAIN;
		}
	}

	RIB_WUNLOCK(rnh);

	if (error == 0) {
		/* Success: the old nexthop reference is no longer needed. */
		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

		if (rnd_orig->rnd_nhop != NULL)
			nhop_free_any(rnd_orig->rnd_nhop);

	} else {
		/* Failure: release the precalculated (consumed) new nexthop. */
		if (rnd_new->rnd_nhop != NULL)
			nhop_free_any(rnd_new->rnd_nhop);
	}

	return (error);
}

/*
 * Performs modification of routing table specified by @action.
 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
 * Needs to be run in network epoch.
 *
 * Returns 0 on success and fills in @rc with action result.
1069 */ 1070 int 1071 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, 1072 struct rib_cmd_info *rc) 1073 { 1074 int error; 1075 1076 switch (action) { 1077 case RTM_ADD: 1078 error = rib_add_route(fibnum, info, rc); 1079 break; 1080 case RTM_DELETE: 1081 error = rib_del_route(fibnum, info, rc); 1082 break; 1083 case RTM_CHANGE: 1084 error = rib_change_route(fibnum, info, rc); 1085 break; 1086 default: 1087 error = ENOTSUP; 1088 } 1089 1090 return (error); 1091 } 1092 1093 struct rt_delinfo 1094 { 1095 struct rt_addrinfo info; 1096 struct rib_head *rnh; 1097 struct rtentry *head; 1098 struct rib_cmd_info rc; 1099 }; 1100 1101 /* 1102 * Conditionally unlinks @rn from radix tree based 1103 * on info data passed in @arg. 1104 */ 1105 static int 1106 rt_checkdelroute(struct radix_node *rn, void *arg) 1107 { 1108 struct rt_delinfo *di; 1109 struct rt_addrinfo *info; 1110 struct rtentry *rt; 1111 int error; 1112 1113 di = (struct rt_delinfo *)arg; 1114 rt = (struct rtentry *)rn; 1115 info = &di->info; 1116 1117 info->rti_info[RTAX_DST] = rt_key(rt); 1118 info->rti_info[RTAX_NETMASK] = rt_mask(rt); 1119 1120 error = rt_unlinkrte(di->rnh, info, &di->rc); 1121 1122 /* 1123 * Add deleted rtentries to the list to GC them 1124 * after dropping the lock. 1125 * 1126 * XXX: Delayed notifications not implemented 1127 * for nexthop updates. 1128 */ 1129 if ((error == 0) && (di->rc.rc_cmd == RTM_DELETE)) { 1130 /* Add to the list and return */ 1131 rt->rt_chain = di->head; 1132 di->head = rt; 1133 } 1134 1135 return (0); 1136 } 1137 1138 /* 1139 * Iterates over a routing table specified by @fibnum and @family and 1140 * deletes elements marked by @filter_f. 1141 * @fibnum: rtable id 1142 * @family: AF_ address family 1143 * @filter_f: function returning non-zero value for items to delete 1144 * @arg: data to pass to the @filter_f function 1145 * @report: true if rtsock notification is needed. 
1146 */ 1147 void 1148 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report) 1149 { 1150 struct rib_head *rnh; 1151 struct rt_delinfo di; 1152 struct rtentry *rt; 1153 struct nhop_object *nh; 1154 struct epoch_tracker et; 1155 1156 rnh = rt_tables_get_rnh(fibnum, family); 1157 if (rnh == NULL) 1158 return; 1159 1160 bzero(&di, sizeof(di)); 1161 di.info.rti_filter = filter_f; 1162 di.info.rti_filterdata = arg; 1163 di.rnh = rnh; 1164 di.rc.rc_cmd = RTM_DELETE; 1165 1166 NET_EPOCH_ENTER(et); 1167 1168 RIB_WLOCK(rnh); 1169 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); 1170 RIB_WUNLOCK(rnh); 1171 1172 /* We might have something to reclaim. */ 1173 bzero(&di.rc, sizeof(di.rc)); 1174 di.rc.rc_cmd = RTM_DELETE; 1175 while (di.head != NULL) { 1176 rt = di.head; 1177 di.head = rt->rt_chain; 1178 rt->rt_chain = NULL; 1179 nh = rt->rt_nhop; 1180 1181 di.rc.rc_rt = rt; 1182 di.rc.rc_nh_old = nh; 1183 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); 1184 1185 /* TODO std rt -> rt_addrinfo export */ 1186 di.info.rti_info[RTAX_DST] = rt_key(rt); 1187 di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); 1188 1189 if (report) { 1190 #ifdef ROUTE_MPATH 1191 struct nhgrp_object *nhg; 1192 struct weightened_nhop *wn; 1193 uint32_t num_nhops; 1194 if (NH_IS_NHGRP(nh)) { 1195 nhg = (struct nhgrp_object *)nh; 1196 wn = nhgrp_get_nhops(nhg, &num_nhops); 1197 for (int i = 0; i < num_nhops; i++) 1198 rt_routemsg(RTM_DELETE, rt, 1199 wn[i].nh->nh_ifp, 0, fibnum); 1200 } else 1201 #endif 1202 rt_routemsg(RTM_DELETE, rt, nh->nh_ifp, 0, fibnum); 1203 } 1204 rtfree(rt); 1205 } 1206 1207 NET_EPOCH_EXIT(et); 1208 } 1209 1210 static void 1211 rib_notify(struct rib_head *rnh, enum rib_subscription_type type, 1212 struct rib_cmd_info *rc) 1213 { 1214 struct rib_subscription *rs; 1215 1216 CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { 1217 if (rs->type == type) 1218 rs->func(rnh, rc, rs->arg); 1219 } 1220 } 1221 1222 static struct rib_subscription * 1223 
allocate_subscription(rib_subscription_cb_t *f, void *arg, 1224 enum rib_subscription_type type, bool waitok) 1225 { 1226 struct rib_subscription *rs; 1227 int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT); 1228 1229 rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); 1230 if (rs == NULL) 1231 return (NULL); 1232 1233 rs->func = f; 1234 rs->arg = arg; 1235 rs->type = type; 1236 1237 return (rs); 1238 } 1239 1240 /* 1241 * Subscribe for the changes in the routing table specified by @fibnum and 1242 * @family. 1243 * 1244 * Returns pointer to the subscription structure on success. 1245 */ 1246 struct rib_subscription * 1247 rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, 1248 enum rib_subscription_type type, bool waitok) 1249 { 1250 struct rib_head *rnh; 1251 struct epoch_tracker et; 1252 1253 NET_EPOCH_ENTER(et); 1254 KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); 1255 rnh = rt_tables_get_rnh(fibnum, family); 1256 NET_EPOCH_EXIT(et); 1257 1258 return (rib_subscribe_internal(rnh, f, arg, type, waitok)); 1259 } 1260 1261 struct rib_subscription * 1262 rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, 1263 enum rib_subscription_type type, bool waitok) 1264 { 1265 struct rib_subscription *rs; 1266 struct epoch_tracker et; 1267 1268 if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) 1269 return (NULL); 1270 rs->rnh = rnh; 1271 1272 NET_EPOCH_ENTER(et); 1273 RIB_WLOCK(rnh); 1274 CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next); 1275 RIB_WUNLOCK(rnh); 1276 NET_EPOCH_EXIT(et); 1277 1278 return (rs); 1279 } 1280 1281 /* 1282 * Remove rtable subscription @rs from the routing table. 1283 * Needs to be run in network epoch. 
1284 */ 1285 void 1286 rib_unsibscribe(struct rib_subscription *rs) 1287 { 1288 struct rib_head *rnh = rs->rnh; 1289 1290 NET_EPOCH_ASSERT(); 1291 1292 RIB_WLOCK(rnh); 1293 CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); 1294 RIB_WUNLOCK(rnh); 1295 1296 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1297 &rs->epoch_ctx); 1298 } 1299 1300 /* 1301 * Epoch callback indicating subscription is safe to destroy 1302 */ 1303 static void 1304 destroy_subscription_epoch(epoch_context_t ctx) 1305 { 1306 struct rib_subscription *rs; 1307 1308 rs = __containerof(ctx, struct rib_subscription, epoch_ctx); 1309 1310 free(rs, M_RTABLE); 1311 } 1312 1313 void 1314 rib_init_subscriptions(struct rib_head *rnh) 1315 { 1316 1317 CK_STAILQ_INIT(&rnh->rnh_subscribers); 1318 } 1319 1320 void 1321 rib_destroy_subscriptions(struct rib_head *rnh) 1322 { 1323 struct rib_subscription *rs; 1324 struct epoch_tracker et; 1325 1326 NET_EPOCH_ENTER(et); 1327 RIB_WLOCK(rnh); 1328 while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) { 1329 CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next); 1330 epoch_call(net_epoch_preempt, destroy_subscription_epoch, 1331 &rs->epoch_ctx); 1332 } 1333 RIB_WUNLOCK(rnh); 1334 NET_EPOCH_EXIT(et); 1335 } 1336