1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/jail.h> 36 #include <sys/systm.h> 37 #include <sys/malloc.h> 38 #include <sys/mbuf.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 #include <sys/syslog.h> 42 #include <sys/sysproto.h> 43 #include <sys/proc.h> 44 #include <sys/domain.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/rmlock.h> 48 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/if_dl.h> 52 #include <net/route.h> 53 #include <net/route/route_ctl.h> 54 #include <net/route/route_var.h> 55 #include <net/route/nhop_utils.h> 56 #include <net/route/nhop.h> 57 #include <net/route/nhop_var.h> 58 #ifdef INET 59 #include <netinet/in_fib.h> 60 #endif 61 #ifdef INET6 62 #include <netinet6/in6_fib.h> 63 #include <netinet6/in6_var.h> 64 #endif 65 #include <net/vnet.h> 66 67 #define DEBUG_MOD_NAME rt_helpers 68 #define DEBUG_MAX_LEVEL LOG_DEBUG2 69 #include <net/route/route_debug.h> 70 _DECLARE_DEBUG(LOG_INFO); 71 72 /* 73 * RIB helper functions. 74 */ 75 76 void 77 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, 78 rib_walk_hook_f_t *hook_f, void *arg) 79 { 80 if (hook_f != NULL) 81 hook_f(rnh, RIB_WALK_HOOK_PRE, arg); 82 rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); 83 if (hook_f != NULL) 84 hook_f(rnh, RIB_WALK_HOOK_POST, arg); 85 } 86 87 /* 88 * Calls @wa_f with @arg for each entry in the table specified by 89 * @af and @fibnum. 90 * 91 * @ss_t callback is called before and after the tree traversal 92 * while holding table lock. 93 * 94 * Table is traversed under read lock unless @wlock is set. 95 */ 96 void 97 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, 98 rib_walk_hook_f_t *hook_f, void *arg) 99 { 100 RIB_RLOCK_TRACKER; 101 102 if (wlock) 103 RIB_WLOCK(rnh); 104 else 105 RIB_RLOCK(rnh); 106 rib_walk_ext_locked(rnh, wa_f, hook_f, arg); 107 if (wlock) 108 RIB_WUNLOCK(rnh); 109 else 110 RIB_RUNLOCK(rnh); 111 } 112 113 void 114 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 115 rib_walk_hook_f_t *hook_f, void *arg) 116 { 117 struct rib_head *rnh; 118 119 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) 120 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); 121 } 122 123 /* 124 * Calls @wa_f with @arg for each entry in the table specified by 125 * @af and @fibnum. 126 * 127 * Table is traversed under read lock unless @wlock is set. 128 */ 129 void 130 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 131 void *arg) 132 { 133 134 rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg); 135 } 136 137 /* 138 * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask. 139 * 140 * The following flags are supported: 141 * RIB_FLAG_WLOCK: acquire exclusive lock 142 * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking 143 * 144 * By default, table is traversed under read lock. 145 */ 146 void 147 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, 148 struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg) 149 { 150 RIB_RLOCK_TRACKER; 151 struct rib_head *rnh = rt_tables_get_rnh(fibnum, family); 152 153 if (rnh == NULL) 154 return; 155 156 if (flags & RIB_FLAG_WLOCK) 157 RIB_WLOCK(rnh); 158 else if (!(flags & RIB_FLAG_LOCKED)) 159 RIB_RLOCK(rnh); 160 161 rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg); 162 163 if (flags & RIB_FLAG_WLOCK) 164 RIB_WUNLOCK(rnh); 165 else if (!(flags & RIB_FLAG_LOCKED)) 166 RIB_RUNLOCK(rnh); 167 } 168 169 /* 170 * Iterates over all existing fibs in system calling 171 * @hook_f function before/after traversing each fib. 172 * Calls @wa_f function for each element in current fib. 173 * If af is not AF_UNSPEC, iterates over fibs in particular 174 * address family. 175 */ 176 void 177 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, 178 rib_walk_hook_f_t *hook_f, void *arg) 179 { 180 181 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 182 /* Do we want some specific family? */ 183 if (family != AF_UNSPEC) { 184 rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); 185 continue; 186 } 187 188 for (int i = 1; i <= AF_MAX; i++) 189 rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg); 190 } 191 } 192 193 /* 194 * Iterates over all existing fibs in system and deletes each element 195 * for which @filter_f function returns non-zero value. 196 * If @family is not AF_UNSPEC, iterates over fibs in particular 197 * address family. 198 */ 199 void 200 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg) 201 { 202 203 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 204 /* Do we want some specific family? */ 205 if (family != AF_UNSPEC) { 206 rib_walk_del(fibnum, family, filter_f, arg, 0); 207 continue; 208 } 209 210 for (int i = 1; i <= AF_MAX; i++) 211 rib_walk_del(fibnum, i, filter_f, arg, 0); 212 } 213 } 214 215 216 /* 217 * Wrapper for the control plane functions for performing af-agnostic 218 * lookups. 219 * @fibnum: fib to perform the lookup. 220 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in 221 * deembedded from. 222 * @flags: fib(9) flags. 223 * @flowid: flow id for path selection in multipath use case. 224 * 225 * Returns nhop_object or NULL. 226 * 227 * Requires NET_EPOCH. 228 * 229 */ 230 struct nhop_object * 231 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, 232 uint32_t flowid) 233 { 234 struct nhop_object *nh; 235 236 nh = NULL; 237 238 switch (dst->sa_family) { 239 #ifdef INET 240 case AF_INET: 241 { 242 const struct sockaddr_in *a = (const struct sockaddr_in *)dst; 243 nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid); 244 break; 245 } 246 #endif 247 #ifdef INET6 248 case AF_INET6: 249 { 250 const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst; 251 nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id, 252 flags, flowid); 253 break; 254 } 255 #endif 256 } 257 258 return (nh); 259 } 260 261 #ifdef ROUTE_MPATH 262 static void 263 notify_add(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src, 264 route_notification_t *cb, void *cbdata) { 265 rc->rc_nh_new = wn_src->nh; 266 rc->rc_nh_weight = wn_src->weight; 267 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2 268 char nhbuf[NHOP_PRINT_BUFSIZE]; 269 FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_ADD for %s @ w=%u", 270 nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight); 271 #endif 272 cb(rc, cbdata); 273 } 274 275 static void 276 notify_del(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src, 277 route_notification_t *cb, void *cbdata) { 278 rc->rc_nh_old = wn_src->nh; 279 rc->rc_nh_weight = wn_src->weight; 280 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2 281 char nhbuf[NHOP_PRINT_BUFSIZE]; 282 FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_DEL for %s @ w=%u", 283 nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight); 284 #endif 285 cb(rc, cbdata); 286 } 287 288 static void 289 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, 290 void *cbdata) 291 { 292 uint32_t num_old, num_new; 293 const struct weightened_nhop *wn_old, *wn_new; 294 struct weightened_nhop tmp = { NULL, 0 }; 295 uint32_t idx_old = 0, idx_new = 0; 296 297 struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; 298 struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; 299 300 if (NH_IS_NHGRP(rc->rc_nh_old)) { 301 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); 302 } else { 303 tmp.nh = rc->rc_nh_old; 304 tmp.weight = rc->rc_nh_weight; 305 wn_old = &tmp; 306 num_old = 1; 307 } 308 if (NH_IS_NHGRP(rc->rc_nh_new)) { 309 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); 310 } else { 311 tmp.nh = rc->rc_nh_new; 312 tmp.weight = rc->rc_nh_weight; 313 wn_new = &tmp; 314 num_new = 1; 315 } 316 #if DEBUG_MAX_LEVEL >= LOG_DEBUG 317 { 318 char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE]; 319 nhop_print_buf_any(rc->rc_nh_old, buf_old, NHOP_PRINT_BUFSIZE); 320 nhop_print_buf_any(rc->rc_nh_new, buf_new, NHOP_PRINT_BUFSIZE); 321 FIB_NH_LOG(LOG_DEBUG, wn_old[0].nh, "change %s -> %s", buf_old, buf_new); 322 } 323 #endif 324 325 /* Use the fact that each @wn array is sorted */ 326 /* 327 * Here we have one (or two) multipath groups and transition 328 * between them needs to be reported to the caller, using series 329 * of primitive (RTM_DEL, RTM_ADD) operations. 330 * 331 * Leverage the fact that each nexthop group has its nexthops sorted 332 * by their indices. 333 * [1] -> [1, 2] = A{2} 334 * [1, 2] -> [1] = D{2} 335 * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3} 336 * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4] 337 */ 338 while ((idx_old < num_old) && (idx_new < num_new)) { 339 uint32_t nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; 340 uint32_t nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; 341 342 if (nh_idx_old == nh_idx_new) { 343 if (wn_old[idx_old].weight != wn_new[idx_new].weight) { 344 /* Update weight by providing del/add notifications */ 345 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); 346 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); 347 } 348 idx_old++; 349 idx_new++; 350 } else if (nh_idx_old < nh_idx_new) { 351 /* [1, ~2~, 4], [1, ~3~, 4] */ 352 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); 353 idx_old++; 354 } else { 355 /* nh_idx_old > nh_idx_new. */ 356 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); 357 idx_new++; 358 } 359 } 360 361 while (idx_old < num_old) { 362 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata); 363 idx_old++; 364 } 365 366 while (idx_new < num_new) { 367 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata); 368 idx_new++; 369 } 370 } 371 372 /* 373 * Decompose multipath cmd info @rc into a list of add/del/change 374 * single-path operations, calling @cb callback for each operation. 375 * Assumes at least one of the nexthops in @rc is multipath. 376 */ 377 void 378 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, 379 void *cbdata) 380 { 381 const struct weightened_nhop *wn; 382 uint32_t num_nhops; 383 struct rib_cmd_info rc_new; 384 385 rc_new = *rc; 386 switch (rc->rc_cmd) { 387 case RTM_ADD: 388 if (!NH_IS_NHGRP(rc->rc_nh_new)) 389 return; 390 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); 391 for (uint32_t i = 0; i < num_nhops; i++) { 392 notify_add(&rc_new, &wn[i], cb, cbdata); 393 } 394 break; 395 case RTM_DELETE: 396 if (!NH_IS_NHGRP(rc->rc_nh_old)) 397 return; 398 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); 399 for (uint32_t i = 0; i < num_nhops; i++) { 400 notify_del(&rc_new, &wn[i], cb, cbdata); 401 } 402 break; 403 case RTM_CHANGE: 404 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) 405 return; 406 decompose_change_notification(rc, cb, cbdata); 407 break; 408 } 409 } 410 #endif 411 412 #ifdef INET 413 /* 414 * Checks if the found key in the trie contains (<=) a prefix covering 415 * @paddr/@plen. 416 * Returns the most specific rtentry matching the condition or NULL. 417 */ 418 static struct rtentry * 419 get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen) 420 { 421 struct route_nhop_data rnd; 422 struct rtentry *rt; 423 struct in_addr addr4; 424 uint32_t scopeid; 425 int parent_plen; 426 struct radix_node *rn; 427 428 rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd); 429 if (rt == NULL) 430 return (NULL); 431 432 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 433 if (parent_plen <= plen) 434 return (rt); 435 436 /* 437 * There can be multiple prefixes associated with the found key: 438 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc. 439 * All such prefixes are linked via rn_dupedkey, from most specific 440 * to least specific. Iterate over them to check if any of these 441 * prefixes are wider than desired plen. 442 */ 443 rn = (struct radix_node *)rt; 444 while ((rn = rn_nextprefix(rn)) != NULL) { 445 rt = RNTORT(rn); 446 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 447 if (parent_plen <= plen) 448 return (rt); 449 } 450 451 return (NULL); 452 } 453 454 /* 455 * Returns the most specific prefix containing (>) @paddr/plen. 456 */ 457 struct rtentry * 458 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen) 459 { 460 struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST }; 461 struct in_addr addr4 = addr; 462 struct in_addr mask4; 463 struct rtentry *rt; 464 465 while (plen-- > 0) { 466 /* Calculate wider mask & new key to lookup */ 467 mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); 468 addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr)); 469 if (addr4.s_addr == lookup_addr.s_addr) { 470 /* Skip lookup if the key is the same */ 471 continue; 472 } 473 lookup_addr = addr4; 474 475 rt = get_inet_parent_prefix(fibnum, lookup_addr, plen); 476 if (rt != NULL) 477 return (rt); 478 } 479 480 return (NULL); 481 } 482 #endif 483 484 #ifdef INET6 485 /* 486 * Checks if the found key in the trie contains (<=) a prefix covering 487 * @paddr/@plen. 488 * Returns the most specific rtentry matching the condition or NULL. 489 */ 490 static struct rtentry * 491 get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen) 492 { 493 struct route_nhop_data rnd; 494 struct rtentry *rt; 495 struct in6_addr addr6; 496 uint32_t scopeid; 497 int parent_plen; 498 struct radix_node *rn; 499 500 rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd); 501 if (rt == NULL) 502 return (NULL); 503 504 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 505 if (parent_plen <= plen) 506 return (rt); 507 508 /* 509 * There can be multiple prefixes associated with the found key: 510 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc. 511 * All such prefixes are linked via rn_dupedkey, from most specific 512 * to least specific. Iterate over them to check if any of these 513 * prefixes are wider than desired plen. 514 */ 515 rn = (struct radix_node *)rt; 516 while ((rn = rn_nextprefix(rn)) != NULL) { 517 rt = RNTORT(rn); 518 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 519 if (parent_plen <= plen) 520 return (rt); 521 } 522 523 return (NULL); 524 } 525 526 static void 527 ipv6_writemask(struct in6_addr *addr6, uint8_t mask) 528 { 529 uint32_t *cp; 530 531 for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) 532 *cp++ = 0xFFFFFFFF; 533 if (mask > 0) 534 *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); 535 } 536 537 /* 538 * Returns the most specific prefix containing (>) @paddr/plen. 539 */ 540 struct rtentry * 541 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen) 542 { 543 struct in6_addr lookup_addr = in6mask128; 544 struct in6_addr addr6 = *paddr; 545 struct in6_addr mask6; 546 struct rtentry *rt; 547 548 while (plen-- > 0) { 549 /* Calculate wider mask & new key to lookup */ 550 ipv6_writemask(&mask6, plen); 551 IN6_MASK_ADDR(&addr6, &mask6); 552 if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) { 553 /* Skip lookup if the key is the same */ 554 continue; 555 } 556 lookup_addr = addr6; 557 558 rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen); 559 if (rt != NULL) 560 return (rt); 561 } 562 563 return (NULL); 564 } 565 #endif 566 567 /* 568 * Prints rtentry @rt data in the provided @buf. 569 * Example: rt/192.168.0.0/24 570 */ 571 char * 572 rt_print_buf(const struct rtentry *rt, char *buf, size_t bufsize) 573 { 574 #if defined(INET) || defined(INET6) 575 char abuf[INET6_ADDRSTRLEN]; 576 uint32_t scopeid; 577 int plen; 578 #endif 579 580 switch (rt_get_family(rt)) { 581 #ifdef INET 582 case AF_INET: 583 { 584 struct in_addr addr4; 585 rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid); 586 inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf)); 587 snprintf(buf, bufsize, "rt/%s/%d", abuf, plen); 588 } 589 break; 590 #endif 591 #ifdef INET6 592 case AF_INET6: 593 { 594 struct in6_addr addr6; 595 rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid); 596 inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf)); 597 snprintf(buf, bufsize, "rt/%s/%d", abuf, plen); 598 } 599 break; 600 #endif 601 default: 602 snprintf(buf, bufsize, "rt/unknown_af#%d", rt_get_family(rt)); 603 break; 604 } 605 606 return (buf); 607 } 608 609 const char * 610 rib_print_cmd(int rib_cmd) 611 { 612 switch (rib_cmd) { 613 case RTM_ADD: 614 return ("RTM_ADD"); 615 case RTM_CHANGE: 616 return ("RTM_CHANGE"); 617 case RTM_DELETE: 618 return ("RTM_DELETE"); 619 case RTM_GET: 620 return ("RTM_GET"); 621 } 622 623 return ("UNKNOWN"); 624 } 625