1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/nhop_utils.h>
#include <net/route/nhop.h>
#include <net/route/nhop_var.h>
#ifdef INET
#include <netinet/in_fib.h>
#endif
#ifdef INET6
#include <netinet6/in6_fib.h>
#include <netinet6/in6_var.h>
#endif
#include <net/vnet.h>

#define DEBUG_MOD_NAME rt_helpers
#define DEBUG_MAX_LEVEL LOG_DEBUG2
#include <net/route/route_debug.h>
_DECLARE_DEBUG(LOG_INFO);

/*
 * RIB helper functions.
 */

/*
 * Calls @wa_f with @arg for each entry in @rnh, invoking @hook_f (if
 * non-NULL) before and after the traversal.
 * Assumes the caller already holds the table lock.
 */
void
rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg)
{
	if (hook_f != NULL)
		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
	if (hook_f != NULL)
		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
}

/*
 * Calls @wa_f with @arg for each entry in the table specified by
 * @af and @fibnum.
 *
 * @hook_f callback is called before and after the tree traversal
 * while holding table lock.
 *
 * Table is traversed under read lock unless @wlock is set.
 */
void
rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg)
{
	RIB_RLOCK_TRACKER;

	if (wlock)
		RIB_WLOCK(rnh);
	else
		RIB_RLOCK(rnh);
	rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
	if (wlock)
		RIB_WUNLOCK(rnh);
	else
		RIB_RUNLOCK(rnh);
}

/*
 * Looks up the routing table for (@fibnum, @family) and, if it exists,
 * walks it via rib_walk_ext_internal(); silently does nothing otherwise.
 */
void
rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg)
{
	struct rib_head *rnh;

	if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
		rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
}

/*
 * Calls @wa_f with @arg for each entry in the table specified by
 * @af and @fibnum.
 *
 * Table is traversed under read lock unless @wlock is set.
 */
void
rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
    void *arg)
{

	rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
}

/*
 * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask.
 *
 * The following flags are supported:
 *  RIB_FLAG_WLOCK: acquire exclusive lock
 *  RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking
 *
 * By default, table is traversed under read lock.
 */
void
rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix,
    struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg)
{
	RIB_RLOCK_TRACKER;
	struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);

	if (rnh == NULL)
		return;

	if (flags & RIB_FLAG_WLOCK)
		RIB_WLOCK(rnh);
	else if (!(flags & RIB_FLAG_LOCKED))
		RIB_RLOCK(rnh);

	rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg);

	if (flags & RIB_FLAG_WLOCK)
		RIB_WUNLOCK(rnh);
	else if (!(flags & RIB_FLAG_LOCKED))
		RIB_RUNLOCK(rnh);
}

/*
 * Iterates over all existing fibs in system calling
 * @hook_f function before/after traversing each fib.
 * Calls @wa_f function for each element in current fib.
 * If af is not AF_UNSPEC, iterates over fibs in particular
 * address family.
 */
void
rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg)
{

	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
		/* Do we want some specific family? */
		if (family != AF_UNSPEC) {
			rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg);
			continue;
		}

		/* AF_UNSPEC: walk every address family in this fib. */
		for (int i = 1; i <= AF_MAX; i++)
			rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
	}
}

/*
 * Iterates over all existing fibs in system and deletes each element
 * for which @filter_f function returns non-zero value.
 * If @family is not AF_UNSPEC, iterates over fibs in particular
 * address family.
 */
void
rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
{

	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
		/* Do we want some specific family? */
		if (family != AF_UNSPEC) {
			rib_walk_del(fibnum, family, filter_f, arg, 0);
			continue;
		}

		/* AF_UNSPEC: filter every address family in this fib. */
		for (int i = 1; i <= AF_MAX; i++)
			rib_walk_del(fibnum, i, filter_f, arg, 0);
	}
}


/*
 * Wrapper for the control plane functions for performing af-agnostic
 * lookups.
 * @fibnum: fib to perform the lookup.
 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
 *  deembedded from.
 * @flags: fib(9) flags.
 * @flowid: flow id for path selection in multipath use case.
 *
 * Returns nhop_object or NULL (also NULL for unsupported families).
 *
 * Requires NET_EPOCH.
 *
 */
struct nhop_object *
rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
    uint32_t flowid)
{
	struct nhop_object *nh;

	nh = NULL;

	switch (dst->sa_family) {
#ifdef INET
	case AF_INET:
	{
		const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
		nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
		break;
	}
#endif
#ifdef INET6
	case AF_INET6:
	{
		const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
		nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
		    flags, flowid);
		break;
	}
#endif
	}

	return (nh);
}

#ifdef ROUTE_MPATH
/*
 * Reports a single RTM_ADD-equivalent notification for nexthop @wn_src
 * via @cb, filling the new-nexthop fields of the pre-initialized @rc.
 */
static void
notify_add(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
    route_notification_t *cb, void *cbdata) {
	rc->rc_nh_new = wn_src->nh;
	rc->rc_nh_weight = wn_src->weight;
#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
	char nhbuf[NHOP_PRINT_BUFSIZE];
	FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_ADD for %s @ w=%u",
	    nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
#endif
	cb(rc, cbdata);
}

/*
 * Same as notify_add(), but reports an RTM_DELETE-equivalent
 * notification (fills the old-nexthop fields of @rc).
 */
static void
notify_del(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
    route_notification_t *cb, void *cbdata) {
	rc->rc_nh_old = wn_src->nh;
	rc->rc_nh_weight = wn_src->weight;

#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
	char nhbuf[NHOP_PRINT_BUFSIZE];
	FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_DEL for %s @ w=%u",
	    nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
#endif
	cb(rc, cbdata);
}

/*
 * Decomposes an RTM_CHANGE between (possibly multipath) nexthops
 * @rc->rc_nh_old and @rc->rc_nh_new into a series of RTM_DELETE/RTM_ADD
 * notifications delivered via @cb.
 *
 * NOTE(review): both non-group branches below stash their nexthop in the
 * single @tmp slot, so at most one of the two endpoints may be a bare
 * (non-group) nexthop.  The caller (rib_decompose_notification) only
 * invokes this when at least one endpoint is a group, which upholds that.
 */
static void
decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
    void *cbdata)
{
	uint32_t num_old, num_new;
	struct weightened_nhop *wn_old, *wn_new;
	struct weightened_nhop tmp = { NULL, 0 };
	uint32_t idx_old = 0, idx_new = 0;

	struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
	struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };

	if (NH_IS_NHGRP(rc->rc_nh_old)) {
		wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
	} else {
		tmp.nh = rc->rc_nh_old;
		tmp.weight = rc->rc_nh_weight;
		wn_old = &tmp;
		num_old = 1;
	}
	if (NH_IS_NHGRP(rc->rc_nh_new)) {
		wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
	} else {
		tmp.nh = rc->rc_nh_new;
		tmp.weight = rc->rc_nh_weight;
		wn_new = &tmp;
		num_new = 1;
	}
#if DEBUG_MAX_LEVEL >= LOG_DEBUG
	{
		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
		nhop_print_buf_any(rc->rc_nh_old, buf_old, NHOP_PRINT_BUFSIZE);
		nhop_print_buf_any(rc->rc_nh_new, buf_new, NHOP_PRINT_BUFSIZE);
		FIB_NH_LOG(LOG_DEBUG, wn_old[0].nh, "change %s -> %s", buf_old, buf_new);
	}
#endif

	/* Use the fact that each @wn array is sorted */
	/*
	 * Here we have one (or two) multipath groups and transition
	 * between them needs to be reported to the caller, using series
	 * of primitive (RTM_DEL, RTM_ADD) operations.
	 *
	 * Leverage the fact that each nexthop group has its nexthops sorted
	 * by their indices.
	 * [1] -> [1, 2] = A{2}
	 * [1, 2] -> [1] = D{2}
	 * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3}
	 * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4]
	 */
	while ((idx_old < num_old) && (idx_new < num_new)) {
		uint32_t nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
		uint32_t nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;

		if (nh_idx_old == nh_idx_new) {
			if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
				/* Update weight by providing del/add notifications */
				notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
				notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
			}
			idx_old++;
			idx_new++;
		} else if (nh_idx_old < nh_idx_new) {
			/* [1, ~2~, 4], [1, ~3~, 4] */
			notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
			idx_old++;
		} else {
			/* nh_idx_old > nh_idx_new. */
			notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
			idx_new++;
		}
	}

	/* Drain remaining old-only nexthops as deletions. */
	while (idx_old < num_old) {
		notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
		idx_old++;
	}

	/* Drain remaining new-only nexthops as additions. */
	while (idx_new < num_new) {
		notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
		idx_new++;
	}
}

/*
 * Decompose multipath cmd info @rc into a list of add/del/change
 * single-path operations, calling @cb callback for each operation.
 * Assumes at least one of the nexthops in @rc is multipath.
376 */ 377 void 378 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, 379 void *cbdata) 380 { 381 struct weightened_nhop *wn; 382 uint32_t num_nhops; 383 struct rib_cmd_info rc_new; 384 385 rc_new = *rc; 386 DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", 387 cb, rc->cmd, rc->nh_old, rc->nh_new); 388 switch (rc->rc_cmd) { 389 case RTM_ADD: 390 if (!NH_IS_NHGRP(rc->rc_nh_new)) 391 return; 392 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); 393 for (uint32_t i = 0; i < num_nhops; i++) { 394 notify_add(&rc_new, &wn[i], cb, cbdata); 395 } 396 break; 397 case RTM_DELETE: 398 if (!NH_IS_NHGRP(rc->rc_nh_old)) 399 return; 400 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); 401 for (uint32_t i = 0; i < num_nhops; i++) { 402 notify_del(&rc_new, &wn[i], cb, cbdata); 403 } 404 break; 405 case RTM_CHANGE: 406 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) 407 return; 408 decompose_change_notification(rc, cb, cbdata); 409 break; 410 } 411 } 412 #endif 413 414 #ifdef INET 415 /* 416 * Checks if the found key in the trie contains (<=) a prefix covering 417 * @paddr/@plen. 418 * Returns the most specific rtentry matching the condition or NULL. 419 */ 420 static struct rtentry * 421 get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen) 422 { 423 struct route_nhop_data rnd; 424 struct rtentry *rt; 425 struct in_addr addr4; 426 uint32_t scopeid; 427 int parent_plen; 428 struct radix_node *rn; 429 430 rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd); 431 if (rt == NULL) 432 return (NULL); 433 434 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 435 if (parent_plen <= plen) 436 return (rt); 437 438 /* 439 * There can be multiple prefixes associated with the found key: 440 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc. 441 * All such prefixes are linked via rn_dupedkey, from most specific 442 * to least specific. 
Iterate over them to check if any of these 443 * prefixes are wider than desired plen. 444 */ 445 rn = (struct radix_node *)rt; 446 while ((rn = rn_nextprefix(rn)) != NULL) { 447 rt = RNTORT(rn); 448 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 449 if (parent_plen <= plen) 450 return (rt); 451 } 452 453 return (NULL); 454 } 455 456 /* 457 * Returns the most specific prefix containing (>) @paddr/plen. 458 */ 459 struct rtentry * 460 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen) 461 { 462 struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST }; 463 struct in_addr addr4 = addr; 464 struct in_addr mask4; 465 struct rtentry *rt; 466 467 while (plen-- > 0) { 468 /* Calculate wider mask & new key to lookup */ 469 mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); 470 addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr)); 471 if (addr4.s_addr == lookup_addr.s_addr) { 472 /* Skip lookup if the key is the same */ 473 continue; 474 } 475 lookup_addr = addr4; 476 477 rt = get_inet_parent_prefix(fibnum, lookup_addr, plen); 478 if (rt != NULL) 479 return (rt); 480 } 481 482 return (NULL); 483 } 484 #endif 485 486 #ifdef INET6 487 /* 488 * Checks if the found key in the trie contains (<=) a prefix covering 489 * @paddr/@plen. 490 * Returns the most specific rtentry matching the condition or NULL. 
491 */ 492 static struct rtentry * 493 get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen) 494 { 495 struct route_nhop_data rnd; 496 struct rtentry *rt; 497 struct in6_addr addr6; 498 uint32_t scopeid; 499 int parent_plen; 500 struct radix_node *rn; 501 502 rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd); 503 if (rt == NULL) 504 return (NULL); 505 506 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 507 if (parent_plen <= plen) 508 return (rt); 509 510 /* 511 * There can be multiple prefixes associated with the found key: 512 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc. 513 * All such prefixes are linked via rn_dupedkey, from most specific 514 * to least specific. Iterate over them to check if any of these 515 * prefixes are wider than desired plen. 516 */ 517 rn = (struct radix_node *)rt; 518 while ((rn = rn_nextprefix(rn)) != NULL) { 519 rt = RNTORT(rn); 520 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 521 if (parent_plen <= plen) 522 return (rt); 523 } 524 525 return (NULL); 526 } 527 528 static void 529 ipv6_writemask(struct in6_addr *addr6, uint8_t mask) 530 { 531 uint32_t *cp; 532 533 for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) 534 *cp++ = 0xFFFFFFFF; 535 if (mask > 0) 536 *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); 537 } 538 539 /* 540 * Returns the most specific prefix containing (>) @paddr/plen. 
541 */ 542 struct rtentry * 543 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen) 544 { 545 struct in6_addr lookup_addr = in6mask128; 546 struct in6_addr addr6 = *paddr; 547 struct in6_addr mask6; 548 struct rtentry *rt; 549 550 while (plen-- > 0) { 551 /* Calculate wider mask & new key to lookup */ 552 ipv6_writemask(&mask6, plen); 553 IN6_MASK_ADDR(&addr6, &mask6); 554 if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) { 555 /* Skip lookup if the key is the same */ 556 continue; 557 } 558 lookup_addr = addr6; 559 560 rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen); 561 if (rt != NULL) 562 return (rt); 563 } 564 565 return (NULL); 566 } 567 #endif 568