1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/jail.h> 36 #include <sys/systm.h> 37 #include <sys/malloc.h> 38 #include <sys/mbuf.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 #include <sys/syslog.h> 42 #include <sys/sysproto.h> 43 #include <sys/proc.h> 44 #include <sys/domain.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/rmlock.h> 48 49 #include <net/if.h> 50 #include <net/if_var.h> 51 #include <net/if_dl.h> 52 #include <net/route.h> 53 #include <net/route/route_ctl.h> 54 #include <net/route/route_var.h> 55 #include <net/route/nhop_utils.h> 56 #include <net/route/nhop.h> 57 #include <net/route/nhop_var.h> 58 #ifdef INET 59 #include <netinet/in_fib.h> 60 #endif 61 #ifdef INET6 62 #include <netinet6/in6_fib.h> 63 #include <netinet6/in6_var.h> 64 #endif 65 #include <net/vnet.h> 66 67 /* 68 * RIB helper functions. 69 */ 70 71 void 72 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, 73 rib_walk_hook_f_t *hook_f, void *arg) 74 { 75 if (hook_f != NULL) 76 hook_f(rnh, RIB_WALK_HOOK_PRE, arg); 77 rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg); 78 if (hook_f != NULL) 79 hook_f(rnh, RIB_WALK_HOOK_POST, arg); 80 } 81 82 /* 83 * Calls @wa_f with @arg for each entry in the table specified by 84 * @af and @fibnum. 85 * 86 * @ss_t callback is called before and after the tree traversal 87 * while holding table lock. 88 * 89 * Table is traversed under read lock unless @wlock is set. 90 */ 91 void 92 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, 93 rib_walk_hook_f_t *hook_f, void *arg) 94 { 95 RIB_RLOCK_TRACKER; 96 97 if (wlock) 98 RIB_WLOCK(rnh); 99 else 100 RIB_RLOCK(rnh); 101 rib_walk_ext_locked(rnh, wa_f, hook_f, arg); 102 if (wlock) 103 RIB_WUNLOCK(rnh); 104 else 105 RIB_RUNLOCK(rnh); 106 } 107 108 void 109 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 110 rib_walk_hook_f_t *hook_f, void *arg) 111 { 112 struct rib_head *rnh; 113 114 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) 115 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg); 116 } 117 118 /* 119 * Calls @wa_f with @arg for each entry in the table specified by 120 * @af and @fibnum. 121 * 122 * Table is traversed under read lock unless @wlock is set. 123 */ 124 void 125 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f, 126 void *arg) 127 { 128 129 rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg); 130 } 131 132 /* 133 * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask. 134 * 135 * The following flags are supported: 136 * RIB_FLAG_WLOCK: acquire exclusive lock 137 * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking 138 * 139 * By default, table is traversed under read lock. 140 */ 141 void 142 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix, 143 struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg) 144 { 145 RIB_RLOCK_TRACKER; 146 struct rib_head *rnh = rt_tables_get_rnh(fibnum, family); 147 148 if (rnh == NULL) 149 return; 150 151 if (flags & RIB_FLAG_WLOCK) 152 RIB_WLOCK(rnh); 153 else if (!(flags & RIB_FLAG_LOCKED)) 154 RIB_RLOCK(rnh); 155 156 rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg); 157 158 if (flags & RIB_FLAG_WLOCK) 159 RIB_WUNLOCK(rnh); 160 else if (!(flags & RIB_FLAG_LOCKED)) 161 RIB_RUNLOCK(rnh); 162 } 163 164 /* 165 * Iterates over all existing fibs in system calling 166 * @hook_f function before/after traversing each fib. 167 * Calls @wa_f function for each element in current fib. 168 * If af is not AF_UNSPEC, iterates over fibs in particular 169 * address family. 170 */ 171 void 172 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f, 173 rib_walk_hook_f_t *hook_f, void *arg) 174 { 175 176 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 177 /* Do we want some specific family? */ 178 if (family != AF_UNSPEC) { 179 rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); 180 continue; 181 } 182 183 for (int i = 1; i <= AF_MAX; i++) 184 rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg); 185 } 186 } 187 188 /* 189 * Iterates over all existing fibs in system and deletes each element 190 * for which @filter_f function returns non-zero value. 191 * If @family is not AF_UNSPEC, iterates over fibs in particular 192 * address family. 193 */ 194 void 195 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg) 196 { 197 198 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { 199 /* Do we want some specific family? */ 200 if (family != AF_UNSPEC) { 201 rib_walk_del(fibnum, family, filter_f, arg, 0); 202 continue; 203 } 204 205 for (int i = 1; i <= AF_MAX; i++) 206 rib_walk_del(fibnum, i, filter_f, arg, 0); 207 } 208 } 209 210 211 /* 212 * Wrapper for the control plane functions for performing af-agnostic 213 * lookups. 214 * @fibnum: fib to perform the lookup. 215 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in 216 * deembedded from. 217 * @flags: fib(9) flags. 218 * @flowid: flow id for path selection in multipath use case. 219 * 220 * Returns nhop_object or NULL. 221 * 222 * Requires NET_EPOCH. 223 * 224 */ 225 struct nhop_object * 226 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, 227 uint32_t flowid) 228 { 229 struct nhop_object *nh; 230 231 nh = NULL; 232 233 switch (dst->sa_family) { 234 #ifdef INET 235 case AF_INET: 236 { 237 const struct sockaddr_in *a = (const struct sockaddr_in *)dst; 238 nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid); 239 break; 240 } 241 #endif 242 #ifdef INET6 243 case AF_INET6: 244 { 245 const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst; 246 nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id, 247 flags, flowid); 248 break; 249 } 250 #endif 251 } 252 253 return (nh); 254 } 255 256 #ifdef ROUTE_MPATH 257 static void 258 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb, 259 void *cbdata) 260 { 261 uint32_t num_old, num_new; 262 uint32_t nh_idx_old, nh_idx_new; 263 struct weightened_nhop *wn_old, *wn_new; 264 struct weightened_nhop tmp = { NULL, 0 }; 265 uint32_t idx_old = 0, idx_new = 0; 266 267 struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt }; 268 struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt }; 269 270 if (NH_IS_NHGRP(rc->rc_nh_old)) { 271 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old); 272 } else { 273 tmp.nh = rc->rc_nh_old; 274 tmp.weight = rc->rc_nh_weight; 275 wn_old = &tmp; 276 num_old = 1; 277 } 278 if (NH_IS_NHGRP(rc->rc_nh_new)) { 279 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new); 280 } else { 281 tmp.nh = rc->rc_nh_new; 282 tmp.weight = rc->rc_nh_weight; 283 wn_new = &tmp; 284 num_new = 1; 285 } 286 287 /* Use the fact that each @wn array is sorted */ 288 /* 289 * Want to convert into set of add and delete operations 290 * [1] -> [1, 2] = A{2} 291 * [2] -> [1, 2] = A{1} 292 * [1, 2, 4]->[1, 3, 4] = A{2}, D{3} 293 * [1, 2, 4]->[1, 4] = D{2} 294 * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3} 295 * [1, 2] -> [3, 4] = 296 * 297 */ 298 idx_old = 0; 299 while ((idx_old < num_old) && (idx_new < num_new)) { 300 nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx; 301 nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx; 302 303 if (nh_idx_old == nh_idx_new) { 304 if (wn_old[idx_old].weight != wn_new[idx_new].weight) { 305 /* Update weight by providing del/add notifications */ 306 rc_del.rc_nh_old = wn_old[idx_old].nh; 307 rc_del.rc_nh_weight = wn_old[idx_old].weight; 308 cb(&rc_del, cbdata); 309 310 rc_add.rc_nh_new = wn_new[idx_new].nh; 311 rc_add.rc_nh_weight = wn_new[idx_new].weight; 312 cb(&rc_add, cbdata); 313 } 314 idx_old++; 315 idx_new++; 316 } else if (nh_idx_old < nh_idx_new) { 317 /* 318 * [1, ~2~, 4], [1, ~3~, 4] 319 * [1, ~2~, 5], [1, ~3~, 4] 320 * [1, ~2~], [1, ~3~, 4] 321 */ 322 if ((idx_old + 1 >= num_old) || 323 (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) { 324 /* Add new unless the next old item is still <= new */ 325 rc_add.rc_nh_new = wn_new[idx_new].nh; 326 rc_add.rc_nh_weight = wn_new[idx_new].weight; 327 cb(&rc_add, cbdata); 328 idx_new++; 329 } 330 /* In any case, delete current old */ 331 rc_del.rc_nh_old = wn_old[idx_old].nh; 332 rc_del.rc_nh_weight = wn_old[idx_old].weight; 333 cb(&rc_del, cbdata); 334 idx_old++; 335 } else { 336 /* 337 * nh_idx_old > nh_idx_new 338 * 339 * [1, ~3~, 4], [1, ~2~, 4] 340 * [1, ~3~, 5], [1, ~2~, 4] 341 * [1, ~3~, 4], [1, ~2~] 342 */ 343 if ((idx_new + 1 >= num_new) || 344 (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) { 345 /* No next item or next item is > current one */ 346 rc_add.rc_nh_new = wn_new[idx_new].nh; 347 rc_add.rc_nh_weight = wn_new[idx_new].weight; 348 cb(&rc_add, cbdata); 349 idx_new++; 350 } 351 /* In any case, delete current old */ 352 rc_del.rc_nh_old = wn_old[idx_old].nh; 353 rc_del.rc_nh_weight = wn_old[idx_old].weight; 354 cb(&rc_del, cbdata); 355 idx_old++; 356 } 357 } 358 359 while (idx_old < num_old) { 360 rc_del.rc_nh_old = wn_old[idx_old].nh; 361 rc_del.rc_nh_weight = wn_old[idx_old].weight; 362 cb(&rc_del, cbdata); 363 idx_old++; 364 } 365 366 while (idx_new < num_new) { 367 rc_add.rc_nh_new = wn_new[idx_new].nh; 368 rc_add.rc_nh_weight = wn_new[idx_new].weight; 369 cb(&rc_add, cbdata); 370 idx_new++; 371 } 372 } 373 374 /* 375 * Decompose multipath cmd info @rc into a list of add/del/change 376 * single-path operations, calling @cb callback for each operation. 377 * Assumes at least one of the nexthops in @rc is multipath. 378 */ 379 void 380 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb, 381 void *cbdata) 382 { 383 struct weightened_nhop *wn; 384 uint32_t num_nhops; 385 struct rib_cmd_info rc_new; 386 387 rc_new = *rc; 388 DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p", 389 cb, rc->cmd, rc->nh_old, rc->nh_new); 390 switch (rc->rc_cmd) { 391 case RTM_ADD: 392 if (!NH_IS_NHGRP(rc->rc_nh_new)) 393 return; 394 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops); 395 for (uint32_t i = 0; i < num_nhops; i++) { 396 rc_new.rc_nh_new = wn[i].nh; 397 rc_new.rc_nh_weight = wn[i].weight; 398 cb(&rc_new, cbdata); 399 } 400 break; 401 case RTM_DELETE: 402 if (!NH_IS_NHGRP(rc->rc_nh_old)) 403 return; 404 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops); 405 for (uint32_t i = 0; i < num_nhops; i++) { 406 rc_new.rc_nh_old = wn[i].nh; 407 rc_new.rc_nh_weight = wn[i].weight; 408 cb(&rc_new, cbdata); 409 } 410 break; 411 case RTM_CHANGE: 412 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new)) 413 return; 414 decompose_change_notification(rc, cb, cbdata); 415 break; 416 } 417 } 418 #endif 419 420 #ifdef INET 421 /* 422 * Checks if the found key in the trie contains (<=) a prefix covering 423 * @paddr/@plen. 424 * Returns the most specific rtentry matching the condition or NULL. 425 */ 426 static struct rtentry * 427 get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen) 428 { 429 struct route_nhop_data rnd; 430 struct rtentry *rt; 431 struct in_addr addr4; 432 uint32_t scopeid; 433 int parent_plen; 434 struct radix_node *rn; 435 436 rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd); 437 if (rt == NULL) 438 return (NULL); 439 440 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 441 if (parent_plen <= plen) 442 return (rt); 443 444 /* 445 * There can be multiple prefixes associated with the found key: 446 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc. 447 * All such prefixes are linked via rn_dupedkey, from most specific 448 * to least specific. Iterate over them to check if any of these 449 * prefixes are wider than desired plen. 450 */ 451 rn = (struct radix_node *)rt; 452 while ((rn = rn_nextprefix(rn)) != NULL) { 453 rt = RNTORT(rn); 454 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid); 455 if (parent_plen <= plen) 456 return (rt); 457 } 458 459 return (NULL); 460 } 461 462 /* 463 * Returns the most specific prefix containing (>) @paddr/plen. 464 */ 465 struct rtentry * 466 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen) 467 { 468 struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST }; 469 struct in_addr addr4 = addr; 470 struct in_addr mask4; 471 struct rtentry *rt; 472 473 while (plen-- > 0) { 474 /* Calculate wider mask & new key to lookup */ 475 mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); 476 addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr)); 477 if (addr4.s_addr == lookup_addr.s_addr) { 478 /* Skip lookup if the key is the same */ 479 continue; 480 } 481 lookup_addr = addr4; 482 483 rt = get_inet_parent_prefix(fibnum, lookup_addr, plen); 484 if (rt != NULL) 485 return (rt); 486 } 487 488 return (NULL); 489 } 490 #endif 491 492 #ifdef INET6 493 /* 494 * Checks if the found key in the trie contains (<=) a prefix covering 495 * @paddr/@plen. 496 * Returns the most specific rtentry matching the condition or NULL. 497 */ 498 static struct rtentry * 499 get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen) 500 { 501 struct route_nhop_data rnd; 502 struct rtentry *rt; 503 struct in6_addr addr6; 504 uint32_t scopeid; 505 int parent_plen; 506 struct radix_node *rn; 507 508 rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd); 509 if (rt == NULL) 510 return (NULL); 511 512 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 513 if (parent_plen <= plen) 514 return (rt); 515 516 /* 517 * There can be multiple prefixes associated with the found key: 518 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc. 519 * All such prefixes are linked via rn_dupedkey, from most specific 520 * to least specific. Iterate over them to check if any of these 521 * prefixes are wider than desired plen. 522 */ 523 rn = (struct radix_node *)rt; 524 while ((rn = rn_nextprefix(rn)) != NULL) { 525 rt = RNTORT(rn); 526 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid); 527 if (parent_plen <= plen) 528 return (rt); 529 } 530 531 return (NULL); 532 } 533 534 static void 535 ipv6_writemask(struct in6_addr *addr6, uint8_t mask) 536 { 537 uint32_t *cp; 538 539 for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) 540 *cp++ = 0xFFFFFFFF; 541 if (mask > 0) 542 *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); 543 } 544 545 /* 546 * Returns the most specific prefix containing (>) @paddr/plen. 547 */ 548 struct rtentry * 549 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen) 550 { 551 struct in6_addr lookup_addr = in6mask128; 552 struct in6_addr addr6 = *paddr; 553 struct in6_addr mask6; 554 struct rtentry *rt; 555 556 while (plen-- > 0) { 557 /* Calculate wider mask & new key to lookup */ 558 ipv6_writemask(&mask6, plen); 559 IN6_MASK_ADDR(&addr6, &mask6); 560 if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) { 561 /* Skip lookup if the key is the same */ 562 continue; 563 } 564 lookup_addr = addr6; 565 566 rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen); 567 if (rt != NULL) 568 return (rt); 569 } 570 571 return (NULL); 572 } 573 #endif 574