1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include "opt_netlink.h" 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 #include "opt_inet.h" 33 #include "opt_inet6.h" 34 #include "opt_route.h" 35 #include <sys/types.h> 36 #include <sys/ck.h> 37 #include <sys/epoch.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/rmlock.h> 41 #include <sys/socket.h> 42 43 #include <net/if.h> 44 #include <net/route.h> 45 #include <net/route/nhop.h> 46 #include <net/route/nhop_utils.h> 47 48 #include <net/route/route_ctl.h> 49 #include <net/route/route_var.h> 50 #include <netinet6/scope6_var.h> 51 #include <netlink/netlink.h> 52 #include <netlink/netlink_ctl.h> 53 #include <netlink/netlink_route.h> 54 #include <netlink/route/route_var.h> 55 56 #define DEBUG_MOD_NAME nl_nhop 57 #define DEBUG_MAX_LEVEL LOG_DEBUG3 58 #include <netlink/netlink_debug.h> 59 _DECLARE_DEBUG(LOG_DEBUG); 60 61 /* 62 * This file contains the logic to maintain kernel nexthops and 63 * nexhop groups based om the data provided by the user. 64 * 65 * Kernel stores (nearly) all of the routing data in the nexthops, 66 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). 67 * 68 * Netlink API provides higher-level abstraction for the user. Each 69 * user-created nexthop may map to multiple kernel nexthops. 70 * 71 * The following variations require separate kernel nexthop to be 72 * created: 73 * * prefix flags (NHF_HOST, NHF_DEFAULT) 74 * * using IPv6 gateway for IPv4 routes 75 * * different fibnum 76 * 77 * These kernel nexthops have the lifetime bound to the lifetime of 78 * the user_nhop object. They are not collected until user requests 79 * to delete the created user_nhop. 80 * 81 */ 82 struct user_nhop { 83 uint32_t un_idx; /* Userland-provided index */ 84 uint32_t un_fibfam; /* fibnum+af(as highest byte) */ 85 uint8_t un_protocol; /* protocol that install the record */ 86 struct nhop_object *un_nhop; /* "production" nexthop */ 87 struct nhop_object *un_nhop_src; /* nexthop to copy from */ 88 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ 89 uint32_t un_nhgrp_count; /* number of nexthops */ 90 struct user_nhop *un_next; /* next item in hash chain */ 91 struct user_nhop *un_nextchild; /* master -> children */ 92 struct epoch_context un_epoch_ctx; /* epoch ctl helper */ 93 }; 94 95 /* produce hash value for an object */ 96 #define unhop_hash_obj(_obj) (hash_unhop(_obj)) 97 /* compare two objects */ 98 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) 99 /* next object accessor */ 100 #define unhop_next(_obj) (_obj)->un_next 101 102 CHT_SLIST_DEFINE(unhop, struct user_nhop); 103 104 struct unhop_ctl { 105 struct unhop_head un_head; 106 struct rmlock un_lock; 107 }; 108 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") 109 #define UN_TRACKER struct rm_priotracker un_tracker 110 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) 111 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) 112 113 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); 114 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); 115 116 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; 117 #define V_un_ctl VNET(un_ctl) 118 119 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); 120 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); 121 static unsigned int hash_unhop(const struct user_nhop *obj); 122 123 static void destroy_unhop(struct user_nhop *unhop); 124 static struct nhop_object *clone_unhop(const struct user_nhop *unhop, 125 uint32_t fibnum, int family, int nh_flags); 126 127 static int 128 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) 129 { 130 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); 131 } 132 133 /* 134 * Hash callback: calculate hash of an object 135 */ 136 static unsigned int 137 hash_unhop(const struct user_nhop *obj) 138 { 139 return (obj->un_idx ^ obj->un_fibfam); 140 } 141 142 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) 143 144 /* 145 * Factory interface for creating matching kernel nexthops/nexthop groups 146 * 147 * @uidx: userland nexhop index used to create the nexthop 148 * @fibnum: fibnum nexthop will be used in 149 * @family: upper family nexthop will be used in 150 * @nh_flags: desired nexthop prefix flags 151 * @perror: pointer to store error to 152 * 153 * Returns referenced nexthop linked to @fibnum/@family rib on success. 154 */ 155 struct nhop_object * 156 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, 157 int nh_flags, int *perror) 158 { 159 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 160 UN_TRACKER; 161 162 if (__predict_false(ctl == NULL)) 163 return (NULL); 164 165 struct user_nhop key= { 166 .un_idx = uidx, 167 .un_fibfam = fibnum | ((uint32_t)family) << 24, 168 }; 169 struct user_nhop *unhop; 170 171 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); 172 173 if (__predict_false(family == 0)) 174 return (NULL); 175 176 UN_RLOCK(ctl); 177 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 178 if (unhop != NULL) { 179 struct nhop_object *nh = unhop->un_nhop; 180 UN_RLOCK(ctl); 181 *perror = 0; 182 nhop_ref_any(nh); 183 return (nh); 184 } 185 186 /* 187 * Exact nexthop not found. Search for template nexthop to clone from. 188 */ 189 key.un_fibfam = 0; 190 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 191 if (unhop == NULL) { 192 UN_RUNLOCK(ctl); 193 *perror = ESRCH; 194 return (NULL); 195 } 196 197 UN_RUNLOCK(ctl); 198 199 /* Create entry to insert first */ 200 struct user_nhop *un_new, *un_tmp; 201 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 202 if (un_new == NULL) { 203 *perror = ENOMEM; 204 return (NULL); 205 } 206 un_new->un_idx = uidx; 207 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; 208 209 /* Relying on epoch to protect unhop here */ 210 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); 211 if (un_new->un_nhop == NULL) { 212 free(un_new, M_NETLINK); 213 *perror = ENOMEM; 214 return (NULL); 215 } 216 217 /* Insert back and report */ 218 UN_WLOCK(ctl); 219 220 /* First, find template record once again */ 221 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 222 if (unhop == NULL) { 223 /* Someone deleted the nexthop during the call */ 224 UN_WUNLOCK(ctl); 225 *perror = ESRCH; 226 destroy_unhop(un_new); 227 return (NULL); 228 } 229 230 /* Second, check the direct match */ 231 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); 232 struct nhop_object *nh; 233 if (un_tmp != NULL) { 234 /* Another thread already created the desired nextop, use it */ 235 nh = un_tmp->un_nhop; 236 } else { 237 /* Finally, insert the new nexthop and link it to the primary */ 238 nh = un_new->un_nhop; 239 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); 240 un_new->un_nextchild = unhop->un_nextchild; 241 unhop->un_nextchild = un_new; 242 un_new = NULL; 243 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); 244 } 245 246 UN_WUNLOCK(ctl); 247 248 if (un_new != NULL) 249 destroy_unhop(un_new); 250 251 *perror = 0; 252 nhop_ref_any(nh); 253 return (nh); 254 } 255 256 static struct user_nhop * 257 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) 258 { 259 struct user_nhop key= { .un_idx = uidx }; 260 struct user_nhop *unhop = NULL; 261 UN_TRACKER; 262 263 UN_RLOCK(ctl); 264 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 265 UN_RUNLOCK(ctl); 266 267 return (unhop); 268 } 269 270 #define MAX_STACK_NHOPS 4 271 static struct nhop_object * 272 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) 273 { 274 #ifdef ROUTE_MPATH 275 const struct weightened_nhop *wn; 276 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; 277 uint32_t num_nhops; 278 #endif 279 struct nhop_object *nh = NULL; 280 int error; 281 282 if (unhop->un_nhop_src != NULL) { 283 IF_DEBUG_LEVEL(LOG_DEBUG2) { 284 char nhbuf[NHOP_PRINT_BUFSIZE]; 285 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); 286 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, 287 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, 288 family, nh_flags); 289 } 290 struct nhop_object *nh; 291 nh = nhop_alloc(fibnum, AF_UNSPEC); 292 if (nh == NULL) 293 return (NULL); 294 nhop_copy(nh, unhop->un_nhop_src); 295 /* Check that nexthop gateway is compatible with the new family */ 296 if (!nhop_set_upper_family(nh, family)) { 297 nhop_free(nh); 298 return (NULL); 299 } 300 nhop_set_uidx(nh, unhop->un_idx); 301 nhop_set_pxtype_flag(nh, nh_flags); 302 return (nhop_get_nhop(nh, &error)); 303 } 304 #ifdef ROUTE_MPATH 305 wn = unhop->un_nhgrp_src; 306 num_nhops = unhop->un_nhgrp_count; 307 308 if (num_nhops > MAX_STACK_NHOPS) { 309 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); 310 if (wn_new == NULL) 311 return (NULL); 312 } else 313 wn_new = wn_base; 314 315 for (int i = 0; i < num_nhops; i++) { 316 uint32_t uidx = nhop_get_uidx(wn[i].nh); 317 MPASS(uidx != 0); 318 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); 319 if (error != 0) 320 break; 321 wn_new[i].weight = wn[i].weight; 322 } 323 324 if (error == 0) { 325 struct rib_head *rh = nhop_get_rh(wn_new[0].nh); 326 struct nhgrp_object *nhg; 327 328 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); 329 nh = (struct nhop_object *)nhg; 330 } 331 332 if (wn_new != wn_base) 333 free(wn_new, M_TEMP); 334 #endif 335 return (nh); 336 } 337 338 static void 339 destroy_unhop(struct user_nhop *unhop) 340 { 341 if (unhop->un_nhop != NULL) 342 nhop_free_any(unhop->un_nhop); 343 if (unhop->un_nhop_src != NULL) 344 nhop_free_any(unhop->un_nhop_src); 345 free(unhop, M_NETLINK); 346 } 347 348 static void 349 destroy_unhop_epoch(epoch_context_t ctx) 350 { 351 struct user_nhop *unhop; 352 353 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); 354 355 destroy_unhop(unhop); 356 } 357 358 static uint32_t 359 find_spare_uidx(struct unhop_ctl *ctl) 360 { 361 struct user_nhop *unhop, key = {}; 362 uint32_t uidx = 0; 363 UN_TRACKER; 364 365 UN_RLOCK(ctl); 366 /* This should return spare uid with 75% of 65k used in ~99/100 cases */ 367 for (int i = 0; i < 16; i++) { 368 key.un_idx = (arc4random() % 65536) + 65536 * 4; 369 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 370 if (unhop == NULL) { 371 uidx = key.un_idx; 372 break; 373 } 374 } 375 UN_RUNLOCK(ctl); 376 377 return (uidx); 378 } 379 380 381 /* 382 * Actual netlink code 383 */ 384 struct netlink_walkargs { 385 struct nl_writer *nw; 386 struct nlmsghdr hdr; 387 struct nlpcb *so; 388 int family; 389 int error; 390 int count; 391 int dumped; 392 }; 393 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem 394 395 static bool 396 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, 397 struct nl_writer *nw) 398 { 399 400 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 401 goto enomem; 402 403 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 404 nhm->nh_family = AF_UNSPEC; 405 nhm->nh_scope = 0; 406 nhm->nh_protocol = unhop->un_protocol; 407 nhm->nh_flags = 0; 408 409 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 410 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); 411 412 struct weightened_nhop *wn = unhop->un_nhgrp_src; 413 uint32_t num_nhops = unhop->un_nhgrp_count; 414 /* TODO: a better API? */ 415 int nla_len = sizeof(struct nlattr); 416 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); 417 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); 418 if (nla == NULL) 419 goto enomem; 420 nla->nla_type = NHA_GROUP; 421 nla->nla_len = nla_len; 422 for (int i = 0; i < num_nhops; i++) { 423 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; 424 grp->id = nhop_get_uidx(wn[i].nh); 425 grp->weight = wn[i].weight; 426 grp->resvd1 = 0; 427 grp->resvd2 = 0; 428 } 429 430 if (nlmsg_end(nw)) 431 return (true); 432 enomem: 433 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); 434 nlmsg_abort(nw); 435 return (false); 436 } 437 438 static bool 439 dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, 440 struct nl_writer *nw) 441 { 442 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 443 goto enomem; 444 445 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 446 ENOMEM_IF_NULL(nhm); 447 nhm->nh_family = nhop_get_neigh_family(nh); 448 nhm->nh_scope = 0; // XXX: what's that? 449 nhm->nh_protocol = nhop_get_origin(nh); 450 nhm->nh_flags = 0; 451 452 if (uidx != 0) 453 nlattr_add_u32(nw, NHA_ID, uidx); 454 if (nh->nh_flags & NHF_BLACKHOLE) { 455 nlattr_add_flag(nw, NHA_BLACKHOLE); 456 goto done; 457 } 458 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); 459 460 switch (nh->gw_sa.sa_family) { 461 #ifdef INET 462 case AF_INET: 463 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); 464 break; 465 #endif 466 #ifdef INET6 467 case AF_INET6: 468 { 469 struct in6_addr addr = nh->gw6_sa.sin6_addr; 470 in6_clearscope(&addr); 471 nlattr_add(nw, NHA_GATEWAY, 16, &addr); 472 break; 473 } 474 #endif 475 } 476 477 int off = nlattr_add_nested(nw, NHA_FREEBSD); 478 if (off != 0) { 479 nlattr_add_u32(nw, NHAF_AIF, nh->nh_aifp->if_index); 480 481 if (uidx == 0) { 482 nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); 483 nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); 484 nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); 485 } 486 487 nlattr_set_len(nw, off); 488 } 489 490 done: 491 if (nlmsg_end(nw)) 492 return (true); 493 enomem: 494 nlmsg_abort(nw); 495 return (false); 496 } 497 498 static void 499 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 500 struct nl_writer *nw) 501 { 502 if (unhop->un_nhop_src != NULL) 503 dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); 504 else 505 dump_nhgrp(unhop, hdr, nw); 506 } 507 508 static int 509 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) 510 { 511 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; 512 513 struct user_nhop key = { .un_idx = uidx }; 514 515 UN_WLOCK(ctl); 516 517 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); 518 519 if (unhop_base != NULL) { 520 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); 521 IF_DEBUG_LEVEL(LOG_DEBUG2) { 522 char nhbuf[NHOP_PRINT_BUFSIZE]; 523 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); 524 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, 525 "removed base nhop %u: %s", uidx, nhbuf); 526 } 527 /* Unlink all child nexhops as well, keeping the chain intact */ 528 unhop_chain = unhop_base->un_nextchild; 529 while (unhop_chain != NULL) { 530 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, 531 unhop_ret); 532 MPASS(unhop_chain == unhop_ret); 533 IF_DEBUG_LEVEL(LOG_DEBUG3) { 534 char nhbuf[NHOP_PRINT_BUFSIZE]; 535 nhop_print_buf_any(unhop_chain->un_nhop, 536 nhbuf, sizeof(nhbuf)); 537 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, 538 "removed child nhop %u: %s", uidx, nhbuf); 539 } 540 unhop_chain = unhop_chain->un_nextchild; 541 } 542 } 543 544 UN_WUNLOCK(ctl); 545 546 if (unhop_base == NULL) { 547 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); 548 return (ENOENT); 549 } 550 551 /* Report nexthop deletion */ 552 struct netlink_walkargs wa = { 553 .hdr.nlmsg_pid = hdr->nlmsg_pid, 554 .hdr.nlmsg_seq = hdr->nlmsg_seq, 555 .hdr.nlmsg_flags = hdr->nlmsg_flags, 556 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, 557 }; 558 559 struct nl_writer nw = {}; 560 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 561 NL_LOG(LOG_DEBUG, "error allocating message writer"); 562 return (ENOMEM); 563 } 564 565 dump_unhop(unhop_base, &wa.hdr, &nw); 566 nlmsg_flush(&nw); 567 568 while (unhop_base != NULL) { 569 unhop_chain = unhop_base->un_nextchild; 570 NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); 571 unhop_base = unhop_chain; 572 } 573 574 return (0); 575 } 576 577 static void 578 consider_resize(struct unhop_ctl *ctl, uint32_t new_size) 579 { 580 void *new_ptr = NULL; 581 size_t alloc_size; 582 583 if (new_size == 0) 584 return; 585 586 if (new_size != 0) { 587 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); 588 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 589 if (new_ptr == NULL) 590 return; 591 } 592 593 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); 594 UN_WLOCK(ctl); 595 if (new_ptr != NULL) { 596 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); 597 } 598 UN_WUNLOCK(ctl); 599 600 601 if (new_ptr != NULL) 602 free(new_ptr, M_NETLINK); 603 } 604 605 static bool __noinline 606 vnet_init_unhops(void) 607 { 608 uint32_t num_buckets = 16; 609 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); 610 611 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 612 M_NOWAIT | M_ZERO); 613 if (ctl == NULL) 614 return (false); 615 616 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 617 if (ptr == NULL) { 618 free(ctl, M_NETLINK); 619 return (false); 620 } 621 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); 622 UN_LOCK_INIT(ctl); 623 624 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { 625 free(ptr, M_NETLINK); 626 free(ctl, M_NETLINK); 627 } 628 629 if (atomic_load_ptr(&V_un_ctl) == NULL) 630 return (false); 631 632 NL_LOG(LOG_NOTICE, "UNHOPS init done"); 633 634 return (true); 635 } 636 637 static void 638 vnet_destroy_unhops(const void *unused __unused) 639 { 640 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 641 struct user_nhop *unhop, *tmp; 642 643 if (ctl == NULL) 644 return; 645 V_un_ctl = NULL; 646 647 /* Wait till all unhop users finish their reads */ 648 NET_EPOCH_WAIT(); 649 650 UN_WLOCK(ctl); 651 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { 652 destroy_unhop(unhop); 653 } CHT_SLIST_FOREACH_SAFE_END; 654 UN_WUNLOCK(ctl); 655 656 free(ctl->un_head.ptr, M_NETLINK); 657 free(ctl, M_NETLINK); 658 } 659 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, 660 vnet_destroy_unhops, NULL); 661 662 static int 663 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 664 { 665 int error = 0; 666 667 /* Verify attribute correctness */ 668 struct nexthop_grp *grp = NLA_DATA(nla); 669 int data_len = NLA_DATA_LEN(nla); 670 671 int count = data_len / sizeof(*grp); 672 if (count == 0 || (count * sizeof(*grp) != data_len)) { 673 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); 674 return (EINVAL); 675 } 676 677 *((struct nlattr **)target) = nla; 678 return (error); 679 } 680 681 struct nl_parsed_nhop { 682 uint32_t nha_id; 683 uint8_t nha_blackhole; 684 uint8_t nha_groups; 685 uint8_t nhaf_knhops; 686 uint8_t nhaf_family; 687 struct ifnet *nha_oif; 688 struct sockaddr *nha_gw; 689 struct nlattr *nha_group; 690 uint8_t nh_family; 691 uint8_t nh_protocol; 692 uint32_t nhaf_table; 693 uint32_t nhaf_kid; 694 uint32_t nhaf_aif; 695 }; 696 697 #define _IN(_field) offsetof(struct nhmsg, _field) 698 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) 699 static struct nlattr_parser nla_p_nh_fbsd[] = { 700 { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, 701 { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, 702 { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, 703 { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, 704 { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, 705 }; 706 NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); 707 708 static const struct nlfield_parser nlf_p_nh[] = { 709 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, 710 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, 711 }; 712 713 static const struct nlattr_parser nla_p_nh[] = { 714 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, 715 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, 716 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, 717 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, 718 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, 719 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, 720 { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, 721 }; 722 #undef _IN 723 #undef _OUT 724 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); 725 726 static bool 727 eligible_nhg(const struct nhop_object *nh) 728 { 729 return (nh->nh_flags & NHF_GATEWAY); 730 } 731 732 static int 733 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 734 { 735 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); 736 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); 737 struct weightened_nhop *wn; 738 739 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); 740 if (wn == NULL) 741 return (ENOMEM); 742 743 for (int i = 0; i < count; i++) { 744 struct user_nhop *unhop; 745 unhop = nl_find_base_unhop(ctl, grp[i].id); 746 if (unhop == NULL) { 747 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); 748 free(wn, M_NETLINK); 749 return (ESRCH); 750 } else if (unhop->un_nhop_src == NULL) { 751 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", 752 grp[i].id); 753 free(wn, M_NETLINK); 754 return (ENOTSUP); 755 } else if (!eligible_nhg(unhop->un_nhop_src)) { 756 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", 757 grp[i].id); 758 free(wn, M_NETLINK); 759 return (ENOTSUP); 760 } 761 /* 762 * TODO: consider more rigid eligibility checks: 763 * restrict nexthops with the same gateway 764 */ 765 wn[i].nh = unhop->un_nhop_src; 766 wn[i].weight = grp[i].weight; 767 } 768 unhop->un_nhgrp_src = wn; 769 unhop->un_nhgrp_count = count; 770 return (0); 771 } 772 773 /* 774 * Sets nexthop @nh gateway specified by @gw. 775 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to 776 * @ifp ifindex. 777 * Returns 0 on success or errno. 778 */ 779 int 780 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp, 781 struct nl_pstate *npt) 782 { 783 #ifdef INET6 784 if (gw->sa_family == AF_INET6) { 785 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; 786 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { 787 if (ifp == NULL) { 788 NLMSG_REPORT_ERR_MSG(npt, "interface not set"); 789 return (EINVAL); 790 } 791 in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index); 792 } 793 } 794 #endif 795 nhop_set_gw(nh, gw, true); 796 return (0); 797 } 798 799 static int 800 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) 801 { 802 struct ifaddr *ifa = NULL; 803 struct nhop_object *nh; 804 int error; 805 806 if (!attrs->nha_blackhole) { 807 if (attrs->nha_gw == NULL) { 808 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); 809 return (EINVAL); 810 } 811 if (attrs->nha_oif == NULL) { 812 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); 813 return (EINVAL); 814 } 815 if (ifa == NULL) 816 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); 817 if (ifa == NULL) { 818 NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); 819 return (EINVAL); 820 } 821 } 822 823 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; 824 825 nh = nhop_alloc(RT_DEFAULT_FIB, family); 826 if (nh == NULL) { 827 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); 828 return (ENOMEM); 829 } 830 nhop_set_uidx(nh, attrs->nha_id); 831 nhop_set_origin(nh, attrs->nh_protocol); 832 833 if (attrs->nha_blackhole) 834 nhop_set_blackhole(nh, NHF_BLACKHOLE); 835 else { 836 error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); 837 if (error != 0) { 838 nhop_free(nh); 839 return (error); 840 } 841 nhop_set_transmit_ifp(nh, attrs->nha_oif); 842 nhop_set_src(nh, ifa); 843 } 844 845 error = nhop_get_unlinked(nh); 846 if (error != 0) { 847 NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); 848 return (error); 849 } 850 851 IF_DEBUG_LEVEL(LOG_DEBUG2) { 852 char nhbuf[NHOP_PRINT_BUFSIZE]; 853 nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); 854 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); 855 } 856 857 unhop->un_nhop_src = nh; 858 return (0); 859 } 860 861 static int 862 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 863 struct nl_pstate *npt) 864 { 865 struct user_nhop *unhop; 866 int error; 867 868 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) 869 return (ENOMEM); 870 struct unhop_ctl *ctl = V_un_ctl; 871 872 struct nl_parsed_nhop attrs = {}; 873 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 874 if (error != 0) 875 return (error); 876 877 /* 878 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class 879 * citizen. 880 */ 881 if (attrs.nha_id == 0) { 882 attrs.nha_id = find_spare_uidx(ctl); 883 if (attrs.nha_id == 0) { 884 NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); 885 return (ENOSPC); 886 } 887 } 888 889 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); 890 891 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 892 if (unhop == NULL) { 893 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); 894 return (ENOMEM); 895 } 896 unhop->un_idx = attrs.nha_id; 897 unhop->un_protocol = attrs.nh_protocol; 898 899 if (attrs.nha_group) 900 error = newnhg(ctl, &attrs, unhop); 901 else 902 error = newnhop(&attrs, unhop, npt); 903 904 if (error != 0) { 905 free(unhop, M_NETLINK); 906 return (error); 907 } 908 909 UN_WLOCK(ctl); 910 /* Check if uidx already exists */ 911 struct user_nhop *tmp = NULL; 912 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); 913 if (tmp != NULL) { 914 UN_WUNLOCK(ctl); 915 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); 916 destroy_unhop(unhop); 917 return (EEXIST); 918 } 919 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); 920 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); 921 UN_WUNLOCK(ctl); 922 923 /* Report addition of the next nexhop */ 924 struct netlink_walkargs wa = { 925 .hdr.nlmsg_pid = hdr->nlmsg_pid, 926 .hdr.nlmsg_seq = hdr->nlmsg_seq, 927 .hdr.nlmsg_flags = hdr->nlmsg_flags, 928 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 929 }; 930 931 struct nl_writer nw = {}; 932 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 933 NL_LOG(LOG_DEBUG, "error allocating message writer"); 934 return (ENOMEM); 935 } 936 937 dump_unhop(unhop, &wa.hdr, &nw); 938 nlmsg_flush(&nw); 939 940 consider_resize(ctl, num_buckets_new); 941 942 return (0); 943 } 944 945 static int 946 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 947 struct nl_pstate *npt) 948 { 949 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 950 int error; 951 952 if (__predict_false(ctl == NULL)) 953 return (ESRCH); 954 955 struct nl_parsed_nhop attrs = {}; 956 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 957 if (error != 0) 958 return (error); 959 960 if (attrs.nha_id == 0) { 961 NL_LOG(LOG_DEBUG, "NHA_ID not set"); 962 return (EINVAL); 963 } 964 965 error = delete_unhop(ctl, hdr, attrs.nha_id); 966 967 return (error); 968 } 969 970 static bool 971 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 972 { 973 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) 974 return (false); 975 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) 976 return (false); 977 if (attrs->nha_oif != NULL && 978 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) 979 return (false); 980 981 return (true); 982 } 983 984 static int 985 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 986 struct nl_pstate *npt) 987 { 988 struct user_nhop *unhop; 989 UN_TRACKER; 990 int error; 991 992 struct nl_parsed_nhop attrs = {}; 993 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 994 if (error != 0) 995 return (error); 996 997 struct netlink_walkargs wa = { 998 .nw = npt->nw, 999 .hdr.nlmsg_pid = hdr->nlmsg_pid, 1000 .hdr.nlmsg_seq = hdr->nlmsg_seq, 1001 .hdr.nlmsg_flags = hdr->nlmsg_flags, 1002 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 1003 }; 1004 1005 if (attrs.nha_id != 0) { 1006 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 1007 struct user_nhop key = { .un_idx = attrs.nha_id }; 1008 1009 if (__predict_false(ctl == NULL)) 1010 return (ESRCH); 1011 1012 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); 1013 UN_RLOCK(ctl); 1014 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 1015 UN_RUNLOCK(ctl); 1016 1017 if (unhop == NULL) 1018 return (ESRCH); 1019 dump_unhop(unhop, &wa.hdr, wa.nw); 1020 return (0); 1021 } else if (attrs.nhaf_kid != 0) { 1022 struct nhop_iter iter = { 1023 .fibnum = attrs.nhaf_table, 1024 .family = attrs.nhaf_family, 1025 }; 1026 int error = ESRCH; 1027 1028 NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); 1029 for (struct nhop_object *nh = nhops_iter_start(&iter); nh; 1030 nh = nhops_iter_next(&iter)) { 1031 NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); 1032 if (nhop_get_idx(nh) == attrs.nhaf_kid) { 1033 dump_nhop(nh, 0, &wa.hdr, wa.nw); 1034 error = 0; 1035 break; 1036 } 1037 } 1038 nhops_iter_stop(&iter); 1039 return (error); 1040 } else if (attrs.nhaf_knhops) { 1041 struct nhop_iter iter = { 1042 .fibnum = attrs.nhaf_table, 1043 .family = attrs.nhaf_family, 1044 }; 1045 1046 NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family); 1047 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 1048 for (struct nhop_object *nh = nhops_iter_start(&iter); nh; 1049 nh = nhops_iter_next(&iter)) { 1050 dump_nhop(nh, 0, &wa.hdr, wa.nw); 1051 } 1052 nhops_iter_stop(&iter); 1053 } else { 1054 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 1055 1056 if (__predict_false(ctl == NULL)) 1057 return (ESRCH); 1058 1059 NL_LOG(LOG_DEBUG2, "DUMP unhops"); 1060 UN_RLOCK(ctl); 1061 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 1062 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { 1063 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) 1064 dump_unhop(unhop, &wa.hdr, wa.nw); 1065 } CHT_SLIST_FOREACH_END; 1066 UN_RUNLOCK(ctl); 1067 } 1068 1069 if (wa.error == 0) { 1070 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) 1071 return (ENOMEM); 1072 } 1073 return (0); 1074 } 1075 1076 static const struct rtnl_cmd_handler cmd_handlers[] = { 1077 { 1078 .cmd = NL_RTM_NEWNEXTHOP, 1079 .name = "RTM_NEWNEXTHOP", 1080 .cb = &rtnl_handle_newnhop, 1081 .priv = PRIV_NET_ROUTE, 1082 }, 1083 { 1084 .cmd = NL_RTM_DELNEXTHOP, 1085 .name = "RTM_DELNEXTHOP", 1086 .cb = &rtnl_handle_delnhop, 1087 .priv = PRIV_NET_ROUTE, 1088 }, 1089 { 1090 .cmd = NL_RTM_GETNEXTHOP, 1091 .name = "RTM_GETNEXTHOP", 1092 .cb = &rtnl_handle_getnhop, 1093 } 1094 }; 1095 1096 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; 1097 1098 void 1099 rtnl_nexthops_init(void) 1100 { 1101 NL_VERIFY_PARSERS(all_parsers); 1102 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); 1103 } 1104