1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include "opt_netlink.h" 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 #include "opt_inet.h" 33 #include "opt_inet6.h" 34 #include "opt_route.h" 35 #include <sys/types.h> 36 #include <sys/ck.h> 37 #include <sys/epoch.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/rmlock.h> 41 #include <sys/socket.h> 42 43 #include <net/if.h> 44 #include <net/route.h> 45 #include <net/route/nhop.h> 46 #include <net/route/nhop_utils.h> 47 48 #include <net/route/route_ctl.h> 49 #include <net/route/route_var.h> 50 #include <netinet6/scope6_var.h> 51 #include <netlink/netlink.h> 52 #include <netlink/netlink_ctl.h> 53 #include <netlink/netlink_route.h> 54 #include <netlink/route/route_var.h> 55 56 #define DEBUG_MOD_NAME nl_nhop 57 #define DEBUG_MAX_LEVEL LOG_DEBUG3 58 #include <netlink/netlink_debug.h> 59 _DECLARE_DEBUG(LOG_DEBUG); 60 61 /* 62 * This file contains the logic to maintain kernel nexthops and 63 * nexhop groups based om the data provided by the user. 64 * 65 * Kernel stores (nearly) all of the routing data in the nexthops, 66 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). 67 * 68 * Netlink API provides higher-level abstraction for the user. Each 69 * user-created nexthop may map to multiple kernel nexthops. 70 * 71 * The following variations require separate kernel nexthop to be 72 * created: 73 * * prefix flags (NHF_HOST, NHF_DEFAULT) 74 * * using IPv6 gateway for IPv4 routes 75 * * different fibnum 76 * 77 * These kernel nexthops have the lifetime bound to the lifetime of 78 * the user_nhop object. They are not collected until user requests 79 * to delete the created user_nhop. 80 * 81 */ 82 struct user_nhop { 83 uint32_t un_idx; /* Userland-provided index */ 84 uint32_t un_fibfam; /* fibnum+af(as highest byte) */ 85 uint8_t un_protocol; /* protocol that install the record */ 86 struct nhop_object *un_nhop; /* "production" nexthop */ 87 struct nhop_object *un_nhop_src; /* nexthop to copy from */ 88 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ 89 uint32_t un_nhgrp_count; /* number of nexthops */ 90 struct user_nhop *un_next; /* next item in hash chain */ 91 struct user_nhop *un_nextchild; /* master -> children */ 92 struct epoch_context un_epoch_ctx; /* epoch ctl helper */ 93 }; 94 95 /* produce hash value for an object */ 96 #define unhop_hash_obj(_obj) (hash_unhop(_obj)) 97 /* compare two objects */ 98 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) 99 /* next object accessor */ 100 #define unhop_next(_obj) (_obj)->un_next 101 102 CHT_SLIST_DEFINE(unhop, struct user_nhop); 103 104 struct unhop_ctl { 105 struct unhop_head un_head; 106 struct rmlock un_lock; 107 }; 108 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") 109 #define UN_TRACKER struct rm_priotracker un_tracker 110 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) 111 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) 112 113 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); 114 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); 115 116 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; 117 #define V_un_ctl VNET(un_ctl) 118 119 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); 120 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); 121 static unsigned int hash_unhop(const struct user_nhop *obj); 122 123 static void destroy_unhop(struct user_nhop *unhop); 124 static struct nhop_object *clone_unhop(const struct user_nhop *unhop, 125 uint32_t fibnum, int family, int nh_flags); 126 127 static int 128 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) 129 { 130 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); 131 } 132 133 /* 134 * Hash callback: calculate hash of an object 135 */ 136 static unsigned int 137 hash_unhop(const struct user_nhop *obj) 138 { 139 return (obj->un_idx ^ obj->un_fibfam); 140 } 141 142 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) 143 144 /* 145 * Factory interface for creating matching kernel nexthops/nexthop groups 146 * 147 * @uidx: userland nexhop index used to create the nexthop 148 * @fibnum: fibnum nexthop will be used in 149 * @family: upper family nexthop will be used in 150 * @nh_flags: desired nexthop prefix flags 151 * @perror: pointer to store error to 152 * 153 * Returns referenced nexthop linked to @fibnum/@family rib on success. 154 */ 155 struct nhop_object * 156 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, 157 int nh_flags, int *perror) 158 { 159 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 160 UN_TRACKER; 161 162 if (__predict_false(ctl == NULL)) 163 return (NULL); 164 165 struct user_nhop key= { 166 .un_idx = uidx, 167 .un_fibfam = fibnum | ((uint32_t)family) << 24, 168 }; 169 struct user_nhop *unhop; 170 171 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); 172 173 if (__predict_false(family == 0)) 174 return (NULL); 175 176 UN_RLOCK(ctl); 177 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 178 if (unhop != NULL) { 179 struct nhop_object *nh = unhop->un_nhop; 180 UN_RLOCK(ctl); 181 *perror = 0; 182 nhop_ref_any(nh); 183 return (nh); 184 } 185 186 /* 187 * Exact nexthop not found. Search for template nexthop to clone from. 188 */ 189 key.un_fibfam = 0; 190 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 191 if (unhop == NULL) { 192 UN_RUNLOCK(ctl); 193 *perror = ESRCH; 194 return (NULL); 195 } 196 197 UN_RUNLOCK(ctl); 198 199 /* Create entry to insert first */ 200 struct user_nhop *un_new, *un_tmp; 201 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 202 if (un_new == NULL) { 203 *perror = ENOMEM; 204 return (NULL); 205 } 206 un_new->un_idx = uidx; 207 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; 208 209 /* Relying on epoch to protect unhop here */ 210 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); 211 if (un_new->un_nhop == NULL) { 212 free(un_new, M_NETLINK); 213 *perror = ENOMEM; 214 return (NULL); 215 } 216 217 /* Insert back and report */ 218 UN_WLOCK(ctl); 219 220 /* First, find template record once again */ 221 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 222 if (unhop == NULL) { 223 /* Someone deleted the nexthop during the call */ 224 UN_WUNLOCK(ctl); 225 *perror = ESRCH; 226 destroy_unhop(un_new); 227 return (NULL); 228 } 229 230 /* Second, check the direct match */ 231 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); 232 struct nhop_object *nh; 233 if (un_tmp != NULL) { 234 /* Another thread already created the desired nextop, use it */ 235 nh = un_tmp->un_nhop; 236 } else { 237 /* Finally, insert the new nexthop and link it to the primary */ 238 nh = un_new->un_nhop; 239 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); 240 un_new->un_nextchild = unhop->un_nextchild; 241 unhop->un_nextchild = un_new; 242 un_new = NULL; 243 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); 244 } 245 246 UN_WUNLOCK(ctl); 247 248 if (un_new != NULL) 249 destroy_unhop(un_new); 250 251 *perror = 0; 252 nhop_ref_any(nh); 253 return (nh); 254 } 255 256 static struct user_nhop * 257 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) 258 { 259 struct user_nhop key= { .un_idx = uidx }; 260 struct user_nhop *unhop = NULL; 261 UN_TRACKER; 262 263 UN_RLOCK(ctl); 264 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 265 UN_RUNLOCK(ctl); 266 267 return (unhop); 268 } 269 270 #define MAX_STACK_NHOPS 4 271 static struct nhop_object * 272 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) 273 { 274 #ifdef ROUTE_MPATH 275 const struct weightened_nhop *wn; 276 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; 277 uint32_t num_nhops; 278 #endif 279 struct nhop_object *nh = NULL; 280 int error; 281 282 if (unhop->un_nhop_src != NULL) { 283 IF_DEBUG_LEVEL(LOG_DEBUG2) { 284 char nhbuf[NHOP_PRINT_BUFSIZE]; 285 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); 286 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, 287 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, 288 family, nh_flags); 289 } 290 struct nhop_object *nh; 291 nh = nhop_alloc(fibnum, AF_UNSPEC); 292 if (nh == NULL) 293 return (NULL); 294 nhop_copy(nh, unhop->un_nhop_src); 295 /* Check that nexthop gateway is compatible with the new family */ 296 if (!nhop_set_upper_family(nh, family)) { 297 nhop_free(nh); 298 return (NULL); 299 } 300 nhop_set_uidx(nh, unhop->un_idx); 301 nhop_set_pxtype_flag(nh, nh_flags); 302 return (nhop_get_nhop(nh, &error)); 303 } 304 #ifdef ROUTE_MPATH 305 wn = unhop->un_nhgrp_src; 306 num_nhops = unhop->un_nhgrp_count; 307 308 if (num_nhops > MAX_STACK_NHOPS) { 309 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); 310 if (wn_new == NULL) 311 return (NULL); 312 } else 313 wn_new = wn_base; 314 315 for (int i = 0; i < num_nhops; i++) { 316 uint32_t uidx = nhop_get_uidx(wn[i].nh); 317 MPASS(uidx != 0); 318 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); 319 if (error != 0) 320 break; 321 wn_new[i].weight = wn[i].weight; 322 } 323 324 if (error == 0) { 325 struct rib_head *rh = nhop_get_rh(wn_new[0].nh); 326 struct nhgrp_object *nhg; 327 328 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); 329 nh = (struct nhop_object *)nhg; 330 } 331 332 if (wn_new != wn_base) 333 free(wn_new, M_TEMP); 334 #endif 335 return (nh); 336 } 337 338 static void 339 destroy_unhop(struct user_nhop *unhop) 340 { 341 if (unhop->un_nhop != NULL) 342 nhop_free_any(unhop->un_nhop); 343 if (unhop->un_nhop_src != NULL) 344 nhop_free_any(unhop->un_nhop_src); 345 free(unhop, M_NETLINK); 346 } 347 348 static void 349 destroy_unhop_epoch(epoch_context_t ctx) 350 { 351 struct user_nhop *unhop; 352 353 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); 354 355 destroy_unhop(unhop); 356 } 357 358 static uint32_t 359 find_spare_uidx(struct unhop_ctl *ctl) 360 { 361 struct user_nhop *unhop, key = {}; 362 uint32_t uidx = 0; 363 UN_TRACKER; 364 365 UN_RLOCK(ctl); 366 /* This should return spare uid with 75% of 65k used in ~99/100 cases */ 367 for (int i = 0; i < 16; i++) { 368 key.un_idx = (arc4random() % 65536) + 65536 * 4; 369 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 370 if (unhop == NULL) { 371 uidx = key.un_idx; 372 break; 373 } 374 } 375 UN_RUNLOCK(ctl); 376 377 return (uidx); 378 } 379 380 381 /* 382 * Actual netlink code 383 */ 384 struct netlink_walkargs { 385 struct nl_writer *nw; 386 struct nlmsghdr hdr; 387 struct nlpcb *so; 388 int family; 389 int error; 390 int count; 391 int dumped; 392 }; 393 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem 394 395 static bool 396 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, 397 struct nl_writer *nw) 398 { 399 400 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 401 goto enomem; 402 403 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 404 nhm->nh_family = AF_UNSPEC; 405 nhm->nh_scope = 0; 406 nhm->nh_protocol = unhop->un_protocol; 407 nhm->nh_flags = 0; 408 409 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 410 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); 411 412 struct weightened_nhop *wn = unhop->un_nhgrp_src; 413 uint32_t num_nhops = unhop->un_nhgrp_count; 414 /* TODO: a better API? */ 415 int nla_len = sizeof(struct nlattr); 416 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); 417 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); 418 if (nla == NULL) 419 goto enomem; 420 nla->nla_type = NHA_GROUP; 421 nla->nla_len = nla_len; 422 for (int i = 0; i < num_nhops; i++) { 423 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; 424 grp->id = nhop_get_uidx(wn[i].nh); 425 grp->weight = wn[i].weight; 426 grp->resvd1 = 0; 427 grp->resvd2 = 0; 428 } 429 430 if (nlmsg_end(nw)) 431 return (true); 432 enomem: 433 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); 434 nlmsg_abort(nw); 435 return (false); 436 } 437 438 static bool 439 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 440 struct nl_writer *nw) 441 { 442 struct nhop_object *nh = unhop->un_nhop_src; 443 444 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 445 goto enomem; 446 447 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 448 ENOMEM_IF_NULL(nhm); 449 nhm->nh_family = nhop_get_neigh_family(nh); 450 nhm->nh_scope = 0; // XXX: what's that? 451 nhm->nh_protocol = unhop->un_protocol; 452 nhm->nh_flags = 0; 453 454 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 455 if (nh->nh_flags & NHF_BLACKHOLE) { 456 nlattr_add_flag(nw, NHA_BLACKHOLE); 457 goto done; 458 } 459 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); 460 461 switch (nh->gw_sa.sa_family) { 462 #ifdef INET 463 case AF_INET: 464 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); 465 break; 466 #endif 467 #ifdef INET6 468 case AF_INET6: 469 { 470 struct in6_addr addr = nh->gw6_sa.sin6_addr; 471 in6_clearscope(&addr); 472 nlattr_add(nw, NHA_GATEWAY, 16, &addr); 473 break; 474 } 475 #endif 476 } 477 478 done: 479 if (nlmsg_end(nw)) 480 return (true); 481 enomem: 482 nlmsg_abort(nw); 483 return (false); 484 } 485 486 static void 487 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 488 struct nl_writer *nw) 489 { 490 if (unhop->un_nhop_src != NULL) 491 dump_nhop(unhop, hdr, nw); 492 else 493 dump_nhgrp(unhop, hdr, nw); 494 } 495 496 static int 497 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) 498 { 499 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; 500 501 struct user_nhop key = { .un_idx = uidx }; 502 503 UN_WLOCK(ctl); 504 505 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); 506 507 if (unhop_base != NULL) { 508 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); 509 IF_DEBUG_LEVEL(LOG_DEBUG2) { 510 char nhbuf[NHOP_PRINT_BUFSIZE]; 511 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); 512 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, 513 "removed base nhop %u: %s", uidx, nhbuf); 514 } 515 /* Unlink all child nexhops as well, keeping the chain intact */ 516 unhop_chain = unhop_base->un_nextchild; 517 while (unhop_chain != NULL) { 518 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, 519 unhop_ret); 520 MPASS(unhop_chain == unhop_ret); 521 IF_DEBUG_LEVEL(LOG_DEBUG3) { 522 char nhbuf[NHOP_PRINT_BUFSIZE]; 523 nhop_print_buf_any(unhop_chain->un_nhop, 524 nhbuf, sizeof(nhbuf)); 525 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, 526 "removed child nhop %u: %s", uidx, nhbuf); 527 } 528 unhop_chain = unhop_chain->un_nextchild; 529 } 530 } 531 532 UN_WUNLOCK(ctl); 533 534 if (unhop_base == NULL) { 535 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); 536 return (ENOENT); 537 } 538 539 /* Report nexthop deletion */ 540 struct netlink_walkargs wa = { 541 .hdr.nlmsg_pid = hdr->nlmsg_pid, 542 .hdr.nlmsg_seq = hdr->nlmsg_seq, 543 .hdr.nlmsg_flags = hdr->nlmsg_flags, 544 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, 545 }; 546 547 struct nl_writer nw = {}; 548 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 549 NL_LOG(LOG_DEBUG, "error allocating message writer"); 550 return (ENOMEM); 551 } 552 553 dump_unhop(unhop_base, &wa.hdr, &nw); 554 nlmsg_flush(&nw); 555 556 while (unhop_base != NULL) { 557 unhop_chain = unhop_base->un_nextchild; 558 NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); 559 unhop_base = unhop_chain; 560 } 561 562 return (0); 563 } 564 565 static void 566 consider_resize(struct unhop_ctl *ctl, uint32_t new_size) 567 { 568 void *new_ptr = NULL; 569 size_t alloc_size; 570 571 if (new_size == 0) 572 return; 573 574 if (new_size != 0) { 575 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); 576 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 577 if (new_ptr == NULL) 578 return; 579 } 580 581 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); 582 UN_WLOCK(ctl); 583 if (new_ptr != NULL) { 584 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); 585 } 586 UN_WUNLOCK(ctl); 587 588 589 if (new_ptr != NULL) 590 free(new_ptr, M_NETLINK); 591 } 592 593 static bool __noinline 594 vnet_init_unhops(void) 595 { 596 uint32_t num_buckets = 16; 597 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); 598 599 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 600 M_NOWAIT | M_ZERO); 601 if (ctl == NULL) 602 return (false); 603 604 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 605 if (ptr == NULL) { 606 free(ctl, M_NETLINK); 607 return (false); 608 } 609 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); 610 UN_LOCK_INIT(ctl); 611 612 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { 613 free(ptr, M_NETLINK); 614 free(ctl, M_NETLINK); 615 } 616 617 if (atomic_load_ptr(&V_un_ctl) == NULL) 618 return (false); 619 620 NL_LOG(LOG_NOTICE, "UNHOPS init done"); 621 622 return (true); 623 } 624 625 static void 626 vnet_destroy_unhops(const void *unused __unused) 627 { 628 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 629 struct user_nhop *unhop, *tmp; 630 631 if (ctl == NULL) 632 return; 633 V_un_ctl = NULL; 634 635 /* Wait till all unhop users finish their reads */ 636 NET_EPOCH_WAIT(); 637 638 UN_WLOCK(ctl); 639 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { 640 destroy_unhop(unhop); 641 } CHT_SLIST_FOREACH_SAFE_END; 642 UN_WUNLOCK(ctl); 643 644 free(ctl->un_head.ptr, M_NETLINK); 645 free(ctl, M_NETLINK); 646 } 647 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, 648 vnet_destroy_unhops, NULL); 649 650 static int 651 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 652 { 653 int error = 0; 654 655 /* Verify attribute correctness */ 656 struct nexthop_grp *grp = NLA_DATA(nla); 657 int data_len = NLA_DATA_LEN(nla); 658 659 int count = data_len / sizeof(*grp); 660 if (count == 0 || (count * sizeof(*grp) != data_len)) { 661 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); 662 return (EINVAL); 663 } 664 665 *((struct nlattr **)target) = nla; 666 return (error); 667 } 668 669 struct nl_parsed_nhop { 670 uint32_t nha_id; 671 uint8_t nha_blackhole; 672 uint8_t nha_groups; 673 struct ifnet *nha_oif; 674 struct sockaddr *nha_gw; 675 struct nlattr *nha_group; 676 uint8_t nh_family; 677 uint8_t nh_protocol; 678 }; 679 680 #define _IN(_field) offsetof(struct nhmsg, _field) 681 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) 682 static const struct nlfield_parser nlf_p_nh[] = { 683 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, 684 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, 685 }; 686 687 static const struct nlattr_parser nla_p_nh[] = { 688 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, 689 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, 690 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, 691 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, 692 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, 693 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, 694 }; 695 #undef _IN 696 #undef _OUT 697 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); 698 699 static bool 700 eligible_nhg(const struct nhop_object *nh) 701 { 702 return (nh->nh_flags & NHF_GATEWAY); 703 } 704 705 static int 706 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 707 { 708 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); 709 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); 710 struct weightened_nhop *wn; 711 712 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); 713 if (wn == NULL) 714 return (ENOMEM); 715 716 for (int i = 0; i < count; i++) { 717 struct user_nhop *unhop; 718 unhop = nl_find_base_unhop(ctl, grp[i].id); 719 if (unhop == NULL) { 720 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); 721 free(wn, M_NETLINK); 722 return (ESRCH); 723 } else if (unhop->un_nhop_src == NULL) { 724 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", 725 grp[i].id); 726 free(wn, M_NETLINK); 727 return (ENOTSUP); 728 } else if (!eligible_nhg(unhop->un_nhop_src)) { 729 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", 730 grp[i].id); 731 free(wn, M_NETLINK); 732 return (ENOTSUP); 733 } 734 /* 735 * TODO: consider more rigid eligibility checks: 736 * restrict nexthops with the same gateway 737 */ 738 wn[i].nh = unhop->un_nhop_src; 739 wn[i].weight = grp[i].weight; 740 } 741 unhop->un_nhgrp_src = wn; 742 unhop->un_nhgrp_count = count; 743 return (0); 744 } 745 746 /* 747 * Sets nexthop @nh gateway specified by @gw. 748 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to 749 * @ifp ifindex. 750 * Returns 0 on success or errno. 751 */ 752 int 753 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp, 754 struct nl_pstate *npt) 755 { 756 #ifdef INET6 757 if (gw->sa_family == AF_INET6) { 758 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; 759 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { 760 if (ifp == NULL) { 761 NLMSG_REPORT_ERR_MSG(npt, "interface not set"); 762 return (EINVAL); 763 } 764 in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index); 765 } 766 } 767 #endif 768 nhop_set_gw(nh, gw, true); 769 return (0); 770 } 771 772 static int 773 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) 774 { 775 struct ifaddr *ifa = NULL; 776 struct nhop_object *nh; 777 int error; 778 779 if (!attrs->nha_blackhole) { 780 if (attrs->nha_gw == NULL) { 781 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); 782 return (EINVAL); 783 } 784 if (attrs->nha_oif == NULL) { 785 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); 786 return (EINVAL); 787 } 788 if (ifa == NULL) 789 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); 790 if (ifa == NULL) { 791 NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); 792 return (EINVAL); 793 } 794 } 795 796 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; 797 798 nh = nhop_alloc(RT_DEFAULT_FIB, family); 799 if (nh == NULL) { 800 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); 801 return (ENOMEM); 802 } 803 nhop_set_uidx(nh, attrs->nha_id); 804 805 if (attrs->nha_blackhole) 806 nhop_set_blackhole(nh, NHF_BLACKHOLE); 807 else { 808 error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); 809 if (error != 0) { 810 nhop_free(nh); 811 return (error); 812 } 813 nhop_set_transmit_ifp(nh, attrs->nha_oif); 814 nhop_set_src(nh, ifa); 815 } 816 817 error = nhop_get_unlinked(nh); 818 if (error != 0) { 819 NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); 820 return (error); 821 } 822 823 IF_DEBUG_LEVEL(LOG_DEBUG2) { 824 char nhbuf[NHOP_PRINT_BUFSIZE]; 825 nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); 826 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); 827 } 828 829 unhop->un_nhop_src = nh; 830 return (0); 831 } 832 833 static int 834 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 835 struct nl_pstate *npt) 836 { 837 struct user_nhop *unhop; 838 int error; 839 840 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) 841 return (ENOMEM); 842 struct unhop_ctl *ctl = V_un_ctl; 843 844 struct nl_parsed_nhop attrs = {}; 845 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 846 if (error != 0) 847 return (error); 848 849 /* 850 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class 851 * citizen. 852 */ 853 if (attrs.nha_id == 0) { 854 attrs.nha_id = find_spare_uidx(ctl); 855 if (attrs.nha_id == 0) { 856 NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); 857 return (ENOSPC); 858 } 859 } 860 861 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); 862 863 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 864 if (unhop == NULL) { 865 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); 866 return (ENOMEM); 867 } 868 unhop->un_idx = attrs.nha_id; 869 unhop->un_protocol = attrs.nh_protocol; 870 871 if (attrs.nha_group) 872 error = newnhg(ctl, &attrs, unhop); 873 else 874 error = newnhop(&attrs, unhop, npt); 875 876 if (error != 0) { 877 free(unhop, M_NETLINK); 878 return (error); 879 } 880 881 UN_WLOCK(ctl); 882 /* Check if uidx already exists */ 883 struct user_nhop *tmp = NULL; 884 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); 885 if (tmp != NULL) { 886 UN_WUNLOCK(ctl); 887 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); 888 destroy_unhop(unhop); 889 return (EEXIST); 890 } 891 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); 892 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); 893 UN_WUNLOCK(ctl); 894 895 /* Report addition of the next nexhop */ 896 struct netlink_walkargs wa = { 897 .hdr.nlmsg_pid = hdr->nlmsg_pid, 898 .hdr.nlmsg_seq = hdr->nlmsg_seq, 899 .hdr.nlmsg_flags = hdr->nlmsg_flags, 900 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 901 }; 902 903 struct nl_writer nw = {}; 904 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 905 NL_LOG(LOG_DEBUG, "error allocating message writer"); 906 return (ENOMEM); 907 } 908 909 dump_unhop(unhop, &wa.hdr, &nw); 910 nlmsg_flush(&nw); 911 912 consider_resize(ctl, num_buckets_new); 913 914 return (0); 915 } 916 917 static int 918 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 919 struct nl_pstate *npt) 920 { 921 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 922 int error; 923 924 if (__predict_false(ctl == NULL)) 925 return (ESRCH); 926 927 struct nl_parsed_nhop attrs = {}; 928 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 929 if (error != 0) 930 return (error); 931 932 if (attrs.nha_id == 0) { 933 NL_LOG(LOG_DEBUG, "NHA_ID not set"); 934 return (EINVAL); 935 } 936 937 error = delete_unhop(ctl, hdr, attrs.nha_id); 938 939 return (error); 940 } 941 942 static bool 943 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 944 { 945 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) 946 return (false); 947 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) 948 return (false); 949 if (attrs->nha_oif != NULL && 950 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) 951 return (false); 952 953 return (true); 954 } 955 956 static int 957 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 958 struct nl_pstate *npt) 959 { 960 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 961 struct user_nhop *unhop; 962 UN_TRACKER; 963 int error; 964 965 if (__predict_false(ctl == NULL)) 966 return (ESRCH); 967 968 struct nl_parsed_nhop attrs = {}; 969 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 970 if (error != 0) 971 return (error); 972 973 struct netlink_walkargs wa = { 974 .nw = npt->nw, 975 .hdr.nlmsg_pid = hdr->nlmsg_pid, 976 .hdr.nlmsg_seq = hdr->nlmsg_seq, 977 .hdr.nlmsg_flags = hdr->nlmsg_flags, 978 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 979 }; 980 981 if (attrs.nha_id != 0) { 982 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); 983 struct user_nhop key= { .un_idx = attrs.nha_id }; 984 UN_RLOCK(ctl); 985 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 986 UN_RUNLOCK(ctl); 987 988 if (unhop == NULL) 989 return (ESRCH); 990 dump_unhop(unhop, &wa.hdr, wa.nw); 991 return (0); 992 } 993 994 UN_RLOCK(ctl); 995 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 996 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { 997 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) 998 dump_unhop(unhop, &wa.hdr, wa.nw); 999 } CHT_SLIST_FOREACH_END; 1000 UN_RUNLOCK(ctl); 1001 1002 if (wa.error == 0) { 1003 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) 1004 return (ENOMEM); 1005 } 1006 return (0); 1007 } 1008 1009 static const struct rtnl_cmd_handler cmd_handlers[] = { 1010 { 1011 .cmd = NL_RTM_NEWNEXTHOP, 1012 .name = "RTM_NEWNEXTHOP", 1013 .cb = &rtnl_handle_newnhop, 1014 .priv = PRIV_NET_ROUTE, 1015 }, 1016 { 1017 .cmd = NL_RTM_DELNEXTHOP, 1018 .name = "RTM_DELNEXTHOP", 1019 .cb = &rtnl_handle_delnhop, 1020 .priv = PRIV_NET_ROUTE, 1021 }, 1022 { 1023 .cmd = NL_RTM_GETNEXTHOP, 1024 .name = "RTM_GETNEXTHOP", 1025 .cb = &rtnl_handle_getnhop, 1026 } 1027 }; 1028 1029 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser }; 1030 1031 void 1032 rtnl_nexthops_init(void) 1033 { 1034 NL_VERIFY_PARSERS(all_parsers); 1035 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); 1036 } 1037