1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include <sys/types.h> 33 #include <sys/malloc.h> 34 #include <sys/rmlock.h> 35 #include <sys/socket.h> 36 #include <sys/ck.h> 37 38 #include <net/if.h> 39 #include <net/route.h> 40 #include <net/route/nhop.h> 41 #include <net/route/nhop_utils.h> 42 43 #include <net/route/route_ctl.h> 44 #include <net/route/route_var.h> 45 #include <netinet6/scope6_var.h> 46 #include <netlink/netlink.h> 47 #include <netlink/netlink_ctl.h> 48 #include <netlink/netlink_var.h> 49 #include <netlink/netlink_route.h> 50 #include <netlink/route/route_var.h> 51 52 #define DEBUG_MOD_NAME nl_nhop 53 #define DEBUG_MAX_LEVEL LOG_DEBUG3 54 #include <netlink/netlink_debug.h> 55 _DECLARE_DEBUG(LOG_DEBUG3); 56 57 /* 58 * This file contains the logic to maintain kernel nexthops and 59 * nexhop groups based om the data provided by the user. 60 * 61 * Kernel stores (nearly) all of the routing data in the nexthops, 62 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). 63 * 64 * Netlink API provides higher-level abstraction for the user. Each 65 * user-created nexthop may map to multiple kernel nexthops. 66 * 67 * The following variations require separate kernel nexthop to be 68 * created: 69 * * prefix flags (NHF_HOST, NHF_DEFAULT) 70 * * using IPv6 gateway for IPv4 routes 71 * * different fibnum 72 * 73 * These kernel nexthops have the lifetime bound to the lifetime of 74 * the user_nhop object. They are not collected until user requests 75 * to delete the created user_nhop. 76 * 77 */ 78 struct user_nhop { 79 uint32_t un_idx; /* Userland-provided index */ 80 uint32_t un_fibfam; /* fibnum+af(as highest byte) */ 81 uint8_t un_protocol; /* protocol that install the record */ 82 struct nhop_object *un_nhop; /* "production" nexthop */ 83 struct nhop_object *un_nhop_src; /* nexthop to copy from */ 84 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ 85 uint32_t un_nhgrp_count; /* number of nexthops */ 86 struct user_nhop *un_next; /* next item in hash chain */ 87 struct user_nhop *un_nextchild; /* master -> children */ 88 struct epoch_context un_epoch_ctx; /* epoch ctl helper */ 89 }; 90 91 /* produce hash value for an object */ 92 #define unhop_hash_obj(_obj) (hash_unhop(_obj)) 93 /* compare two objects */ 94 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) 95 /* next object accessor */ 96 #define unhop_next(_obj) (_obj)->un_next 97 98 CHT_SLIST_DEFINE(unhop, struct user_nhop); 99 100 struct unhop_ctl { 101 struct unhop_head un_head; 102 struct rmlock un_lock; 103 }; 104 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") 105 #define UN_TRACKER struct rm_priotracker un_tracker 106 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) 107 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) 108 109 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); 110 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); 111 112 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; 113 #define V_un_ctl VNET(un_ctl) 114 115 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); 116 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); 117 static unsigned int hash_unhop(const struct user_nhop *obj); 118 119 static void destroy_unhop(struct user_nhop *unhop); 120 static struct nhop_object *clone_unhop(const struct user_nhop *unhop, 121 uint32_t fibnum, int family, int nh_flags); 122 123 static int 124 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) 125 { 126 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); 127 } 128 129 /* 130 * Hash callback: calculate hash of an object 131 */ 132 static unsigned int 133 hash_unhop(const struct user_nhop *obj) 134 { 135 return (obj->un_idx ^ obj->un_fibfam); 136 } 137 138 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) 139 140 /* 141 * Factory interface for creating matching kernel nexthops/nexthop groups 142 * 143 * @uidx: userland nexhop index used to create the nexthop 144 * @fibnum: fibnum nexthop will be used in 145 * @family: upper family nexthop will be used in 146 * @nh_flags: desired nexthop prefix flags 147 * @perror: pointer to store error to 148 * 149 * Returns referenced nexthop linked to @fibnum/@family rib on success. 150 */ 151 struct nhop_object * 152 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, 153 int nh_flags, int *perror) 154 { 155 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 156 UN_TRACKER; 157 158 if (__predict_false(ctl == NULL)) 159 return (NULL); 160 161 struct user_nhop key= { 162 .un_idx = uidx, 163 .un_fibfam = fibnum | ((uint32_t)family) << 24, 164 }; 165 struct user_nhop *unhop; 166 167 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); 168 169 if (__predict_false(family == 0)) 170 return (NULL); 171 172 UN_RLOCK(ctl); 173 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 174 if (unhop != NULL) { 175 struct nhop_object *nh = unhop->un_nhop; 176 UN_RLOCK(ctl); 177 *perror = 0; 178 nhop_ref_any(nh); 179 return (nh); 180 } 181 182 /* 183 * Exact nexthop not found. Search for template nexthop to clone from. 184 */ 185 key.un_fibfam = 0; 186 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 187 if (unhop == NULL) { 188 UN_RUNLOCK(ctl); 189 *perror = ESRCH; 190 return (NULL); 191 } 192 193 UN_RUNLOCK(ctl); 194 195 /* Create entry to insert first */ 196 struct user_nhop *un_new, *un_tmp; 197 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 198 if (un_new == NULL) { 199 *perror = ENOMEM; 200 return (NULL); 201 } 202 un_new->un_idx = uidx; 203 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; 204 205 /* Relying on epoch to protect unhop here */ 206 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); 207 if (un_new->un_nhop == NULL) { 208 free(un_new, M_NETLINK); 209 *perror = ENOMEM; 210 return (NULL); 211 } 212 213 /* Insert back and report */ 214 UN_WLOCK(ctl); 215 216 /* First, find template record once again */ 217 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 218 if (unhop == NULL) { 219 /* Someone deleted the nexthop during the call */ 220 UN_WUNLOCK(ctl); 221 *perror = ESRCH; 222 destroy_unhop(un_new); 223 return (NULL); 224 } 225 226 /* Second, check the direct match */ 227 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); 228 struct nhop_object *nh; 229 if (un_tmp != NULL) { 230 /* Another thread already created the desired nextop, use it */ 231 nh = un_tmp->un_nhop; 232 } else { 233 /* Finally, insert the new nexthop and link it to the primary */ 234 nh = un_new->un_nhop; 235 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); 236 un_new->un_nextchild = unhop->un_nextchild; 237 unhop->un_nextchild = un_new; 238 un_new = NULL; 239 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); 240 } 241 242 UN_WUNLOCK(ctl); 243 244 if (un_new != NULL) 245 destroy_unhop(un_new); 246 247 *perror = 0; 248 nhop_ref_any(nh); 249 return (nh); 250 } 251 252 static struct user_nhop * 253 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) 254 { 255 struct user_nhop key= { .un_idx = uidx }; 256 struct user_nhop *unhop = NULL; 257 UN_TRACKER; 258 259 UN_RLOCK(ctl); 260 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 261 UN_RUNLOCK(ctl); 262 263 return (unhop); 264 } 265 266 #define MAX_STACK_NHOPS 4 267 static struct nhop_object * 268 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) 269 { 270 const struct weightened_nhop *wn; 271 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; 272 struct nhop_object *nh = NULL; 273 uint32_t num_nhops; 274 int error; 275 276 if (unhop->un_nhop_src != NULL) { 277 IF_DEBUG_LEVEL(LOG_DEBUG2) { 278 char nhbuf[NHOP_PRINT_BUFSIZE]; 279 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); 280 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, 281 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, 282 family, nh_flags); 283 } 284 struct nhop_object *nh; 285 nh = nhop_alloc(fibnum, AF_UNSPEC); 286 if (nh == NULL) 287 return (NULL); 288 nhop_copy(nh, unhop->un_nhop_src); 289 /* Check that nexthop gateway is compatible with the new family */ 290 if (!nhop_set_upper_family(nh, family)) { 291 nhop_free(nh); 292 return (NULL); 293 } 294 nhop_set_uidx(nh, unhop->un_idx); 295 nhop_set_pxtype_flag(nh, nh_flags); 296 return (nhop_get_nhop(nh, &error)); 297 } 298 299 wn = unhop->un_nhgrp_src; 300 num_nhops = unhop->un_nhgrp_count; 301 302 if (num_nhops > MAX_STACK_NHOPS) { 303 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); 304 if (wn_new == NULL) 305 return (NULL); 306 } else 307 wn_new = wn_base; 308 309 for (int i = 0; i < num_nhops; i++) { 310 uint32_t uidx = nhop_get_uidx(wn[i].nh); 311 MPASS(uidx != 0); 312 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); 313 if (error != 0) 314 break; 315 wn_new[i].weight = wn[i].weight; 316 } 317 318 if (error == 0) { 319 struct rib_head *rh = nhop_get_rh(wn_new[0].nh); 320 struct nhgrp_object *nhg; 321 322 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); 323 nh = (struct nhop_object *)nhg; 324 } 325 326 if (wn_new != wn_base) 327 free(wn_new, M_TEMP); 328 return (nh); 329 } 330 331 static void 332 destroy_unhop(struct user_nhop *unhop) 333 { 334 if (unhop->un_nhop != NULL) 335 nhop_free_any(unhop->un_nhop); 336 if (unhop->un_nhop_src != NULL) 337 nhop_free_any(unhop->un_nhop_src); 338 free(unhop, M_NETLINK); 339 } 340 341 static void 342 destroy_unhop_epoch(epoch_context_t ctx) 343 { 344 struct user_nhop *unhop; 345 346 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); 347 348 destroy_unhop(unhop); 349 } 350 351 static uint32_t 352 find_spare_uidx(struct unhop_ctl *ctl) 353 { 354 struct user_nhop *unhop, key = {}; 355 uint32_t uidx = 0; 356 UN_TRACKER; 357 358 UN_RLOCK(ctl); 359 /* This should return spare uid with 75% of 65k used in ~99/100 cases */ 360 for (int i = 0; i < 16; i++) { 361 key.un_idx = (arc4random() % 65536) + 65536 * 4; 362 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 363 if (unhop == NULL) { 364 uidx = key.un_idx; 365 break; 366 } 367 } 368 UN_RUNLOCK(ctl); 369 370 return (uidx); 371 } 372 373 374 /* 375 * Actual netlink code 376 */ 377 struct netlink_walkargs { 378 struct nl_writer *nw; 379 struct nlmsghdr hdr; 380 struct nlpcb *so; 381 int family; 382 int error; 383 int count; 384 int dumped; 385 }; 386 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem 387 388 static bool 389 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, 390 struct nl_writer *nw) 391 { 392 393 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 394 goto enomem; 395 396 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 397 nhm->nh_family = AF_UNSPEC; 398 nhm->nh_scope = 0; 399 nhm->nh_protocol = unhop->un_protocol; 400 nhm->nh_flags = 0; 401 402 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 403 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); 404 405 struct weightened_nhop *wn = unhop->un_nhgrp_src; 406 uint32_t num_nhops = unhop->un_nhgrp_count; 407 /* TODO: a better API? */ 408 int nla_len = sizeof(struct nlattr); 409 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); 410 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); 411 if (nla == NULL) 412 goto enomem; 413 nla->nla_type = NHA_GROUP; 414 nla->nla_len = nla_len; 415 for (int i = 0; i < num_nhops; i++) { 416 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; 417 grp->id = nhop_get_uidx(wn[i].nh); 418 grp->weight = wn[i].weight; 419 grp->resvd1 = 0; 420 grp->resvd2 = 0; 421 } 422 423 if (nlmsg_end(nw)) 424 return (true); 425 enomem: 426 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); 427 nlmsg_abort(nw); 428 return (false); 429 } 430 431 static bool 432 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 433 struct nl_writer *nw) 434 { 435 struct nhop_object *nh = unhop->un_nhop_src; 436 437 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 438 goto enomem; 439 440 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 441 ENOMEM_IF_NULL(nhm); 442 nhm->nh_family = nhop_get_neigh_family(nh); 443 nhm->nh_scope = 0; // XXX: what's that? 444 nhm->nh_protocol = unhop->un_protocol; 445 nhm->nh_flags = 0; 446 447 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 448 if (nh->nh_flags & NHF_BLACKHOLE) { 449 nlattr_add_flag(nw, NHA_BLACKHOLE); 450 goto done; 451 } 452 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); 453 454 switch (nh->gw_sa.sa_family) { 455 #ifdef INET 456 case AF_INET: 457 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); 458 break; 459 #endif 460 #ifdef INET6 461 case AF_INET6: 462 { 463 struct in6_addr addr = nh->gw6_sa.sin6_addr; 464 in6_clearscope(&addr); 465 nlattr_add(nw, NHA_GATEWAY, 16, &addr); 466 break; 467 } 468 #endif 469 } 470 471 done: 472 if (nlmsg_end(nw)) 473 return (true); 474 enomem: 475 nlmsg_abort(nw); 476 return (false); 477 } 478 479 static void 480 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 481 struct nl_writer *nw) 482 { 483 if (unhop->un_nhop_src != NULL) 484 dump_nhop(unhop, hdr, nw); 485 else 486 dump_nhgrp(unhop, hdr, nw); 487 } 488 489 static int 490 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) 491 { 492 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; 493 494 struct user_nhop key = { .un_idx = uidx }; 495 496 UN_WLOCK(ctl); 497 498 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); 499 500 if (unhop_base != NULL) { 501 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); 502 IF_DEBUG_LEVEL(LOG_DEBUG2) { 503 char nhbuf[NHOP_PRINT_BUFSIZE]; 504 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); 505 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, 506 "removed base nhop %u: %s", uidx, nhbuf); 507 } 508 /* Unlink all child nexhops as well, keeping the chain intact */ 509 unhop_chain = unhop_base->un_nextchild; 510 while (unhop_chain != NULL) { 511 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, 512 unhop_ret); 513 MPASS(unhop_chain == unhop_ret); 514 IF_DEBUG_LEVEL(LOG_DEBUG3) { 515 char nhbuf[NHOP_PRINT_BUFSIZE]; 516 nhop_print_buf_any(unhop_chain->un_nhop, 517 nhbuf, sizeof(nhbuf)); 518 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, 519 "removed child nhop %u: %s", uidx, nhbuf); 520 } 521 unhop_chain = unhop_chain->un_nextchild; 522 } 523 } 524 525 UN_WUNLOCK(ctl); 526 527 if (unhop_base == NULL) { 528 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); 529 return (ENOENT); 530 } 531 532 /* Report nexthop deletion */ 533 struct netlink_walkargs wa = { 534 .hdr.nlmsg_pid = hdr->nlmsg_pid, 535 .hdr.nlmsg_seq = hdr->nlmsg_seq, 536 .hdr.nlmsg_flags = hdr->nlmsg_flags, 537 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, 538 }; 539 540 struct nl_writer nw = {}; 541 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 542 NL_LOG(LOG_DEBUG, "error allocating message writer"); 543 return (ENOMEM); 544 } 545 546 dump_unhop(unhop_base, &wa.hdr, &nw); 547 nlmsg_flush(&nw); 548 549 while (unhop_base != NULL) { 550 unhop_chain = unhop_base->un_nextchild; 551 epoch_call(net_epoch_preempt, destroy_unhop_epoch, 552 &unhop_base->un_epoch_ctx); 553 unhop_base = unhop_chain; 554 } 555 556 return (0); 557 } 558 559 static void 560 consider_resize(struct unhop_ctl *ctl, uint32_t new_size) 561 { 562 void *new_ptr = NULL; 563 size_t alloc_size; 564 565 if (new_size == 0) 566 return; 567 568 if (new_size != 0) { 569 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); 570 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 571 if (new_ptr == NULL) 572 return; 573 } 574 575 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); 576 UN_WLOCK(ctl); 577 if (new_ptr != NULL) { 578 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); 579 } 580 UN_WUNLOCK(ctl); 581 582 583 if (new_ptr != NULL) 584 free(new_ptr, M_NETLINK); 585 } 586 587 static bool __noinline 588 vnet_init_unhops() 589 { 590 uint32_t num_buckets = 16; 591 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); 592 593 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 594 M_NOWAIT | M_ZERO); 595 if (ctl == NULL) 596 return (false); 597 598 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 599 if (ptr == NULL) { 600 free(ctl, M_NETLINK); 601 return (false); 602 } 603 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); 604 UN_LOCK_INIT(ctl); 605 606 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { 607 free(ptr, M_NETLINK); 608 free(ctl, M_NETLINK); 609 } 610 611 if (atomic_load_ptr(&V_un_ctl) == NULL) 612 return (false); 613 614 NL_LOG(LOG_NOTICE, "UNHOPS init done"); 615 616 return (true); 617 } 618 619 static void 620 vnet_destroy_unhops(const void *unused __unused) 621 { 622 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 623 struct user_nhop *unhop, *tmp; 624 625 if (ctl == NULL) 626 return; 627 V_un_ctl = NULL; 628 629 /* Wait till all unhop users finish their reads */ 630 epoch_wait_preempt(net_epoch_preempt); 631 632 UN_WLOCK(ctl); 633 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { 634 destroy_unhop(unhop); 635 } CHT_SLIST_FOREACH_SAFE_END; 636 UN_WUNLOCK(ctl); 637 638 free(ctl->un_head.ptr, M_NETLINK); 639 free(ctl, M_NETLINK); 640 } 641 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, 642 vnet_destroy_unhops, NULL); 643 644 static int 645 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 646 { 647 int error = 0; 648 649 /* Verify attribute correctness */ 650 struct nexthop_grp *grp = NLA_DATA(nla); 651 int data_len = NLA_DATA_LEN(nla); 652 653 int count = data_len / sizeof(*grp); 654 if (count == 0 || (count * sizeof(*grp) != data_len)) { 655 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); 656 return (EINVAL); 657 } 658 659 *((struct nlattr **)target) = nla; 660 return (error); 661 } 662 663 struct nl_parsed_nhop { 664 uint32_t nha_id; 665 uint8_t nha_blackhole; 666 uint8_t nha_groups; 667 struct ifnet *nha_oif; 668 struct sockaddr *nha_gw; 669 struct nlattr *nha_group; 670 uint8_t nh_family; 671 uint8_t nh_protocol; 672 }; 673 674 #define _IN(_field) offsetof(struct nhmsg, _field) 675 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) 676 static const struct nlfield_parser nlf_p_nh[] = { 677 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, 678 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, 679 }; 680 681 static const struct nlattr_parser nla_p_nh[] = { 682 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, 683 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, 684 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, 685 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, 686 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, 687 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, 688 }; 689 #undef _IN 690 #undef _OUT 691 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); 692 693 static bool 694 eligible_nhg(const struct nhop_object *nh) 695 { 696 return (nh->nh_flags & NHF_GATEWAY); 697 } 698 699 static int 700 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 701 { 702 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); 703 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); 704 struct weightened_nhop *wn; 705 706 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); 707 if (wn == NULL) 708 return (ENOMEM); 709 710 for (int i = 0; i < count; i++) { 711 struct user_nhop *unhop; 712 unhop = nl_find_base_unhop(ctl, grp[i].id); 713 if (unhop == NULL) { 714 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); 715 free(wn, M_NETLINK); 716 return (ESRCH); 717 } else if (unhop->un_nhop_src == NULL) { 718 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", 719 grp[i].id); 720 free(wn, M_NETLINK); 721 return (ENOTSUP); 722 } else if (!eligible_nhg(unhop->un_nhop_src)) { 723 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", 724 grp[i].id); 725 free(wn, M_NETLINK); 726 return (ENOTSUP); 727 } 728 /* 729 * TODO: consider more rigid eligibility checks: 730 * restrict nexthops with the same gateway 731 */ 732 wn[i].nh = unhop->un_nhop_src; 733 wn[i].weight = grp[i].weight; 734 } 735 unhop->un_nhgrp_src = wn; 736 unhop->un_nhgrp_count = count; 737 return (0); 738 } 739 740 static int 741 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 742 { 743 struct ifaddr *ifa = NULL; 744 struct nhop_object *nh; 745 int error; 746 747 if (!attrs->nha_blackhole) { 748 if (attrs->nha_gw == NULL) { 749 NL_LOG(LOG_DEBUG, "missing NHA_GATEWAY"); 750 return (EINVAL); 751 } 752 if (attrs->nha_oif == NULL) { 753 NL_LOG(LOG_DEBUG, "missing NHA_OIF"); 754 return (EINVAL); 755 } 756 if (ifa == NULL) 757 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); 758 if (ifa == NULL) { 759 NL_LOG(LOG_DEBUG, "Unable to determine default source IP"); 760 return (EINVAL); 761 } 762 } 763 764 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; 765 766 nh = nhop_alloc(RT_DEFAULT_FIB, family); 767 if (nh == NULL) { 768 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); 769 return (ENOMEM); 770 } 771 nhop_set_uidx(nh, attrs->nha_id); 772 773 if (attrs->nha_blackhole) 774 nhop_set_blackhole(nh, NHF_BLACKHOLE); 775 else { 776 nhop_set_gw(nh, attrs->nha_gw, true); 777 nhop_set_transmit_ifp(nh, attrs->nha_oif); 778 nhop_set_src(nh, ifa); 779 } 780 781 error = nhop_get_unlinked(nh); 782 if (error != 0) { 783 NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); 784 return (error); 785 } 786 787 IF_DEBUG_LEVEL(LOG_DEBUG2) { 788 char nhbuf[NHOP_PRINT_BUFSIZE]; 789 nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); 790 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); 791 } 792 793 unhop->un_nhop_src = nh; 794 return (0); 795 } 796 797 static int 798 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 799 struct nl_pstate *npt) 800 { 801 struct user_nhop *unhop; 802 int error; 803 804 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) 805 return (ENOMEM); 806 struct unhop_ctl *ctl = V_un_ctl; 807 808 struct nl_parsed_nhop attrs = {}; 809 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 810 if (error != 0) 811 return (error); 812 813 /* 814 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class 815 * citizen. 816 */ 817 if (attrs.nha_id == 0) { 818 attrs.nha_id = find_spare_uidx(ctl); 819 if (attrs.nha_id == 0) { 820 NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); 821 return (ENOSPC); 822 } 823 } 824 825 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); 826 827 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 828 if (unhop == NULL) { 829 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); 830 return (ENOMEM); 831 } 832 unhop->un_idx = attrs.nha_id; 833 unhop->un_protocol = attrs.nh_protocol; 834 835 if (attrs.nha_group) 836 error = newnhg(ctl, &attrs, unhop); 837 else 838 error = newnhop(&attrs, unhop); 839 840 if (error != 0) { 841 free(unhop, M_NETLINK); 842 return (error); 843 } 844 845 UN_WLOCK(ctl); 846 /* Check if uidx already exists */ 847 struct user_nhop *tmp = NULL; 848 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); 849 if (tmp != NULL) { 850 UN_WUNLOCK(ctl); 851 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); 852 destroy_unhop(unhop); 853 return (EEXIST); 854 } 855 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); 856 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); 857 UN_WUNLOCK(ctl); 858 859 /* Report addition of the next nexhop */ 860 struct netlink_walkargs wa = { 861 .hdr.nlmsg_pid = hdr->nlmsg_pid, 862 .hdr.nlmsg_seq = hdr->nlmsg_seq, 863 .hdr.nlmsg_flags = hdr->nlmsg_flags, 864 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 865 }; 866 867 struct nl_writer nw = {}; 868 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 869 NL_LOG(LOG_DEBUG, "error allocating message writer"); 870 return (ENOMEM); 871 } 872 873 dump_unhop(unhop, &wa.hdr, &nw); 874 nlmsg_flush(&nw); 875 876 consider_resize(ctl, num_buckets_new); 877 878 return (0); 879 } 880 881 static int 882 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 883 struct nl_pstate *npt) 884 { 885 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 886 int error; 887 888 if (__predict_false(ctl == NULL)) 889 return (ESRCH); 890 891 struct nl_parsed_nhop attrs = {}; 892 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 893 if (error != 0) 894 return (error); 895 896 if (attrs.nha_id == 0) { 897 NL_LOG(LOG_DEBUG, "NHA_ID not set"); 898 return (EINVAL); 899 } 900 901 error = delete_unhop(ctl, hdr, attrs.nha_id); 902 903 return (error); 904 } 905 906 static bool 907 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 908 { 909 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) 910 return (false); 911 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) 912 return (false); 913 if (attrs->nha_oif != NULL && 914 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) 915 return (false); 916 917 return (true); 918 } 919 920 static int 921 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 922 struct nl_pstate *npt) 923 { 924 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 925 struct user_nhop *unhop; 926 UN_TRACKER; 927 int error; 928 929 if (__predict_false(ctl == NULL)) 930 return (ESRCH); 931 932 struct nl_parsed_nhop attrs = {}; 933 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 934 if (error != 0) 935 return (error); 936 937 struct netlink_walkargs wa = { 938 .nw = npt->nw, 939 .hdr.nlmsg_pid = hdr->nlmsg_pid, 940 .hdr.nlmsg_seq = hdr->nlmsg_seq, 941 .hdr.nlmsg_flags = hdr->nlmsg_flags, 942 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 943 }; 944 945 if (attrs.nha_id != 0) { 946 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); 947 struct user_nhop key= { .un_idx = attrs.nha_id }; 948 UN_RLOCK(ctl); 949 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 950 UN_RUNLOCK(ctl); 951 952 if (unhop == NULL) 953 return (ESRCH); 954 dump_unhop(unhop, &wa.hdr, wa.nw); 955 return (0); 956 } 957 958 UN_RLOCK(ctl); 959 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 960 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { 961 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) 962 dump_unhop(unhop, &wa.hdr, wa.nw); 963 } CHT_SLIST_FOREACH_END; 964 UN_RUNLOCK(ctl); 965 966 if (wa.error == 0) { 967 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) 968 return (ENOMEM); 969 } 970 return (0); 971 } 972 973 static const struct rtnl_cmd_handler cmd_handlers[] = { 974 { 975 .cmd = NL_RTM_NEWNEXTHOP, 976 .name = "RTM_NEWNEXTHOP", 977 .cb = &rtnl_handle_newnhop, 978 .priv = PRIV_NET_ROUTE, 979 }, 980 { 981 .cmd = NL_RTM_DELNEXTHOP, 982 .name = "RTM_DELNEXTHOP", 983 .cb = &rtnl_handle_delnhop, 984 .priv = PRIV_NET_ROUTE, 985 }, 986 { 987 .cmd = NL_RTM_GETNEXTHOP, 988 .name = "RTM_GETNEXTHOP", 989 .cb = &rtnl_handle_getnhop, 990 } 991 }; 992 993 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser }; 994 995 void 996 rtnl_nexthops_init() 997 { 998 NL_VERIFY_PARSERS(all_parsers); 999 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); 1000 } 1001