1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 #include <sys/types.h> 34 #include <sys/ck.h> 35 #include <sys/kernel.h> 36 #include <sys/malloc.h> 37 #include <sys/rmlock.h> 38 #include <sys/socket.h> 39 40 #include <net/if.h> 41 #include <net/route.h> 42 #include <net/route/nhop.h> 43 #include <net/route/nhop_utils.h> 44 45 #include <net/route/route_ctl.h> 46 #include <net/route/route_var.h> 47 #include <netinet6/scope6_var.h> 48 #include <netlink/netlink.h> 49 #include <netlink/netlink_ctl.h> 50 #include <netlink/netlink_var.h> 51 #include <netlink/netlink_route.h> 52 #include <netlink/route/route_var.h> 53 54 #define DEBUG_MOD_NAME nl_nhop 55 #define DEBUG_MAX_LEVEL LOG_DEBUG3 56 #include <netlink/netlink_debug.h> 57 _DECLARE_DEBUG(LOG_DEBUG3); 58 59 /* 60 * This file contains the logic to maintain kernel nexthops and 61 * nexhop groups based om the data provided by the user. 62 * 63 * Kernel stores (nearly) all of the routing data in the nexthops, 64 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). 65 * 66 * Netlink API provides higher-level abstraction for the user. Each 67 * user-created nexthop may map to multiple kernel nexthops. 68 * 69 * The following variations require separate kernel nexthop to be 70 * created: 71 * * prefix flags (NHF_HOST, NHF_DEFAULT) 72 * * using IPv6 gateway for IPv4 routes 73 * * different fibnum 74 * 75 * These kernel nexthops have the lifetime bound to the lifetime of 76 * the user_nhop object. They are not collected until user requests 77 * to delete the created user_nhop. 78 * 79 */ 80 struct user_nhop { 81 uint32_t un_idx; /* Userland-provided index */ 82 uint32_t un_fibfam; /* fibnum+af(as highest byte) */ 83 uint8_t un_protocol; /* protocol that install the record */ 84 struct nhop_object *un_nhop; /* "production" nexthop */ 85 struct nhop_object *un_nhop_src; /* nexthop to copy from */ 86 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ 87 uint32_t un_nhgrp_count; /* number of nexthops */ 88 struct user_nhop *un_next; /* next item in hash chain */ 89 struct user_nhop *un_nextchild; /* master -> children */ 90 struct epoch_context un_epoch_ctx; /* epoch ctl helper */ 91 }; 92 93 /* produce hash value for an object */ 94 #define unhop_hash_obj(_obj) (hash_unhop(_obj)) 95 /* compare two objects */ 96 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) 97 /* next object accessor */ 98 #define unhop_next(_obj) (_obj)->un_next 99 100 CHT_SLIST_DEFINE(unhop, struct user_nhop); 101 102 struct unhop_ctl { 103 struct unhop_head un_head; 104 struct rmlock un_lock; 105 }; 106 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") 107 #define UN_TRACKER struct rm_priotracker un_tracker 108 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) 109 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) 110 111 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); 112 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); 113 114 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; 115 #define V_un_ctl VNET(un_ctl) 116 117 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); 118 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); 119 static unsigned int hash_unhop(const struct user_nhop *obj); 120 121 static void destroy_unhop(struct user_nhop *unhop); 122 static struct nhop_object *clone_unhop(const struct user_nhop *unhop, 123 uint32_t fibnum, int family, int nh_flags); 124 125 static int 126 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) 127 { 128 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); 129 } 130 131 /* 132 * Hash callback: calculate hash of an object 133 */ 134 static unsigned int 135 hash_unhop(const struct user_nhop *obj) 136 { 137 return (obj->un_idx ^ obj->un_fibfam); 138 } 139 140 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) 141 142 /* 143 * Factory interface for creating matching kernel nexthops/nexthop groups 144 * 145 * @uidx: userland nexhop index used to create the nexthop 146 * @fibnum: fibnum nexthop will be used in 147 * @family: upper family nexthop will be used in 148 * @nh_flags: desired nexthop prefix flags 149 * @perror: pointer to store error to 150 * 151 * Returns referenced nexthop linked to @fibnum/@family rib on success. 152 */ 153 struct nhop_object * 154 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, 155 int nh_flags, int *perror) 156 { 157 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 158 UN_TRACKER; 159 160 if (__predict_false(ctl == NULL)) 161 return (NULL); 162 163 struct user_nhop key= { 164 .un_idx = uidx, 165 .un_fibfam = fibnum | ((uint32_t)family) << 24, 166 }; 167 struct user_nhop *unhop; 168 169 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); 170 171 if (__predict_false(family == 0)) 172 return (NULL); 173 174 UN_RLOCK(ctl); 175 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 176 if (unhop != NULL) { 177 struct nhop_object *nh = unhop->un_nhop; 178 UN_RLOCK(ctl); 179 *perror = 0; 180 nhop_ref_any(nh); 181 return (nh); 182 } 183 184 /* 185 * Exact nexthop not found. Search for template nexthop to clone from. 186 */ 187 key.un_fibfam = 0; 188 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 189 if (unhop == NULL) { 190 UN_RUNLOCK(ctl); 191 *perror = ESRCH; 192 return (NULL); 193 } 194 195 UN_RUNLOCK(ctl); 196 197 /* Create entry to insert first */ 198 struct user_nhop *un_new, *un_tmp; 199 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 200 if (un_new == NULL) { 201 *perror = ENOMEM; 202 return (NULL); 203 } 204 un_new->un_idx = uidx; 205 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; 206 207 /* Relying on epoch to protect unhop here */ 208 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); 209 if (un_new->un_nhop == NULL) { 210 free(un_new, M_NETLINK); 211 *perror = ENOMEM; 212 return (NULL); 213 } 214 215 /* Insert back and report */ 216 UN_WLOCK(ctl); 217 218 /* First, find template record once again */ 219 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 220 if (unhop == NULL) { 221 /* Someone deleted the nexthop during the call */ 222 UN_WUNLOCK(ctl); 223 *perror = ESRCH; 224 destroy_unhop(un_new); 225 return (NULL); 226 } 227 228 /* Second, check the direct match */ 229 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); 230 struct nhop_object *nh; 231 if (un_tmp != NULL) { 232 /* Another thread already created the desired nextop, use it */ 233 nh = un_tmp->un_nhop; 234 } else { 235 /* Finally, insert the new nexthop and link it to the primary */ 236 nh = un_new->un_nhop; 237 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); 238 un_new->un_nextchild = unhop->un_nextchild; 239 unhop->un_nextchild = un_new; 240 un_new = NULL; 241 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); 242 } 243 244 UN_WUNLOCK(ctl); 245 246 if (un_new != NULL) 247 destroy_unhop(un_new); 248 249 *perror = 0; 250 nhop_ref_any(nh); 251 return (nh); 252 } 253 254 static struct user_nhop * 255 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) 256 { 257 struct user_nhop key= { .un_idx = uidx }; 258 struct user_nhop *unhop = NULL; 259 UN_TRACKER; 260 261 UN_RLOCK(ctl); 262 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 263 UN_RUNLOCK(ctl); 264 265 return (unhop); 266 } 267 268 #define MAX_STACK_NHOPS 4 269 static struct nhop_object * 270 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) 271 { 272 #ifdef ROUTE_MPATH 273 const struct weightened_nhop *wn; 274 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; 275 uint32_t num_nhops; 276 #endif 277 struct nhop_object *nh = NULL; 278 int error; 279 280 if (unhop->un_nhop_src != NULL) { 281 IF_DEBUG_LEVEL(LOG_DEBUG2) { 282 char nhbuf[NHOP_PRINT_BUFSIZE]; 283 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); 284 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, 285 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, 286 family, nh_flags); 287 } 288 struct nhop_object *nh; 289 nh = nhop_alloc(fibnum, AF_UNSPEC); 290 if (nh == NULL) 291 return (NULL); 292 nhop_copy(nh, unhop->un_nhop_src); 293 /* Check that nexthop gateway is compatible with the new family */ 294 if (!nhop_set_upper_family(nh, family)) { 295 nhop_free(nh); 296 return (NULL); 297 } 298 nhop_set_uidx(nh, unhop->un_idx); 299 nhop_set_pxtype_flag(nh, nh_flags); 300 return (nhop_get_nhop(nh, &error)); 301 } 302 #ifdef ROUTE_MPATH 303 wn = unhop->un_nhgrp_src; 304 num_nhops = unhop->un_nhgrp_count; 305 306 if (num_nhops > MAX_STACK_NHOPS) { 307 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); 308 if (wn_new == NULL) 309 return (NULL); 310 } else 311 wn_new = wn_base; 312 313 for (int i = 0; i < num_nhops; i++) { 314 uint32_t uidx = nhop_get_uidx(wn[i].nh); 315 MPASS(uidx != 0); 316 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); 317 if (error != 0) 318 break; 319 wn_new[i].weight = wn[i].weight; 320 } 321 322 if (error == 0) { 323 struct rib_head *rh = nhop_get_rh(wn_new[0].nh); 324 struct nhgrp_object *nhg; 325 326 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); 327 nh = (struct nhop_object *)nhg; 328 } 329 330 if (wn_new != wn_base) 331 free(wn_new, M_TEMP); 332 #endif 333 return (nh); 334 } 335 336 static void 337 destroy_unhop(struct user_nhop *unhop) 338 { 339 if (unhop->un_nhop != NULL) 340 nhop_free_any(unhop->un_nhop); 341 if (unhop->un_nhop_src != NULL) 342 nhop_free_any(unhop->un_nhop_src); 343 free(unhop, M_NETLINK); 344 } 345 346 static void 347 destroy_unhop_epoch(epoch_context_t ctx) 348 { 349 struct user_nhop *unhop; 350 351 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); 352 353 destroy_unhop(unhop); 354 } 355 356 static uint32_t 357 find_spare_uidx(struct unhop_ctl *ctl) 358 { 359 struct user_nhop *unhop, key = {}; 360 uint32_t uidx = 0; 361 UN_TRACKER; 362 363 UN_RLOCK(ctl); 364 /* This should return spare uid with 75% of 65k used in ~99/100 cases */ 365 for (int i = 0; i < 16; i++) { 366 key.un_idx = (arc4random() % 65536) + 65536 * 4; 367 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 368 if (unhop == NULL) { 369 uidx = key.un_idx; 370 break; 371 } 372 } 373 UN_RUNLOCK(ctl); 374 375 return (uidx); 376 } 377 378 379 /* 380 * Actual netlink code 381 */ 382 struct netlink_walkargs { 383 struct nl_writer *nw; 384 struct nlmsghdr hdr; 385 struct nlpcb *so; 386 int family; 387 int error; 388 int count; 389 int dumped; 390 }; 391 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem 392 393 static bool 394 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, 395 struct nl_writer *nw) 396 { 397 398 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 399 goto enomem; 400 401 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 402 nhm->nh_family = AF_UNSPEC; 403 nhm->nh_scope = 0; 404 nhm->nh_protocol = unhop->un_protocol; 405 nhm->nh_flags = 0; 406 407 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 408 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); 409 410 struct weightened_nhop *wn = unhop->un_nhgrp_src; 411 uint32_t num_nhops = unhop->un_nhgrp_count; 412 /* TODO: a better API? */ 413 int nla_len = sizeof(struct nlattr); 414 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); 415 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); 416 if (nla == NULL) 417 goto enomem; 418 nla->nla_type = NHA_GROUP; 419 nla->nla_len = nla_len; 420 for (int i = 0; i < num_nhops; i++) { 421 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; 422 grp->id = nhop_get_uidx(wn[i].nh); 423 grp->weight = wn[i].weight; 424 grp->resvd1 = 0; 425 grp->resvd2 = 0; 426 } 427 428 if (nlmsg_end(nw)) 429 return (true); 430 enomem: 431 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); 432 nlmsg_abort(nw); 433 return (false); 434 } 435 436 static bool 437 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 438 struct nl_writer *nw) 439 { 440 struct nhop_object *nh = unhop->un_nhop_src; 441 442 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 443 goto enomem; 444 445 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 446 ENOMEM_IF_NULL(nhm); 447 nhm->nh_family = nhop_get_neigh_family(nh); 448 nhm->nh_scope = 0; // XXX: what's that? 449 nhm->nh_protocol = unhop->un_protocol; 450 nhm->nh_flags = 0; 451 452 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 453 if (nh->nh_flags & NHF_BLACKHOLE) { 454 nlattr_add_flag(nw, NHA_BLACKHOLE); 455 goto done; 456 } 457 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); 458 459 switch (nh->gw_sa.sa_family) { 460 #ifdef INET 461 case AF_INET: 462 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); 463 break; 464 #endif 465 #ifdef INET6 466 case AF_INET6: 467 { 468 struct in6_addr addr = nh->gw6_sa.sin6_addr; 469 in6_clearscope(&addr); 470 nlattr_add(nw, NHA_GATEWAY, 16, &addr); 471 break; 472 } 473 #endif 474 } 475 476 done: 477 if (nlmsg_end(nw)) 478 return (true); 479 enomem: 480 nlmsg_abort(nw); 481 return (false); 482 } 483 484 static void 485 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 486 struct nl_writer *nw) 487 { 488 if (unhop->un_nhop_src != NULL) 489 dump_nhop(unhop, hdr, nw); 490 else 491 dump_nhgrp(unhop, hdr, nw); 492 } 493 494 static int 495 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) 496 { 497 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; 498 499 struct user_nhop key = { .un_idx = uidx }; 500 501 UN_WLOCK(ctl); 502 503 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); 504 505 if (unhop_base != NULL) { 506 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); 507 IF_DEBUG_LEVEL(LOG_DEBUG2) { 508 char nhbuf[NHOP_PRINT_BUFSIZE]; 509 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); 510 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, 511 "removed base nhop %u: %s", uidx, nhbuf); 512 } 513 /* Unlink all child nexhops as well, keeping the chain intact */ 514 unhop_chain = unhop_base->un_nextchild; 515 while (unhop_chain != NULL) { 516 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, 517 unhop_ret); 518 MPASS(unhop_chain == unhop_ret); 519 IF_DEBUG_LEVEL(LOG_DEBUG3) { 520 char nhbuf[NHOP_PRINT_BUFSIZE]; 521 nhop_print_buf_any(unhop_chain->un_nhop, 522 nhbuf, sizeof(nhbuf)); 523 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, 524 "removed child nhop %u: %s", uidx, nhbuf); 525 } 526 unhop_chain = unhop_chain->un_nextchild; 527 } 528 } 529 530 UN_WUNLOCK(ctl); 531 532 if (unhop_base == NULL) { 533 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); 534 return (ENOENT); 535 } 536 537 /* Report nexthop deletion */ 538 struct netlink_walkargs wa = { 539 .hdr.nlmsg_pid = hdr->nlmsg_pid, 540 .hdr.nlmsg_seq = hdr->nlmsg_seq, 541 .hdr.nlmsg_flags = hdr->nlmsg_flags, 542 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, 543 }; 544 545 struct nl_writer nw = {}; 546 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 547 NL_LOG(LOG_DEBUG, "error allocating message writer"); 548 return (ENOMEM); 549 } 550 551 dump_unhop(unhop_base, &wa.hdr, &nw); 552 nlmsg_flush(&nw); 553 554 while (unhop_base != NULL) { 555 unhop_chain = unhop_base->un_nextchild; 556 epoch_call(net_epoch_preempt, destroy_unhop_epoch, 557 &unhop_base->un_epoch_ctx); 558 unhop_base = unhop_chain; 559 } 560 561 return (0); 562 } 563 564 static void 565 consider_resize(struct unhop_ctl *ctl, uint32_t new_size) 566 { 567 void *new_ptr = NULL; 568 size_t alloc_size; 569 570 if (new_size == 0) 571 return; 572 573 if (new_size != 0) { 574 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); 575 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 576 if (new_ptr == NULL) 577 return; 578 } 579 580 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); 581 UN_WLOCK(ctl); 582 if (new_ptr != NULL) { 583 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); 584 } 585 UN_WUNLOCK(ctl); 586 587 588 if (new_ptr != NULL) 589 free(new_ptr, M_NETLINK); 590 } 591 592 static bool __noinline 593 vnet_init_unhops() 594 { 595 uint32_t num_buckets = 16; 596 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); 597 598 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 599 M_NOWAIT | M_ZERO); 600 if (ctl == NULL) 601 return (false); 602 603 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 604 if (ptr == NULL) { 605 free(ctl, M_NETLINK); 606 return (false); 607 } 608 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); 609 UN_LOCK_INIT(ctl); 610 611 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { 612 free(ptr, M_NETLINK); 613 free(ctl, M_NETLINK); 614 } 615 616 if (atomic_load_ptr(&V_un_ctl) == NULL) 617 return (false); 618 619 NL_LOG(LOG_NOTICE, "UNHOPS init done"); 620 621 return (true); 622 } 623 624 static void 625 vnet_destroy_unhops(const void *unused __unused) 626 { 627 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 628 struct user_nhop *unhop, *tmp; 629 630 if (ctl == NULL) 631 return; 632 V_un_ctl = NULL; 633 634 /* Wait till all unhop users finish their reads */ 635 epoch_wait_preempt(net_epoch_preempt); 636 637 UN_WLOCK(ctl); 638 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { 639 destroy_unhop(unhop); 640 } CHT_SLIST_FOREACH_SAFE_END; 641 UN_WUNLOCK(ctl); 642 643 free(ctl->un_head.ptr, M_NETLINK); 644 free(ctl, M_NETLINK); 645 } 646 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, 647 vnet_destroy_unhops, NULL); 648 649 static int 650 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 651 { 652 int error = 0; 653 654 /* Verify attribute correctness */ 655 struct nexthop_grp *grp = NLA_DATA(nla); 656 int data_len = NLA_DATA_LEN(nla); 657 658 int count = data_len / sizeof(*grp); 659 if (count == 0 || (count * sizeof(*grp) != data_len)) { 660 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); 661 return (EINVAL); 662 } 663 664 *((struct nlattr **)target) = nla; 665 return (error); 666 } 667 668 struct nl_parsed_nhop { 669 uint32_t nha_id; 670 uint8_t nha_blackhole; 671 uint8_t nha_groups; 672 struct ifnet *nha_oif; 673 struct sockaddr *nha_gw; 674 struct nlattr *nha_group; 675 uint8_t nh_family; 676 uint8_t nh_protocol; 677 }; 678 679 #define _IN(_field) offsetof(struct nhmsg, _field) 680 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) 681 static const struct nlfield_parser nlf_p_nh[] = { 682 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, 683 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, 684 }; 685 686 static const struct nlattr_parser nla_p_nh[] = { 687 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, 688 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, 689 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, 690 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, 691 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, 692 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, 693 }; 694 #undef _IN 695 #undef _OUT 696 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); 697 698 static bool 699 eligible_nhg(const struct nhop_object *nh) 700 { 701 return (nh->nh_flags & NHF_GATEWAY); 702 } 703 704 static int 705 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 706 { 707 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); 708 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); 709 struct weightened_nhop *wn; 710 711 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); 712 if (wn == NULL) 713 return (ENOMEM); 714 715 for (int i = 0; i < count; i++) { 716 struct user_nhop *unhop; 717 unhop = nl_find_base_unhop(ctl, grp[i].id); 718 if (unhop == NULL) { 719 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); 720 free(wn, M_NETLINK); 721 return (ESRCH); 722 } else if (unhop->un_nhop_src == NULL) { 723 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", 724 grp[i].id); 725 free(wn, M_NETLINK); 726 return (ENOTSUP); 727 } else if (!eligible_nhg(unhop->un_nhop_src)) { 728 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", 729 grp[i].id); 730 free(wn, M_NETLINK); 731 return (ENOTSUP); 732 } 733 /* 734 * TODO: consider more rigid eligibility checks: 735 * restrict nexthops with the same gateway 736 */ 737 wn[i].nh = unhop->un_nhop_src; 738 wn[i].weight = grp[i].weight; 739 } 740 unhop->un_nhgrp_src = wn; 741 unhop->un_nhgrp_count = count; 742 return (0); 743 } 744 745 static int 746 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 747 { 748 struct ifaddr *ifa = NULL; 749 struct nhop_object *nh; 750 int error; 751 752 if (!attrs->nha_blackhole) { 753 if (attrs->nha_gw == NULL) { 754 NL_LOG(LOG_DEBUG, "missing NHA_GATEWAY"); 755 return (EINVAL); 756 } 757 if (attrs->nha_oif == NULL) { 758 NL_LOG(LOG_DEBUG, "missing NHA_OIF"); 759 return (EINVAL); 760 } 761 if (ifa == NULL) 762 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); 763 if (ifa == NULL) { 764 NL_LOG(LOG_DEBUG, "Unable to determine default source IP"); 765 return (EINVAL); 766 } 767 } 768 769 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; 770 771 nh = nhop_alloc(RT_DEFAULT_FIB, family); 772 if (nh == NULL) { 773 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); 774 return (ENOMEM); 775 } 776 nhop_set_uidx(nh, attrs->nha_id); 777 778 if (attrs->nha_blackhole) 779 nhop_set_blackhole(nh, NHF_BLACKHOLE); 780 else { 781 nhop_set_gw(nh, attrs->nha_gw, true); 782 nhop_set_transmit_ifp(nh, attrs->nha_oif); 783 nhop_set_src(nh, ifa); 784 } 785 786 error = nhop_get_unlinked(nh); 787 if (error != 0) { 788 NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); 789 return (error); 790 } 791 792 IF_DEBUG_LEVEL(LOG_DEBUG2) { 793 char nhbuf[NHOP_PRINT_BUFSIZE]; 794 nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); 795 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); 796 } 797 798 unhop->un_nhop_src = nh; 799 return (0); 800 } 801 802 static int 803 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 804 struct nl_pstate *npt) 805 { 806 struct user_nhop *unhop; 807 int error; 808 809 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) 810 return (ENOMEM); 811 struct unhop_ctl *ctl = V_un_ctl; 812 813 struct nl_parsed_nhop attrs = {}; 814 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 815 if (error != 0) 816 return (error); 817 818 /* 819 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class 820 * citizen. 821 */ 822 if (attrs.nha_id == 0) { 823 attrs.nha_id = find_spare_uidx(ctl); 824 if (attrs.nha_id == 0) { 825 NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); 826 return (ENOSPC); 827 } 828 } 829 830 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); 831 832 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 833 if (unhop == NULL) { 834 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); 835 return (ENOMEM); 836 } 837 unhop->un_idx = attrs.nha_id; 838 unhop->un_protocol = attrs.nh_protocol; 839 840 if (attrs.nha_group) 841 error = newnhg(ctl, &attrs, unhop); 842 else 843 error = newnhop(&attrs, unhop); 844 845 if (error != 0) { 846 free(unhop, M_NETLINK); 847 return (error); 848 } 849 850 UN_WLOCK(ctl); 851 /* Check if uidx already exists */ 852 struct user_nhop *tmp = NULL; 853 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); 854 if (tmp != NULL) { 855 UN_WUNLOCK(ctl); 856 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); 857 destroy_unhop(unhop); 858 return (EEXIST); 859 } 860 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); 861 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); 862 UN_WUNLOCK(ctl); 863 864 /* Report addition of the next nexhop */ 865 struct netlink_walkargs wa = { 866 .hdr.nlmsg_pid = hdr->nlmsg_pid, 867 .hdr.nlmsg_seq = hdr->nlmsg_seq, 868 .hdr.nlmsg_flags = hdr->nlmsg_flags, 869 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 870 }; 871 872 struct nl_writer nw = {}; 873 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 874 NL_LOG(LOG_DEBUG, "error allocating message writer"); 875 return (ENOMEM); 876 } 877 878 dump_unhop(unhop, &wa.hdr, &nw); 879 nlmsg_flush(&nw); 880 881 consider_resize(ctl, num_buckets_new); 882 883 return (0); 884 } 885 886 static int 887 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 888 struct nl_pstate *npt) 889 { 890 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 891 int error; 892 893 if (__predict_false(ctl == NULL)) 894 return (ESRCH); 895 896 struct nl_parsed_nhop attrs = {}; 897 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 898 if (error != 0) 899 return (error); 900 901 if (attrs.nha_id == 0) { 902 NL_LOG(LOG_DEBUG, "NHA_ID not set"); 903 return (EINVAL); 904 } 905 906 error = delete_unhop(ctl, hdr, attrs.nha_id); 907 908 return (error); 909 } 910 911 static bool 912 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 913 { 914 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) 915 return (false); 916 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) 917 return (false); 918 if (attrs->nha_oif != NULL && 919 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) 920 return (false); 921 922 return (true); 923 } 924 925 static int 926 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 927 struct nl_pstate *npt) 928 { 929 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 930 struct user_nhop *unhop; 931 UN_TRACKER; 932 int error; 933 934 if (__predict_false(ctl == NULL)) 935 return (ESRCH); 936 937 struct nl_parsed_nhop attrs = {}; 938 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 939 if (error != 0) 940 return (error); 941 942 struct netlink_walkargs wa = { 943 .nw = npt->nw, 944 .hdr.nlmsg_pid = hdr->nlmsg_pid, 945 .hdr.nlmsg_seq = hdr->nlmsg_seq, 946 .hdr.nlmsg_flags = hdr->nlmsg_flags, 947 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 948 }; 949 950 if (attrs.nha_id != 0) { 951 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); 952 struct user_nhop key= { .un_idx = attrs.nha_id }; 953 UN_RLOCK(ctl); 954 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 955 UN_RUNLOCK(ctl); 956 957 if (unhop == NULL) 958 return (ESRCH); 959 dump_unhop(unhop, &wa.hdr, wa.nw); 960 return (0); 961 } 962 963 UN_RLOCK(ctl); 964 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 965 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { 966 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) 967 dump_unhop(unhop, &wa.hdr, wa.nw); 968 } CHT_SLIST_FOREACH_END; 969 UN_RUNLOCK(ctl); 970 971 if (wa.error == 0) { 972 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) 973 return (ENOMEM); 974 } 975 return (0); 976 } 977 978 static const struct rtnl_cmd_handler cmd_handlers[] = { 979 { 980 .cmd = NL_RTM_NEWNEXTHOP, 981 .name = "RTM_NEWNEXTHOP", 982 .cb = &rtnl_handle_newnhop, 983 .priv = PRIV_NET_ROUTE, 984 }, 985 { 986 .cmd = NL_RTM_DELNEXTHOP, 987 .name = "RTM_DELNEXTHOP", 988 .cb = &rtnl_handle_delnhop, 989 .priv = PRIV_NET_ROUTE, 990 }, 991 { 992 .cmd = NL_RTM_GETNEXTHOP, 993 .name = "RTM_GETNEXTHOP", 994 .cb = &rtnl_handle_getnhop, 995 } 996 }; 997 998 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser }; 999 1000 void 1001 rtnl_nexthops_init() 1002 { 1003 NL_VERIFY_PARSERS(all_parsers); 1004 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); 1005 } 1006