1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 #include <sys/types.h> 34 #include <sys/ck.h> 35 #include <sys/epoch.h> 36 #include <sys/kernel.h> 37 #include <sys/malloc.h> 38 #include <sys/rmlock.h> 39 #include <sys/socket.h> 40 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <net/route/nhop.h> 44 #include <net/route/nhop_utils.h> 45 46 #include <net/route/route_ctl.h> 47 #include <net/route/route_var.h> 48 #include <netinet6/scope6_var.h> 49 #include <netlink/netlink.h> 50 #include <netlink/netlink_ctl.h> 51 #include <netlink/netlink_route.h> 52 #include <netlink/route/route_var.h> 53 54 #define DEBUG_MOD_NAME nl_nhop 55 #define DEBUG_MAX_LEVEL LOG_DEBUG3 56 #include <netlink/netlink_debug.h> 57 _DECLARE_DEBUG(LOG_DEBUG3); 58 59 /* 60 * This file contains the logic to maintain kernel nexthops and 61 * nexhop groups based om the data provided by the user. 62 * 63 * Kernel stores (nearly) all of the routing data in the nexthops, 64 * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). 65 * 66 * Netlink API provides higher-level abstraction for the user. Each 67 * user-created nexthop may map to multiple kernel nexthops. 68 * 69 * The following variations require separate kernel nexthop to be 70 * created: 71 * * prefix flags (NHF_HOST, NHF_DEFAULT) 72 * * using IPv6 gateway for IPv4 routes 73 * * different fibnum 74 * 75 * These kernel nexthops have the lifetime bound to the lifetime of 76 * the user_nhop object. They are not collected until user requests 77 * to delete the created user_nhop. 78 * 79 */ 80 struct user_nhop { 81 uint32_t un_idx; /* Userland-provided index */ 82 uint32_t un_fibfam; /* fibnum+af(as highest byte) */ 83 uint8_t un_protocol; /* protocol that install the record */ 84 struct nhop_object *un_nhop; /* "production" nexthop */ 85 struct nhop_object *un_nhop_src; /* nexthop to copy from */ 86 struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ 87 uint32_t un_nhgrp_count; /* number of nexthops */ 88 struct user_nhop *un_next; /* next item in hash chain */ 89 struct user_nhop *un_nextchild; /* master -> children */ 90 struct epoch_context un_epoch_ctx; /* epoch ctl helper */ 91 }; 92 93 /* produce hash value for an object */ 94 #define unhop_hash_obj(_obj) (hash_unhop(_obj)) 95 /* compare two objects */ 96 #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) 97 /* next object accessor */ 98 #define unhop_next(_obj) (_obj)->un_next 99 100 CHT_SLIST_DEFINE(unhop, struct user_nhop); 101 102 struct unhop_ctl { 103 struct unhop_head un_head; 104 struct rmlock un_lock; 105 }; 106 #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") 107 #define UN_TRACKER struct rm_priotracker un_tracker 108 #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) 109 #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) 110 111 #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); 112 #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); 113 114 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; 115 #define V_un_ctl VNET(un_ctl) 116 117 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); 118 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); 119 static unsigned int hash_unhop(const struct user_nhop *obj); 120 121 static void destroy_unhop(struct user_nhop *unhop); 122 static struct nhop_object *clone_unhop(const struct user_nhop *unhop, 123 uint32_t fibnum, int family, int nh_flags); 124 125 static int 126 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) 127 { 128 return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); 129 } 130 131 /* 132 * Hash callback: calculate hash of an object 133 */ 134 static unsigned int 135 hash_unhop(const struct user_nhop *obj) 136 { 137 return (obj->un_idx ^ obj->un_fibfam); 138 } 139 140 #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) 141 142 /* 143 * Factory interface for creating matching kernel nexthops/nexthop groups 144 * 145 * @uidx: userland nexhop index used to create the nexthop 146 * @fibnum: fibnum nexthop will be used in 147 * @family: upper family nexthop will be used in 148 * @nh_flags: desired nexthop prefix flags 149 * @perror: pointer to store error to 150 * 151 * Returns referenced nexthop linked to @fibnum/@family rib on success. 152 */ 153 struct nhop_object * 154 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, 155 int nh_flags, int *perror) 156 { 157 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 158 UN_TRACKER; 159 160 if (__predict_false(ctl == NULL)) 161 return (NULL); 162 163 struct user_nhop key= { 164 .un_idx = uidx, 165 .un_fibfam = fibnum | ((uint32_t)family) << 24, 166 }; 167 struct user_nhop *unhop; 168 169 nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); 170 171 if (__predict_false(family == 0)) 172 return (NULL); 173 174 UN_RLOCK(ctl); 175 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 176 if (unhop != NULL) { 177 struct nhop_object *nh = unhop->un_nhop; 178 UN_RLOCK(ctl); 179 *perror = 0; 180 nhop_ref_any(nh); 181 return (nh); 182 } 183 184 /* 185 * Exact nexthop not found. Search for template nexthop to clone from. 186 */ 187 key.un_fibfam = 0; 188 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 189 if (unhop == NULL) { 190 UN_RUNLOCK(ctl); 191 *perror = ESRCH; 192 return (NULL); 193 } 194 195 UN_RUNLOCK(ctl); 196 197 /* Create entry to insert first */ 198 struct user_nhop *un_new, *un_tmp; 199 un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 200 if (un_new == NULL) { 201 *perror = ENOMEM; 202 return (NULL); 203 } 204 un_new->un_idx = uidx; 205 un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; 206 207 /* Relying on epoch to protect unhop here */ 208 un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); 209 if (un_new->un_nhop == NULL) { 210 free(un_new, M_NETLINK); 211 *perror = ENOMEM; 212 return (NULL); 213 } 214 215 /* Insert back and report */ 216 UN_WLOCK(ctl); 217 218 /* First, find template record once again */ 219 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 220 if (unhop == NULL) { 221 /* Someone deleted the nexthop during the call */ 222 UN_WUNLOCK(ctl); 223 *perror = ESRCH; 224 destroy_unhop(un_new); 225 return (NULL); 226 } 227 228 /* Second, check the direct match */ 229 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); 230 struct nhop_object *nh; 231 if (un_tmp != NULL) { 232 /* Another thread already created the desired nextop, use it */ 233 nh = un_tmp->un_nhop; 234 } else { 235 /* Finally, insert the new nexthop and link it to the primary */ 236 nh = un_new->un_nhop; 237 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); 238 un_new->un_nextchild = unhop->un_nextchild; 239 unhop->un_nextchild = un_new; 240 un_new = NULL; 241 NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); 242 } 243 244 UN_WUNLOCK(ctl); 245 246 if (un_new != NULL) 247 destroy_unhop(un_new); 248 249 *perror = 0; 250 nhop_ref_any(nh); 251 return (nh); 252 } 253 254 static struct user_nhop * 255 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) 256 { 257 struct user_nhop key= { .un_idx = uidx }; 258 struct user_nhop *unhop = NULL; 259 UN_TRACKER; 260 261 UN_RLOCK(ctl); 262 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 263 UN_RUNLOCK(ctl); 264 265 return (unhop); 266 } 267 268 #define MAX_STACK_NHOPS 4 269 static struct nhop_object * 270 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) 271 { 272 #ifdef ROUTE_MPATH 273 const struct weightened_nhop *wn; 274 struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; 275 uint32_t num_nhops; 276 #endif 277 struct nhop_object *nh = NULL; 278 int error; 279 280 if (unhop->un_nhop_src != NULL) { 281 IF_DEBUG_LEVEL(LOG_DEBUG2) { 282 char nhbuf[NHOP_PRINT_BUFSIZE]; 283 nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); 284 FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, 285 "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, 286 family, nh_flags); 287 } 288 struct nhop_object *nh; 289 nh = nhop_alloc(fibnum, AF_UNSPEC); 290 if (nh == NULL) 291 return (NULL); 292 nhop_copy(nh, unhop->un_nhop_src); 293 /* Check that nexthop gateway is compatible with the new family */ 294 if (!nhop_set_upper_family(nh, family)) { 295 nhop_free(nh); 296 return (NULL); 297 } 298 nhop_set_uidx(nh, unhop->un_idx); 299 nhop_set_pxtype_flag(nh, nh_flags); 300 return (nhop_get_nhop(nh, &error)); 301 } 302 #ifdef ROUTE_MPATH 303 wn = unhop->un_nhgrp_src; 304 num_nhops = unhop->un_nhgrp_count; 305 306 if (num_nhops > MAX_STACK_NHOPS) { 307 wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); 308 if (wn_new == NULL) 309 return (NULL); 310 } else 311 wn_new = wn_base; 312 313 for (int i = 0; i < num_nhops; i++) { 314 uint32_t uidx = nhop_get_uidx(wn[i].nh); 315 MPASS(uidx != 0); 316 wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); 317 if (error != 0) 318 break; 319 wn_new[i].weight = wn[i].weight; 320 } 321 322 if (error == 0) { 323 struct rib_head *rh = nhop_get_rh(wn_new[0].nh); 324 struct nhgrp_object *nhg; 325 326 error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); 327 nh = (struct nhop_object *)nhg; 328 } 329 330 if (wn_new != wn_base) 331 free(wn_new, M_TEMP); 332 #endif 333 return (nh); 334 } 335 336 static void 337 destroy_unhop(struct user_nhop *unhop) 338 { 339 if (unhop->un_nhop != NULL) 340 nhop_free_any(unhop->un_nhop); 341 if (unhop->un_nhop_src != NULL) 342 nhop_free_any(unhop->un_nhop_src); 343 free(unhop, M_NETLINK); 344 } 345 346 static void 347 destroy_unhop_epoch(epoch_context_t ctx) 348 { 349 struct user_nhop *unhop; 350 351 unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); 352 353 destroy_unhop(unhop); 354 } 355 356 static uint32_t 357 find_spare_uidx(struct unhop_ctl *ctl) 358 { 359 struct user_nhop *unhop, key = {}; 360 uint32_t uidx = 0; 361 UN_TRACKER; 362 363 UN_RLOCK(ctl); 364 /* This should return spare uid with 75% of 65k used in ~99/100 cases */ 365 for (int i = 0; i < 16; i++) { 366 key.un_idx = (arc4random() % 65536) + 65536 * 4; 367 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 368 if (unhop == NULL) { 369 uidx = key.un_idx; 370 break; 371 } 372 } 373 UN_RUNLOCK(ctl); 374 375 return (uidx); 376 } 377 378 379 /* 380 * Actual netlink code 381 */ 382 struct netlink_walkargs { 383 struct nl_writer *nw; 384 struct nlmsghdr hdr; 385 struct nlpcb *so; 386 int family; 387 int error; 388 int count; 389 int dumped; 390 }; 391 #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem 392 393 static bool 394 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, 395 struct nl_writer *nw) 396 { 397 398 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 399 goto enomem; 400 401 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 402 nhm->nh_family = AF_UNSPEC; 403 nhm->nh_scope = 0; 404 nhm->nh_protocol = unhop->un_protocol; 405 nhm->nh_flags = 0; 406 407 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 408 nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); 409 410 struct weightened_nhop *wn = unhop->un_nhgrp_src; 411 uint32_t num_nhops = unhop->un_nhgrp_count; 412 /* TODO: a better API? */ 413 int nla_len = sizeof(struct nlattr); 414 nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); 415 struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); 416 if (nla == NULL) 417 goto enomem; 418 nla->nla_type = NHA_GROUP; 419 nla->nla_len = nla_len; 420 for (int i = 0; i < num_nhops; i++) { 421 struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; 422 grp->id = nhop_get_uidx(wn[i].nh); 423 grp->weight = wn[i].weight; 424 grp->resvd1 = 0; 425 grp->resvd2 = 0; 426 } 427 428 if (nlmsg_end(nw)) 429 return (true); 430 enomem: 431 NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); 432 nlmsg_abort(nw); 433 return (false); 434 } 435 436 static bool 437 dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 438 struct nl_writer *nw) 439 { 440 struct nhop_object *nh = unhop->un_nhop_src; 441 442 if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) 443 goto enomem; 444 445 struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); 446 ENOMEM_IF_NULL(nhm); 447 nhm->nh_family = nhop_get_neigh_family(nh); 448 nhm->nh_scope = 0; // XXX: what's that? 449 nhm->nh_protocol = unhop->un_protocol; 450 nhm->nh_flags = 0; 451 452 nlattr_add_u32(nw, NHA_ID, unhop->un_idx); 453 if (nh->nh_flags & NHF_BLACKHOLE) { 454 nlattr_add_flag(nw, NHA_BLACKHOLE); 455 goto done; 456 } 457 nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); 458 459 switch (nh->gw_sa.sa_family) { 460 #ifdef INET 461 case AF_INET: 462 nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); 463 break; 464 #endif 465 #ifdef INET6 466 case AF_INET6: 467 { 468 struct in6_addr addr = nh->gw6_sa.sin6_addr; 469 in6_clearscope(&addr); 470 nlattr_add(nw, NHA_GATEWAY, 16, &addr); 471 break; 472 } 473 #endif 474 } 475 476 done: 477 if (nlmsg_end(nw)) 478 return (true); 479 enomem: 480 nlmsg_abort(nw); 481 return (false); 482 } 483 484 static void 485 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, 486 struct nl_writer *nw) 487 { 488 if (unhop->un_nhop_src != NULL) 489 dump_nhop(unhop, hdr, nw); 490 else 491 dump_nhgrp(unhop, hdr, nw); 492 } 493 494 static int 495 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) 496 { 497 struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; 498 499 struct user_nhop key = { .un_idx = uidx }; 500 501 UN_WLOCK(ctl); 502 503 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); 504 505 if (unhop_base != NULL) { 506 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); 507 IF_DEBUG_LEVEL(LOG_DEBUG2) { 508 char nhbuf[NHOP_PRINT_BUFSIZE]; 509 nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); 510 FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, 511 "removed base nhop %u: %s", uidx, nhbuf); 512 } 513 /* Unlink all child nexhops as well, keeping the chain intact */ 514 unhop_chain = unhop_base->un_nextchild; 515 while (unhop_chain != NULL) { 516 CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, 517 unhop_ret); 518 MPASS(unhop_chain == unhop_ret); 519 IF_DEBUG_LEVEL(LOG_DEBUG3) { 520 char nhbuf[NHOP_PRINT_BUFSIZE]; 521 nhop_print_buf_any(unhop_chain->un_nhop, 522 nhbuf, sizeof(nhbuf)); 523 FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, 524 "removed child nhop %u: %s", uidx, nhbuf); 525 } 526 unhop_chain = unhop_chain->un_nextchild; 527 } 528 } 529 530 UN_WUNLOCK(ctl); 531 532 if (unhop_base == NULL) { 533 NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); 534 return (ENOENT); 535 } 536 537 /* Report nexthop deletion */ 538 struct netlink_walkargs wa = { 539 .hdr.nlmsg_pid = hdr->nlmsg_pid, 540 .hdr.nlmsg_seq = hdr->nlmsg_seq, 541 .hdr.nlmsg_flags = hdr->nlmsg_flags, 542 .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, 543 }; 544 545 struct nl_writer nw = {}; 546 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 547 NL_LOG(LOG_DEBUG, "error allocating message writer"); 548 return (ENOMEM); 549 } 550 551 dump_unhop(unhop_base, &wa.hdr, &nw); 552 nlmsg_flush(&nw); 553 554 while (unhop_base != NULL) { 555 unhop_chain = unhop_base->un_nextchild; 556 NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); 557 unhop_base = unhop_chain; 558 } 559 560 return (0); 561 } 562 563 static void 564 consider_resize(struct unhop_ctl *ctl, uint32_t new_size) 565 { 566 void *new_ptr = NULL; 567 size_t alloc_size; 568 569 if (new_size == 0) 570 return; 571 572 if (new_size != 0) { 573 alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); 574 new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 575 if (new_ptr == NULL) 576 return; 577 } 578 579 NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); 580 UN_WLOCK(ctl); 581 if (new_ptr != NULL) { 582 CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); 583 } 584 UN_WUNLOCK(ctl); 585 586 587 if (new_ptr != NULL) 588 free(new_ptr, M_NETLINK); 589 } 590 591 static bool __noinline 592 vnet_init_unhops(void) 593 { 594 uint32_t num_buckets = 16; 595 size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); 596 597 struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, 598 M_NOWAIT | M_ZERO); 599 if (ctl == NULL) 600 return (false); 601 602 void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); 603 if (ptr == NULL) { 604 free(ctl, M_NETLINK); 605 return (false); 606 } 607 CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); 608 UN_LOCK_INIT(ctl); 609 610 if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { 611 free(ptr, M_NETLINK); 612 free(ctl, M_NETLINK); 613 } 614 615 if (atomic_load_ptr(&V_un_ctl) == NULL) 616 return (false); 617 618 NL_LOG(LOG_NOTICE, "UNHOPS init done"); 619 620 return (true); 621 } 622 623 static void 624 vnet_destroy_unhops(const void *unused __unused) 625 { 626 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 627 struct user_nhop *unhop, *tmp; 628 629 if (ctl == NULL) 630 return; 631 V_un_ctl = NULL; 632 633 /* Wait till all unhop users finish their reads */ 634 NET_EPOCH_WAIT(); 635 636 UN_WLOCK(ctl); 637 CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { 638 destroy_unhop(unhop); 639 } CHT_SLIST_FOREACH_SAFE_END; 640 UN_WUNLOCK(ctl); 641 642 free(ctl->un_head.ptr, M_NETLINK); 643 free(ctl, M_NETLINK); 644 } 645 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, 646 vnet_destroy_unhops, NULL); 647 648 static int 649 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) 650 { 651 int error = 0; 652 653 /* Verify attribute correctness */ 654 struct nexthop_grp *grp = NLA_DATA(nla); 655 int data_len = NLA_DATA_LEN(nla); 656 657 int count = data_len / sizeof(*grp); 658 if (count == 0 || (count * sizeof(*grp) != data_len)) { 659 NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); 660 return (EINVAL); 661 } 662 663 *((struct nlattr **)target) = nla; 664 return (error); 665 } 666 667 struct nl_parsed_nhop { 668 uint32_t nha_id; 669 uint8_t nha_blackhole; 670 uint8_t nha_groups; 671 struct ifnet *nha_oif; 672 struct sockaddr *nha_gw; 673 struct nlattr *nha_group; 674 uint8_t nh_family; 675 uint8_t nh_protocol; 676 }; 677 678 #define _IN(_field) offsetof(struct nhmsg, _field) 679 #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) 680 static const struct nlfield_parser nlf_p_nh[] = { 681 { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, 682 { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, 683 }; 684 685 static const struct nlattr_parser nla_p_nh[] = { 686 { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, 687 { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, 688 { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, 689 { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, 690 { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, 691 { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, 692 }; 693 #undef _IN 694 #undef _OUT 695 NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); 696 697 static bool 698 eligible_nhg(const struct nhop_object *nh) 699 { 700 return (nh->nh_flags & NHF_GATEWAY); 701 } 702 703 static int 704 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 705 { 706 struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); 707 int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); 708 struct weightened_nhop *wn; 709 710 wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); 711 if (wn == NULL) 712 return (ENOMEM); 713 714 for (int i = 0; i < count; i++) { 715 struct user_nhop *unhop; 716 unhop = nl_find_base_unhop(ctl, grp[i].id); 717 if (unhop == NULL) { 718 NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); 719 free(wn, M_NETLINK); 720 return (ESRCH); 721 } else if (unhop->un_nhop_src == NULL) { 722 NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", 723 grp[i].id); 724 free(wn, M_NETLINK); 725 return (ENOTSUP); 726 } else if (!eligible_nhg(unhop->un_nhop_src)) { 727 NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", 728 grp[i].id); 729 free(wn, M_NETLINK); 730 return (ENOTSUP); 731 } 732 /* 733 * TODO: consider more rigid eligibility checks: 734 * restrict nexthops with the same gateway 735 */ 736 wn[i].nh = unhop->un_nhop_src; 737 wn[i].weight = grp[i].weight; 738 } 739 unhop->un_nhgrp_src = wn; 740 unhop->un_nhgrp_count = count; 741 return (0); 742 } 743 744 /* 745 * Sets nexthop @nh gateway specified by @gw. 746 * If gateway is IPv6 link-local, alters @gw to include scopeid equal to 747 * @ifp ifindex. 748 * Returns 0 on success or errno. 749 */ 750 int 751 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp, 752 struct nl_pstate *npt) 753 { 754 #ifdef INET6 755 if (gw->sa_family == AF_INET6) { 756 struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; 757 if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { 758 if (ifp == NULL) { 759 NLMSG_REPORT_ERR_MSG(npt, "interface not set"); 760 return (EINVAL); 761 } 762 in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index); 763 } 764 } 765 #endif 766 nhop_set_gw(nh, gw, true); 767 return (0); 768 } 769 770 static int 771 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) 772 { 773 struct ifaddr *ifa = NULL; 774 struct nhop_object *nh; 775 int error; 776 777 if (!attrs->nha_blackhole) { 778 if (attrs->nha_gw == NULL) { 779 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); 780 return (EINVAL); 781 } 782 if (attrs->nha_oif == NULL) { 783 NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); 784 return (EINVAL); 785 } 786 if (ifa == NULL) 787 ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); 788 if (ifa == NULL) { 789 NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); 790 return (EINVAL); 791 } 792 } 793 794 int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; 795 796 nh = nhop_alloc(RT_DEFAULT_FIB, family); 797 if (nh == NULL) { 798 NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); 799 return (ENOMEM); 800 } 801 nhop_set_uidx(nh, attrs->nha_id); 802 803 if (attrs->nha_blackhole) 804 nhop_set_blackhole(nh, NHF_BLACKHOLE); 805 else { 806 error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); 807 if (error != 0) { 808 nhop_free(nh); 809 return (error); 810 } 811 nhop_set_transmit_ifp(nh, attrs->nha_oif); 812 nhop_set_src(nh, ifa); 813 } 814 815 error = nhop_get_unlinked(nh); 816 if (error != 0) { 817 NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); 818 return (error); 819 } 820 821 IF_DEBUG_LEVEL(LOG_DEBUG2) { 822 char nhbuf[NHOP_PRINT_BUFSIZE]; 823 nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); 824 NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); 825 } 826 827 unhop->un_nhop_src = nh; 828 return (0); 829 } 830 831 static int 832 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 833 struct nl_pstate *npt) 834 { 835 struct user_nhop *unhop; 836 int error; 837 838 if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) 839 return (ENOMEM); 840 struct unhop_ctl *ctl = V_un_ctl; 841 842 struct nl_parsed_nhop attrs = {}; 843 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 844 if (error != 0) 845 return (error); 846 847 /* 848 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class 849 * citizen. 850 */ 851 if (attrs.nha_id == 0) { 852 attrs.nha_id = find_spare_uidx(ctl); 853 if (attrs.nha_id == 0) { 854 NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); 855 return (ENOSPC); 856 } 857 } 858 859 NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); 860 861 unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); 862 if (unhop == NULL) { 863 NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); 864 return (ENOMEM); 865 } 866 unhop->un_idx = attrs.nha_id; 867 unhop->un_protocol = attrs.nh_protocol; 868 869 if (attrs.nha_group) 870 error = newnhg(ctl, &attrs, unhop); 871 else 872 error = newnhop(&attrs, unhop, npt); 873 874 if (error != 0) { 875 free(unhop, M_NETLINK); 876 return (error); 877 } 878 879 UN_WLOCK(ctl); 880 /* Check if uidx already exists */ 881 struct user_nhop *tmp = NULL; 882 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); 883 if (tmp != NULL) { 884 UN_WUNLOCK(ctl); 885 NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); 886 destroy_unhop(unhop); 887 return (EEXIST); 888 } 889 CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); 890 uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); 891 UN_WUNLOCK(ctl); 892 893 /* Report addition of the next nexhop */ 894 struct netlink_walkargs wa = { 895 .hdr.nlmsg_pid = hdr->nlmsg_pid, 896 .hdr.nlmsg_seq = hdr->nlmsg_seq, 897 .hdr.nlmsg_flags = hdr->nlmsg_flags, 898 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 899 }; 900 901 struct nl_writer nw = {}; 902 if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { 903 NL_LOG(LOG_DEBUG, "error allocating message writer"); 904 return (ENOMEM); 905 } 906 907 dump_unhop(unhop, &wa.hdr, &nw); 908 nlmsg_flush(&nw); 909 910 consider_resize(ctl, num_buckets_new); 911 912 return (0); 913 } 914 915 static int 916 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 917 struct nl_pstate *npt) 918 { 919 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 920 int error; 921 922 if (__predict_false(ctl == NULL)) 923 return (ESRCH); 924 925 struct nl_parsed_nhop attrs = {}; 926 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 927 if (error != 0) 928 return (error); 929 930 if (attrs.nha_id == 0) { 931 NL_LOG(LOG_DEBUG, "NHA_ID not set"); 932 return (EINVAL); 933 } 934 935 error = delete_unhop(ctl, hdr, attrs.nha_id); 936 937 return (error); 938 } 939 940 static bool 941 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) 942 { 943 if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) 944 return (false); 945 if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) 946 return (false); 947 if (attrs->nha_oif != NULL && 948 (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) 949 return (false); 950 951 return (true); 952 } 953 954 static int 955 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, 956 struct nl_pstate *npt) 957 { 958 struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); 959 struct user_nhop *unhop; 960 UN_TRACKER; 961 int error; 962 963 if (__predict_false(ctl == NULL)) 964 return (ESRCH); 965 966 struct nl_parsed_nhop attrs = {}; 967 error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); 968 if (error != 0) 969 return (error); 970 971 struct netlink_walkargs wa = { 972 .nw = npt->nw, 973 .hdr.nlmsg_pid = hdr->nlmsg_pid, 974 .hdr.nlmsg_seq = hdr->nlmsg_seq, 975 .hdr.nlmsg_flags = hdr->nlmsg_flags, 976 .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, 977 }; 978 979 if (attrs.nha_id != 0) { 980 NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); 981 struct user_nhop key= { .un_idx = attrs.nha_id }; 982 UN_RLOCK(ctl); 983 CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); 984 UN_RUNLOCK(ctl); 985 986 if (unhop == NULL) 987 return (ESRCH); 988 dump_unhop(unhop, &wa.hdr, wa.nw); 989 return (0); 990 } 991 992 UN_RLOCK(ctl); 993 wa.hdr.nlmsg_flags |= NLM_F_MULTI; 994 CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { 995 if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) 996 dump_unhop(unhop, &wa.hdr, wa.nw); 997 } CHT_SLIST_FOREACH_END; 998 UN_RUNLOCK(ctl); 999 1000 if (wa.error == 0) { 1001 if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) 1002 return (ENOMEM); 1003 } 1004 return (0); 1005 } 1006 1007 static const struct rtnl_cmd_handler cmd_handlers[] = { 1008 { 1009 .cmd = NL_RTM_NEWNEXTHOP, 1010 .name = "RTM_NEWNEXTHOP", 1011 .cb = &rtnl_handle_newnhop, 1012 .priv = PRIV_NET_ROUTE, 1013 }, 1014 { 1015 .cmd = NL_RTM_DELNEXTHOP, 1016 .name = "RTM_DELNEXTHOP", 1017 .cb = &rtnl_handle_delnhop, 1018 .priv = PRIV_NET_ROUTE, 1019 }, 1020 { 1021 .cmd = NL_RTM_GETNEXTHOP, 1022 .name = "RTM_GETNEXTHOP", 1023 .cb = &rtnl_handle_getnhop, 1024 } 1025 }; 1026 1027 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser }; 1028 1029 void 1030 rtnl_nexthops_init(void) 1031 { 1032 NL_VERIFY_PARSERS(all_parsers); 1033 rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); 1034 } 1035