/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. The function will be called with a
 * family-specific lookup key, scope and the provided argument. This array
 * gets re-created every time a new algo instance gets created. Please take
 * a look at the replace_rtables_family() function for more details.
 */
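
/*
 * Illustrative sketch (not code from this file): with INET, a datapath
 * consumer conceptually performs a lookup roughly as
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	nh = dp->f(dp->arg, key, scopeid);
 *
 * where the f/arg pair was published by replace_rtables_family().
 * The exact lookup callback signature is defined in net/route/fib_algo.h.
 */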

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

#ifdef INET6
bool algo_fixed_inet6 = false;
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
bool algo_fixed_inet = false;
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t	fd_force_eval:1;/* true if forced re-evaluation scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_algo_mask;	/* bitmask for algo data */
	struct callout	fd_callout;	/* rebuild callout */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp	fd_dp;		/* fib datapath data */
	struct vnet	*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	uint32_t	fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
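
/*
 * Note on locking, summarizing the usage below: fib_mtx is a single
 * global mutex serializing all control-plane state - the algo module
 * list, the per-vnet fd and error lists, flm refcounts and the datapath
 * pointer swaps. Datapath lookups themselves take no locks; they rely
 * on the net epoch and the epoch-deferred frees used throughout this file.
 */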

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50

/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    __func__, _fmt, ## __VA_ARGS__);	\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG()
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO()
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int		fe_family;
	uint32_t	fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)	entries;	/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};
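
/*
 * Layout note: the datapath is handed &fdh_idx[0], not the header
 * itself. get_fib_dp_header() below recovers the header from that
 * pointer via __containerof(), so the epoch context and the table
 * count travel with every published array without the datapath
 * having to know about them.
 */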

/*
 * Tries to add new non-transient algorithm error to the list of
 * errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clear all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in current VNET.
 */
static void
fib_error_clear(void)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    func, "%s", buf);
}
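
/*
 * Example use from an algo module (the counter is hypothetical, for
 * illustration only):
 *
 *	fib_printf(LOG_DEBUG, fd, __func__, "inserted %u prefixes", num);
 *
 * which would print "[fib_algo] inet.0 (algoname) funcname: ...".
 */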

/*
 * Outputs the list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif

/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

static void
schedule_callout(struct fib_data *fd, int delay_ms)
{

	callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
	    rebuild_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd)
{

	FIB_MOD_LOCK();
	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild");
		schedule_callout(fd, callout_calc_delay_ms(fd));
	}
	FIB_MOD_UNLOCK();
}

static void
schedule_algo_eval(struct fib_data *fd)
{

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		FIB_MOD_LOCK();
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
		FIB_MOD_UNLOCK();
	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
		/* Reset callout to exec immediately */
		FIB_MOD_LOCK();
		if (!fd->fd_need_rebuild) {
			fd->fd_force_eval = true;
			schedule_callout(fd, 1);
		}
		FIB_MOD_UNLOCK();
	}
}
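
/*
 * Worked example of the backoff above: callout_calc_delay_ms() doubles
 * the delay with every failed rebuild, starting at FIB_CALLOUT_DELAY_MS
 * (50ms): 0 failures -> 50ms, 1 -> 100ms, 5 -> 1.6s, capping at 10 or
 * more failures -> 51.2s between attempts.
 */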

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;
	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild)
		return;

	/* Consider scheduling algorithm re-evaluation */
	schedule_algo_eval(fd);

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within
	 * the current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd);
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		schedule_fd_rebuild(fd);
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd);
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}
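
/*
 * A sketch of the dump sequence driven by sync_algo() below, assuming
 * the rib_walk_ext_internal() contract: the hook callback runs with
 * RIB_WALK_HOOK_PRE, then sync_algo_cb() fires for every rtentry, then
 * the hook runs again with RIB_WALK_HOOK_POST, all under the RIB write
 * lock - which is why sync_algo_end_cb() only finalises the dump at
 * the POST stage.
 */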

/*
 * Dump all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd, "initial dump completed, result: %s",
	    print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assumes @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 *
	 * There are 2 possible scenarios here:
	 * 1) we're running inside a callout when we're deleting ourselves
	 *  due to migration to a newer fd
	 * 2) we're running from rt_table_destroy() and callout is scheduled
	 *  for execution OR is executing
	 *
	 * For (2) we need to wait for the callout termination, as the routing table
	 * will be destroyed after this function returns.
	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
	 */

	if (in_callout)
		callout_stop(&fd->fd_callout);
	else
		callout_drain(&fd->fd_callout);

	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
	    &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		schedule_destroy_fd_instance(fd, in_callout);
	}
	NET_EPOCH_EXIT(et);
}
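
/*
 * Why two passes above: schedule_destroy_fd_instance() takes
 * FIB_MOD_LOCK itself, so calling it while already holding the mutex
 * would recurse on a non-recursive lock. Entries are therefore first
 * unlinked onto a local list under the lock and only destroyed in the
 * second pass, outside of it.
 */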

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 * the sync_algo() stage, preventing new entries from being added
	 * to the list of active algos. Remove all existing entries for
	 * the particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}
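
/*
 * For reference, the instance lifecycle as driven by the setup path
 * below: flm_init_cb() -> optional rib subscription ->
 * flm_dump_rib_item_cb() per route -> flm_dump_end_cb() -> datapath
 * attach; flm_destroy_cb() runs on teardown from destroy_fd_instance()
 * above.
 */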

/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init(&fd->fd_callout, 1);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS)
		return (result);

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
		if (fd->fd_rs == NULL)
			return (FLM_REBUILD);
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS)
		return (result);
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 * that the first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	struct epoch_tracker et;
	enum flm_op_result result;

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		NET_EPOCH_ENTER(et);
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach)
			result = attach_datapath(new_fd);

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}
		NET_EPOCH_EXIT(et);

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		NET_EPOCH_ENTER(et);
		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
		NET_EPOCH_EXIT(et);
	}

	*pfd = new_fd;
	return (result);
}
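
/*
 * Note on the retry loop above: on FLM_REBUILD the freshly-failed
 * instance is passed back in as @prev_fd, so estimate_nhop_scale()
 * can double the nexthop table if the failure was caused by running
 * out of indexes (hit_nhops). With FIB_MAX_TRIES = 32 and a 16-entry
 * initial table, the FIB_MAX_NHOPS (262144 = 16 * 2^14) ceiling is
 * reachable within a single setup attempt.
 */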

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
rebuild_fd_callout(void *_data)
{
	struct fib_data *fd, *fd_new, *fd_tmp;
	struct fib_lookup_module *flm_new;
	struct epoch_tracker et;
	enum flm_op_result result;
	bool need_rebuild = false;

	fd = (struct fib_data *)_data;

	FIB_MOD_LOCK();
	need_rebuild = fd->fd_need_rebuild;
	fd->fd_need_rebuild = false;
	fd->fd_force_eval = false;
	fd->fd_num_changes = 0;
	FIB_MOD_UNLOCK();

	CURVNET_SET(fd->fd_vnet);

	/* First, check if we're still OK to use this algo */
	flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if ((flm_new == NULL) && (!need_rebuild)) {
		/* Keep existing algo, no need to rebuild. */
		CURVNET_RESTORE();
		return;
	}

	if (flm_new == NULL) {
		flm_new = fd->fd_flm;
		fd_tmp = fd;
	} else {
		fd_tmp = NULL;
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
	}
	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (fd_tmp == NULL) {
		/* fd_new represents the new algo */
		fib_unref_algo(flm_new);
	}
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		CURVNET_RESTORE();
		return;
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Schedule removal of the old instance */
	if (fd != NULL) {
		NET_EPOCH_ENTER(et);
		schedule_destroy_fd_instance(fd, true);
		NET_EPOCH_EXIT(et);
	}

	CURVNET_RESTORE();
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}
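
/*
 * From userland the handlers below surface as, e.g. (the algo name is
 * only an example; the available names depend on the modules compiled
 * in and can be read from net.route.algo.inet.algo_list):
 *
 *	# sysctl net.route.algo.inet.algo=radix4
 *
 * Note that a manual switch pins the algo via set_algo_fixed(), so
 * automatic re-evaluation is disabled from that point on.
 */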

#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replaces the per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns FLM_SUCCESS on success.
 */
static enum flm_op_result
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);
	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (FLM_REBUILD);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
	    &old_fdh->fdh_epoch_ctx);

	return (FLM_SUCCESS);
}

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}

/*
 * Make datapath use fib instance @fd
 */
static enum flm_op_result
attach_datapath(struct fib_data *fd)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd));
}
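
/*
 * The publish pattern used in replace_rtables_family() (and in
 * grow_rtables_family() below) is copy-on-write: build a complete new
 * array, issue a release fence so its contents are visible before the
 * pointer swap, flip *pdp, and hand the old array to epoch_call() so
 * it is only freed after every in-flight epoch reader has drained.
 */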

/*
 * Grow datapath pointers array.
 * Called from the sysctl handler when the number of routing tables grows.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Ensure all writes have completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
		    &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from the fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Accessor to get rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}
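
/*
 * Worked example of the ROUTE_MPATH interleaving above: plain nexthops
 * map to even slots and nexthop groups to odd ones, so the two index
 * spaces never collide in nh_idx[]. E.g. nhop idx 1 -> 2, nhop idx
 * 2 -> 4, while nhgrp idx 1 -> 1 and nhgrp idx 2 -> 3.
 */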

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}

static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * The datapath may not have its data structures updated yet, so the old
 * nexthop may still be returned till the end of the current epoch. Delay
 * refcount removal, as we may be removing the last reference, which would
 * trigger nexthop deletion, rendering the returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	if (is_algo_fixed(rh)) {
		FIB_MOD_UNLOCK();
		return (NULL);
	}

	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}
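
/*
 * Example of the switch threshold above: preferences returned by
 * flm_get_pref() fit in a uint8_t, so BEST_DIFF_PERCENT (5 * 256 / 100,
 * i.e. 12) expresses "5%" on that 0..255 scale. A current algo scoring
 * 100 is only displaced by a candidate scoring 113 or higher.
 */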

/*
 * Called when a new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {

		/*
		 * Attach datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}

/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}