/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup module - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The framework provides the following features:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *    and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. The callback is invoked with a family-specific
 * lookup key, the scope and the stored argument.
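 *
 * As an illustrative sketch (not part of this file; the exact prototypes
 * live in net/route/fib_algo.h and the consumer shown here is hypothetical),
 * an IPv4 dataplane lookup dispatches roughly as follows:
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct flm_lookup_key key = { .addr4 = dst };
 *	struct nhop_object *nh;
 *
 *	NET_EPOCH_ASSERT();
 *	nh = dp->f(dp->arg, key, scopeid);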
 * The array is re-created every time a new algo instance gets created.
 * Please take a look at the replace_rtables_family() function for more
 * details.
 *
 */

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t	fd_force_eval:1;/* true if forced re-evaluation scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_algo_mask;	/* bitmask for algo data */
	struct callout	fd_callout;	/* rebuild callout */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp	fd_dp;		/* fib datapath data */
	struct vnet	*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	uint32_t	fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this
percent better than the current to switch */ 178 #define BEST_DIFF_PERCENT (5 * 256 / 100) 179 /* Schedule algo re-evaluation X seconds after a change */ 180 #define ALGO_EVAL_DELAY_MS 30000 181 /* Force algo re-evaluation after X changes */ 182 #define ALGO_EVAL_NUM_ROUTES 100 183 /* Try to setup algorithm X times */ 184 #define FIB_MAX_TRIES 32 185 /* Max amount of supported nexthops */ 186 #define FIB_MAX_NHOPS 262144 187 #define FIB_CALLOUT_DELAY_MS 50 188 189 /* Debug */ 190 static int flm_debug_level = LOG_NOTICE; 191 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, 192 &flm_debug_level, 0, "debuglevel"); 193 #define FLM_MAX_DEBUG_LEVEL LOG_DEBUG 194 195 #define _PASS_MSG(_l) (flm_debug_level >= (_l)) 196 #define ALGO_PRINTF(_fmt, ...) printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__) 197 #define _ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \ 198 printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\ 199 print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__) 200 #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ 201 printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) 202 #define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ 203 _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ 204 } 205 #define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) 206 #define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ 207 _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ 208 __func__, _fmt, ## __VA_ARGS__); \ 209 } 210 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG 211 #define FD_PRINTF_LOG_DEBUG _FD_PRINTF 212 #else 213 #define FD_PRINTF_LOG_DEBUG() 214 #endif 215 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO 216 #define FD_PRINTF_LOG_INFO _FD_PRINTF 217 #else 218 #define FD_PRINTF_LOG_INFO() 219 #endif 220 #define FD_PRINTF_LOG_NOTICE _FD_PRINTF 221 #define FD_PRINTF_LOG_ERR _FD_PRINTF 222 #define FD_PRINTF_LOG_WARNING _FD_PRINTF 223 224 225 /* List of all registered lookup algorithms */ 226 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); 227 228 /* List of all fib lookup instances in the vnet */ 229 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); 230 #define V_fib_data_list VNET(fib_data_list) 231 232 /* Datastructure for storing non-transient fib lookup module failures */ 233 struct fib_error { 234 int fe_family; 235 uint32_t fe_fibnum; /* failed rtable */ 236 struct fib_lookup_module *fe_flm; /* failed module */ 237 TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ 238 }; 239 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); 240 #define V_fib_error_list VNET(fib_error_list) 241 242 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */ 243 struct fib_dp_header { 244 struct epoch_context fdh_epoch_ctx; 245 uint32_t fdh_num_tables; 246 struct fib_dp fdh_idx[0]; 247 }; 248 249 /* 250 * Tries to add new non-transient algorithm error to the list of 251 * errors. 252 * Returns true on success. 
253 */ 254 static bool 255 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) 256 { 257 struct fib_error *fe; 258 259 fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); 260 if (fe == NULL) 261 return (false); 262 fe->fe_flm = flm; 263 fe->fe_family = flm->flm_family; 264 fe->fe_fibnum = fibnum; 265 266 FIB_MOD_LOCK(); 267 /* Avoid duplicates by checking if error already exists first */ 268 if (flm_error_check(flm, fibnum)) { 269 FIB_MOD_UNLOCK(); 270 free(fe, M_TEMP); 271 return (true); 272 } 273 TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); 274 FIB_MOD_UNLOCK(); 275 276 return (true); 277 } 278 279 /* 280 * True if non-transient error has been registered for @flm in @fibnum. 281 */ 282 static bool 283 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) 284 { 285 const struct fib_error *fe; 286 287 TAILQ_FOREACH(fe, &V_fib_error_list, entries) { 288 if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) 289 return (true); 290 } 291 292 return (false); 293 } 294 295 /* 296 * Clear all errors of algo specified by @flm. 297 */ 298 static void 299 fib_error_clear_flm(struct fib_lookup_module *flm) 300 { 301 struct fib_error *fe, *fe_tmp; 302 303 FIB_MOD_LOCK_ASSERT(); 304 305 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 306 if (fe->fe_flm == flm) { 307 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 308 free(fe, M_TEMP); 309 } 310 } 311 } 312 313 /* 314 * Clears all errors in current VNET. 315 */ 316 static void 317 fib_error_clear() 318 { 319 struct fib_error *fe, *fe_tmp; 320 321 FIB_MOD_LOCK_ASSERT(); 322 323 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 324 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 325 free(fe, M_TEMP); 326 } 327 } 328 329 static const char * 330 print_op_result(enum flm_op_result result) 331 { 332 switch (result) { 333 case FLM_SUCCESS: 334 return "success"; 335 case FLM_REBUILD: 336 return "rebuild"; 337 case FLM_ERROR: 338 return "error"; 339 } 340 341 return "unknown"; 342 } 343 344 static const char * 345 print_family(int family) 346 { 347 348 if (family == AF_INET) 349 return ("inet"); 350 else if (family == AF_INET6) 351 return ("inet6"); 352 else 353 return ("unknown"); 354 } 355 356 /* 357 * Debug function used by lookup algorithms. 358 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " 359 */ 360 void 361 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) 362 { 363 char buf[128]; 364 va_list ap; 365 366 if (level > flm_debug_level) 367 return; 368 369 va_start(ap, fmt); 370 vsnprintf(buf, sizeof(buf), fmt, ap); 371 va_end(ap); 372 373 _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, 374 func, "%s", buf); 375 } 376 377 /* 378 * Outputs list of algorithms supported by the provided address family. 
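 *
 * For example, the per-family list is exposed via sysctl; the module names
 * shown below are only illustrative, since the actual set depends on which
 * lookup modules are compiled in or loaded:
 *
 *	# sysctl net.route.algo.inet.algo_list
 *	net.route.algo.inet.algo_list: bsearch4, radix4_lockless, radix4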
379 */ 380 static int 381 print_algos_sysctl(struct sysctl_req *req, int family) 382 { 383 struct fib_lookup_module *flm; 384 struct sbuf sbuf; 385 int error, count = 0; 386 387 error = sysctl_wire_old_buffer(req, 0); 388 if (error == 0) { 389 sbuf_new_for_sysctl(&sbuf, NULL, 512, req); 390 TAILQ_FOREACH(flm, &all_algo_list, entries) { 391 if (flm->flm_family == family) { 392 if (count++ > 0) 393 sbuf_cat(&sbuf, ", "); 394 sbuf_cat(&sbuf, flm->flm_name); 395 } 396 } 397 error = sbuf_finish(&sbuf); 398 sbuf_delete(&sbuf); 399 } 400 return (error); 401 } 402 403 #ifdef INET6 404 static int 405 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) 406 { 407 408 return (print_algos_sysctl(req, AF_INET6)); 409 } 410 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, 411 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 412 print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); 413 #endif 414 415 #ifdef INET 416 static int 417 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) 418 { 419 420 return (print_algos_sysctl(req, AF_INET)); 421 } 422 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, 423 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 424 print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); 425 #endif 426 427 /* 428 * Calculate delay between repeated failures. 429 * Returns current delay in milliseconds. 430 */ 431 static uint32_t 432 callout_calc_delay_ms(struct fib_data *fd) 433 { 434 uint32_t shift; 435 436 if (fd->fd_failed_rebuilds > 10) 437 shift = 10; 438 else 439 shift = fd->fd_failed_rebuilds; 440 441 return ((1 << shift) * FIB_CALLOUT_DELAY_MS); 442 } 443 444 static void 445 schedule_callout(struct fib_data *fd, int delay_ms) 446 { 447 448 callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, 449 rebuild_fd_callout, fd, 0); 450 } 451 452 static void 453 schedule_fd_rebuild(struct fib_data *fd) 454 { 455 456 FIB_MOD_LOCK(); 457 if (!fd->fd_need_rebuild) { 458 fd->fd_need_rebuild = true; 459 460 /* 461 * Potentially re-schedules pending callout 462 * initiated by schedule_algo_eval. 463 */ 464 FD_PRINTF(LOG_INFO, fd, "Scheduling rebuilt"); 465 schedule_callout(fd, callout_calc_delay_ms(fd)); 466 } 467 FIB_MOD_UNLOCK(); 468 } 469 470 static void 471 schedule_algo_eval(struct fib_data *fd) 472 { 473 474 if (fd->fd_num_changes++ == 0) { 475 /* Start callout to consider switch */ 476 FIB_MOD_LOCK(); 477 if (!callout_pending(&fd->fd_callout)) 478 schedule_callout(fd, ALGO_EVAL_DELAY_MS); 479 FIB_MOD_UNLOCK(); 480 } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { 481 /* Reset callout to exec immediately */ 482 FIB_MOD_LOCK(); 483 if (!fd->fd_need_rebuild) { 484 fd->fd_force_eval = true; 485 schedule_callout(fd, 1); 486 } 487 FIB_MOD_UNLOCK(); 488 } 489 } 490 491 /* 492 * Rib subscription handler. Checks if the algorithm is ready to 493 * receive updates, handles nexthop refcounting and passes change 494 * data to the algorithm callback. 495 */ 496 static void 497 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, 498 void *_data) 499 { 500 struct fib_data *fd = (struct fib_data *)_data; 501 enum flm_op_result result; 502 503 RIB_WLOCK_ASSERT(rnh); 504 505 /* 506 * There is a small gap between subscribing for route changes 507 * and initiating rtable dump. Avoid receiving route changes 508 * prior to finishing rtable dump by checking `init_done`. 509 */ 510 if (!fd->init_done) 511 return; 512 /* 513 * If algo requested rebuild, stop sending updates by default. 
514 * This simplifies nexthop refcount handling logic. 515 */ 516 if (fd->fd_need_rebuild) 517 return; 518 519 /* Consider scheduling algorithm re-evaluation */ 520 schedule_algo_eval(fd); 521 522 /* 523 * Maintain guarantee that every nexthop returned by the dataplane 524 * lookup has > 0 refcount, so can be safely referenced within current 525 * epoch. 526 */ 527 if (rc->rc_nh_new != NULL) { 528 if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { 529 /* ran out of indexes */ 530 schedule_fd_rebuild(fd); 531 return; 532 } 533 } 534 535 result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); 536 537 switch (result) { 538 case FLM_SUCCESS: 539 /* Unref old nexthop on success */ 540 if (rc->rc_nh_old != NULL) 541 fib_unref_nhop(fd, rc->rc_nh_old); 542 break; 543 case FLM_REBUILD: 544 545 /* 546 * Algo is not able to apply the update. 547 * Schedule algo rebuild. 548 */ 549 schedule_fd_rebuild(fd); 550 break; 551 case FLM_ERROR: 552 553 /* 554 * Algo reported a non-recoverable error. 555 * Record the error and schedule rebuild, which will 556 * trigger best algo selection. 557 */ 558 FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); 559 if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) 560 FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); 561 schedule_fd_rebuild(fd); 562 } 563 } 564 565 static void 566 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) 567 { 568 569 if (old_fd == NULL) { 570 // TODO: read from rtable 571 fd->number_nhops = 16; 572 return; 573 } 574 575 if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) 576 fd->number_nhops = 2 * old_fd->number_nhops; 577 else 578 fd->number_nhops = old_fd->number_nhops; 579 } 580 581 struct walk_cbdata { 582 struct fib_data *fd; 583 flm_dump_t *func; 584 enum flm_op_result result; 585 }; 586 587 /* 588 * Handler called after all rtenties have been dumped. 589 * Performs post-dump framework checks and calls 590 * algo:flm_dump_end_cb(). 591 * 592 * Updates walk_cbdata result. 593 */ 594 static void 595 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) 596 { 597 struct walk_cbdata *w = (struct walk_cbdata *)_data; 598 struct fib_data *fd = w->fd; 599 600 RIB_WLOCK_ASSERT(w->fd->fd_rh); 601 602 if (rnh->rib_dying) { 603 w->result = FLM_ERROR; 604 return; 605 } 606 607 if (fd->hit_nhops) { 608 FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", 609 fd->nh_ref_table->count); 610 if (w->result == FLM_SUCCESS) 611 w->result = FLM_REBUILD; 612 return; 613 } 614 615 if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) 616 return; 617 618 /* Post-dump hook, dump successful */ 619 w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); 620 621 if (w->result == FLM_SUCCESS) { 622 /* Mark init as done to allow routing updates */ 623 fd->init_done = 1; 624 } 625 } 626 627 /* 628 * Callback for each entry in rib. 629 * Calls algo:flm_dump_rib_item_cb func as a part of initial 630 * route table synchronisation. 631 */ 632 static int 633 sync_algo_cb(struct rtentry *rt, void *_data) 634 { 635 struct walk_cbdata *w = (struct walk_cbdata *)_data; 636 637 RIB_WLOCK_ASSERT(w->fd->fd_rh); 638 639 if (w->result == FLM_SUCCESS && w->func) { 640 641 /* 642 * Reference nexthops to maintain guarantee that 643 * each nexthop returned by datapath has > 0 references 644 * and can be safely referenced within current epoch. 
645 */ 646 struct nhop_object *nh = rt_get_raw_nhop(rt); 647 if (fib_ref_nhop(w->fd, nh) != 0) 648 w->result = w->func(rt, w->fd->fd_algo_data); 649 else 650 w->result = FLM_REBUILD; 651 } 652 653 return (0); 654 } 655 656 /* 657 * Dump all routing table state to the algo instance. 658 */ 659 static enum flm_op_result 660 sync_algo(struct fib_data *fd) 661 { 662 struct walk_cbdata w = { 663 .fd = fd, 664 .func = fd->fd_flm->flm_dump_rib_item_cb, 665 .result = FLM_SUCCESS, 666 }; 667 668 rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w); 669 670 FD_PRINTF(LOG_INFO, fd, "initial dump completed, result: %s", 671 print_op_result(w.result)); 672 673 return (w.result); 674 } 675 676 /* 677 * Schedules epoch-backed @fd instance deletion. 678 * * Unlinks @fd from the list of active algo instances. 679 * * Removes rib subscription. 680 * * Stops callout. 681 * * Schedules actual deletion. 682 * 683 * Assume @fd is already unlinked from the datapath. 684 */ 685 static int 686 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) 687 { 688 bool is_dead; 689 690 NET_EPOCH_ASSERT(); 691 692 FIB_MOD_LOCK(); 693 is_dead = fd->fd_dead; 694 if (!is_dead) 695 fd->fd_dead = true; 696 if (fd->fd_linked) { 697 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 698 fd->fd_linked = false; 699 } 700 FIB_MOD_UNLOCK(); 701 if (is_dead) 702 return (0); 703 704 FD_PRINTF(LOG_INFO, fd, "DETACH"); 705 706 if (fd->fd_rs != NULL) 707 rib_unsibscribe(fd->fd_rs); 708 709 /* 710 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls 711 * will be executed, hence no _new_ callout schedules will happen. 712 * 713 * There can be 2 possible scenarious here: 714 * 1) we're running inside a callout when we're deleting ourselves 715 * due to migration to a newer fd 716 * 2) we're running from rt_table_destroy() and callout is scheduled 717 * for execution OR is executing 718 * 719 * For (2) we need to wait for the callout termination, as the routing table 720 * will be destroyed after this function returns. 721 * For (1) we cannot call drain, but can ensure that this is the last invocation. 722 */ 723 724 if (in_callout) 725 callout_stop(&fd->fd_callout); 726 else 727 callout_drain(&fd->fd_callout); 728 729 epoch_call(net_epoch_preempt, destroy_fd_instance_epoch, 730 &fd->fd_epoch_ctx); 731 732 return (0); 733 } 734 735 /* 736 * Wipe all fd instances from the list matching rib specified by @rh. 737 * If @keep_first is set, remove all but the first record. 738 */ 739 static void 740 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) 741 { 742 struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); 743 struct fib_data *fd, *fd_tmp; 744 struct epoch_tracker et; 745 746 FIB_MOD_LOCK(); 747 TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { 748 if (fd->fd_rh == rh) { 749 if (keep_first) { 750 keep_first = false; 751 continue; 752 } 753 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 754 fd->fd_linked = false; 755 TAILQ_INSERT_TAIL(&tmp_head, fd, entries); 756 } 757 } 758 FIB_MOD_UNLOCK(); 759 760 /* Pass 2: remove each entry */ 761 NET_EPOCH_ENTER(et); 762 TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { 763 schedule_destroy_fd_instance(fd, in_callout); 764 } 765 NET_EPOCH_EXIT(et); 766 } 767 768 void 769 fib_destroy_rib(struct rib_head *rh) 770 { 771 772 /* 773 * rnh has `is_dying` flag set, so setup of new fd's will fail at 774 * sync_algo() stage, preventing new entries to be added to the list 775 * of active algos. 
Remove all existing entries for the particular rib. 776 */ 777 fib_cleanup_algo(rh, false, false); 778 } 779 780 /* 781 * Finalises fd destruction by freeing all fd resources. 782 */ 783 static void 784 destroy_fd_instance(struct fib_data *fd) 785 { 786 787 FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); 788 789 /* Call destroy callback first */ 790 if (fd->fd_algo_data != NULL) 791 fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); 792 793 /* Nhop table */ 794 if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { 795 for (int i = 0; i < fd->number_nhops; i++) { 796 if (!is_idx_free(fd, i)) { 797 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", 798 i, fd->nh_idx[i]); 799 nhop_free_any(fd->nh_idx[i]); 800 } 801 } 802 free(fd->nh_idx, M_RTABLE); 803 } 804 if (fd->nh_ref_table != NULL) 805 free(fd->nh_ref_table, M_RTABLE); 806 807 fib_unref_algo(fd->fd_flm); 808 809 free(fd, M_RTABLE); 810 } 811 812 /* 813 * Epoch callback indicating fd is safe to destroy 814 */ 815 static void 816 destroy_fd_instance_epoch(epoch_context_t ctx) 817 { 818 struct fib_data *fd; 819 820 fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); 821 822 destroy_fd_instance(fd); 823 } 824 825 /* 826 * Tries to setup fd instance. 827 * - Allocates fd/nhop table 828 * - Runs algo:flm_init_cb algo init 829 * - Subscribes fd to the rib 830 * - Runs rtable dump 831 * - Adds instance to the list of active instances. 832 * 833 * Returns: operation result. Fills in @pfd with resulting fd on success. 834 * 835 */ 836 static enum flm_op_result 837 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 838 struct fib_data *old_fd, struct fib_data **pfd) 839 { 840 struct fib_data *fd; 841 size_t size; 842 enum flm_op_result result; 843 844 /* Allocate */ 845 fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); 846 if (fd == NULL) { 847 *pfd = NULL; 848 return (FLM_REBUILD); 849 } 850 *pfd = fd; 851 852 estimate_nhop_scale(old_fd, fd); 853 854 fd->fd_rh = rh; 855 fd->fd_family = rh->rib_family; 856 fd->fd_fibnum = rh->rib_fibnum; 857 callout_init(&fd->fd_callout, 1); 858 fd->fd_vnet = curvnet; 859 fd->fd_flm = flm; 860 861 FIB_MOD_LOCK(); 862 flm->flm_refcount++; 863 FIB_MOD_UNLOCK(); 864 865 /* Allocate nhidx -> nhop_ptr table */ 866 size = fd->number_nhops * sizeof(void *); 867 fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 868 if (fd->nh_idx == NULL) { 869 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); 870 return (FLM_REBUILD); 871 } 872 873 /* Allocate nhop index refcount table */ 874 size = sizeof(struct nhop_ref_table); 875 size += fd->number_nhops * sizeof(uint32_t); 876 fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 877 if (fd->nh_ref_table == NULL) { 878 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); 879 return (FLM_REBUILD); 880 } 881 FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); 882 883 /* Okay, we're ready for algo init */ 884 void *old_algo_data = (old_fd != NULL) ? 
old_fd->fd_algo_data : NULL; 885 result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); 886 if (result != FLM_SUCCESS) 887 return (result); 888 889 /* Try to subscribe */ 890 if (flm->flm_change_rib_item_cb != NULL) { 891 fd->fd_rs = rib_subscribe_internal(fd->fd_rh, 892 handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0); 893 if (fd->fd_rs == NULL) 894 return (FLM_REBUILD); 895 } 896 897 /* Dump */ 898 result = sync_algo(fd); 899 if (result != FLM_SUCCESS) 900 return (result); 901 FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); 902 903 FIB_MOD_LOCK(); 904 /* 905 * Insert fd in the beginning of a list, to maintain invariant 906 * that first matching entry for the AF/fib is always the active 907 * one. 908 */ 909 TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); 910 fd->fd_linked = true; 911 FIB_MOD_UNLOCK(); 912 913 return (FLM_SUCCESS); 914 } 915 916 /* 917 * Sets up algo @flm for table @rh and links it to the datapath. 918 * 919 */ 920 static enum flm_op_result 921 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 922 struct fib_data *orig_fd, struct fib_data **pfd, bool attach) 923 { 924 struct fib_data *prev_fd, *new_fd; 925 struct epoch_tracker et; 926 enum flm_op_result result; 927 928 prev_fd = orig_fd; 929 new_fd = NULL; 930 for (int i = 0; i < FIB_MAX_TRIES; i++) { 931 NET_EPOCH_ENTER(et); 932 result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); 933 934 if ((result == FLM_SUCCESS) && attach) 935 result = attach_datapath(new_fd); 936 937 if ((prev_fd != NULL) && (prev_fd != orig_fd)) { 938 schedule_destroy_fd_instance(prev_fd, false); 939 prev_fd = NULL; 940 } 941 NET_EPOCH_EXIT(et); 942 943 RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, 944 print_op_result(result)); 945 946 if (result == FLM_REBUILD) { 947 prev_fd = new_fd; 948 new_fd = NULL; 949 continue; 950 } 951 952 break; 953 } 954 955 if (result != FLM_SUCCESS) { 956 /* update failure count */ 957 FIB_MOD_LOCK(); 958 if (orig_fd != NULL) 959 orig_fd->fd_failed_rebuilds++; 960 FIB_MOD_UNLOCK(); 961 962 /* Ban algo on non-recoverable error */ 963 if (result == FLM_ERROR) 964 flm_error_add(flm, rh->rib_fibnum); 965 966 NET_EPOCH_ENTER(et); 967 if ((prev_fd != NULL) && (prev_fd != orig_fd)) 968 schedule_destroy_fd_instance(prev_fd, false); 969 if (new_fd != NULL) { 970 schedule_destroy_fd_instance(new_fd, false); 971 new_fd = NULL; 972 } 973 NET_EPOCH_EXIT(et); 974 } 975 976 *pfd = new_fd; 977 return (result); 978 } 979 980 /* 981 * Callout for all scheduled fd-related work. 982 * - Checks if the current algo is still the best algo 983 * - Creates a new instance of an algo for af/fib if desired. 984 */ 985 static void 986 rebuild_fd_callout(void *_data) 987 { 988 struct fib_data *fd, *fd_new, *fd_tmp; 989 struct fib_lookup_module *flm_new = NULL; 990 struct epoch_tracker et; 991 enum flm_op_result result; 992 bool need_rebuild = false; 993 994 fd = (struct fib_data *)_data; 995 996 FIB_MOD_LOCK(); 997 need_rebuild = fd->fd_need_rebuild; 998 fd->fd_need_rebuild = false; 999 fd->fd_force_eval = false; 1000 fd->fd_num_changes = 0; 1001 FIB_MOD_UNLOCK(); 1002 1003 CURVNET_SET(fd->fd_vnet); 1004 1005 /* First, check if we're still OK to use this algo */ 1006 if (!is_algo_fixed(fd->fd_rh)) 1007 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1008 if ((flm_new == NULL) && (!need_rebuild)) { 1009 /* Keep existing algo, no need to rebuild. 
*/ 1010 CURVNET_RESTORE(); 1011 return; 1012 } 1013 1014 if (flm_new == NULL) { 1015 flm_new = fd->fd_flm; 1016 fd_tmp = fd; 1017 } else { 1018 fd_tmp = NULL; 1019 FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); 1020 } 1021 result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); 1022 if (fd_tmp == NULL) { 1023 /* fd_new represents new algo */ 1024 fib_unref_algo(flm_new); 1025 } 1026 if (result != FLM_SUCCESS) { 1027 FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); 1028 CURVNET_RESTORE(); 1029 return; 1030 } 1031 FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); 1032 1033 /* Remove old instance removal */ 1034 if (fd != NULL) { 1035 NET_EPOCH_ENTER(et); 1036 schedule_destroy_fd_instance(fd, true); 1037 NET_EPOCH_EXIT(et); 1038 } 1039 1040 CURVNET_RESTORE(); 1041 } 1042 1043 /* 1044 * Finds algo by name/family. 1045 * Returns referenced algo or NULL. 1046 */ 1047 static struct fib_lookup_module * 1048 fib_find_algo(const char *algo_name, int family) 1049 { 1050 struct fib_lookup_module *flm; 1051 1052 FIB_MOD_LOCK(); 1053 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1054 if ((strcmp(flm->flm_name, algo_name) == 0) && 1055 (family == flm->flm_family)) { 1056 flm->flm_refcount++; 1057 FIB_MOD_UNLOCK(); 1058 return (flm); 1059 } 1060 } 1061 FIB_MOD_UNLOCK(); 1062 1063 return (NULL); 1064 } 1065 1066 static void 1067 fib_unref_algo(struct fib_lookup_module *flm) 1068 { 1069 1070 FIB_MOD_LOCK(); 1071 flm->flm_refcount--; 1072 FIB_MOD_UNLOCK(); 1073 } 1074 1075 static int 1076 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) 1077 { 1078 struct fib_lookup_module *flm = NULL; 1079 struct fib_data *fd = NULL; 1080 char old_algo_name[32], algo_name[32]; 1081 struct rib_head *rh = NULL; 1082 enum flm_op_result result; 1083 int error; 1084 1085 /* Fetch current algo/rib for af/family */ 1086 FIB_MOD_LOCK(); 1087 TAILQ_FOREACH(fd, &V_fib_data_list, entries) { 1088 if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) 1089 break; 1090 } 1091 if (fd == NULL) { 1092 FIB_MOD_UNLOCK(); 1093 return (ENOENT); 1094 } 1095 rh = fd->fd_rh; 1096 strlcpy(old_algo_name, fd->fd_flm->flm_name, 1097 sizeof(old_algo_name)); 1098 FIB_MOD_UNLOCK(); 1099 1100 strlcpy(algo_name, old_algo_name, sizeof(algo_name)); 1101 error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); 1102 if (error != 0 || req->newptr == NULL) 1103 return (error); 1104 1105 if (strcmp(algo_name, old_algo_name) == 0) 1106 return (0); 1107 1108 /* New algorithm name is different */ 1109 flm = fib_find_algo(algo_name, family); 1110 if (flm == NULL) { 1111 RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); 1112 return (ESRCH); 1113 } 1114 1115 fd = NULL; 1116 result = setup_fd_instance(flm, rh, NULL, &fd, true); 1117 fib_unref_algo(flm); 1118 if (result != FLM_SUCCESS) 1119 return (EINVAL); 1120 1121 /* Disable automated jumping between algos */ 1122 FIB_MOD_LOCK(); 1123 set_algo_fixed(rh); 1124 FIB_MOD_UNLOCK(); 1125 /* Remove old instance(s) */ 1126 fib_cleanup_algo(rh, true, false); 1127 1128 /* Drain cb so user can unload the module after userret if so desired */ 1129 epoch_drain_callbacks(net_epoch_preempt); 1130 1131 return (0); 1132 } 1133 1134 #ifdef INET 1135 static int 1136 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) 1137 { 1138 1139 return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req)); 1140 } 1141 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, 1142 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 
NULL, 0, 1143 set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); 1144 #endif 1145 1146 #ifdef INET6 1147 static int 1148 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) 1149 { 1150 1151 return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req)); 1152 } 1153 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, 1154 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1155 set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); 1156 #endif 1157 1158 static void 1159 destroy_fdh_epoch(epoch_context_t ctx) 1160 { 1161 struct fib_dp_header *fdh; 1162 1163 fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); 1164 free(fdh, M_RTABLE); 1165 } 1166 1167 static struct fib_dp_header * 1168 alloc_fib_dp_array(uint32_t num_tables, bool waitok) 1169 { 1170 size_t sz; 1171 struct fib_dp_header *fdh; 1172 1173 sz = sizeof(struct fib_dp_header); 1174 sz += sizeof(struct fib_dp) * num_tables; 1175 fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); 1176 if (fdh != NULL) 1177 fdh->fdh_num_tables = num_tables; 1178 return (fdh); 1179 } 1180 1181 static struct fib_dp_header * 1182 get_fib_dp_header(struct fib_dp *dp) 1183 { 1184 1185 return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); 1186 } 1187 1188 /* 1189 * Replace per-family index pool @pdp with a new one which 1190 * contains updated callback/algo data from @fd. 1191 * Returns 0 on success. 1192 */ 1193 static enum flm_op_result 1194 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd) 1195 { 1196 struct fib_dp_header *new_fdh, *old_fdh; 1197 1198 NET_EPOCH_ASSERT(); 1199 1200 FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", 1201 curvnet, fd->fd_dp.f, fd->fd_dp.arg); 1202 1203 FIB_MOD_LOCK(); 1204 old_fdh = get_fib_dp_header(*pdp); 1205 new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); 1206 FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); 1207 if (new_fdh == NULL) { 1208 FIB_MOD_UNLOCK(); 1209 FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); 1210 return (FLM_REBUILD); 1211 } 1212 1213 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1214 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1215 /* Update relevant data structure for @fd */ 1216 new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp; 1217 1218 /* Ensure memcpy() writes have completed */ 1219 atomic_thread_fence_rel(); 1220 /* Set new datapath pointer */ 1221 *pdp = &new_fdh->fdh_idx[0]; 1222 FIB_MOD_UNLOCK(); 1223 FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); 1224 1225 epoch_call(net_epoch_preempt, destroy_fdh_epoch, 1226 &old_fdh->fdh_epoch_ctx); 1227 1228 return (FLM_SUCCESS); 1229 } 1230 1231 static struct fib_dp ** 1232 get_family_dp_ptr(int family) 1233 { 1234 switch (family) { 1235 case AF_INET: 1236 return (&V_inet_dp); 1237 case AF_INET6: 1238 return (&V_inet6_dp); 1239 } 1240 return (NULL); 1241 } 1242 1243 /* 1244 * Make datapath use fib instance @fd 1245 */ 1246 static enum flm_op_result 1247 attach_datapath(struct fib_data *fd) 1248 { 1249 struct fib_dp **pdp; 1250 1251 pdp = get_family_dp_ptr(fd->fd_family); 1252 return (replace_rtables_family(pdp, fd)); 1253 } 1254 1255 /* 1256 * Grow datapath pointers array. 1257 * Called from sysctl handler on growing number of routing tables. 
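 *
 * For example (sketch; assumes the kernel permits growing the number of
 * FIBs at run time via the net.fibs sysctl):
 *
 *	# sysctl net.fibs=8
 *	net.fibs: 4 -> 8
 *
 * triggers a reallocation of the per-family fib_dp arrays to 8 entries.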
1258 */ 1259 static void 1260 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) 1261 { 1262 struct fib_dp_header *new_fdh, *old_fdh = NULL; 1263 1264 new_fdh = alloc_fib_dp_array(new_num_tables, true); 1265 1266 FIB_MOD_LOCK(); 1267 if (*pdp != NULL) { 1268 old_fdh = get_fib_dp_header(*pdp); 1269 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1270 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1271 } 1272 1273 /* Wait till all writes completed */ 1274 atomic_thread_fence_rel(); 1275 1276 *pdp = &new_fdh->fdh_idx[0]; 1277 FIB_MOD_UNLOCK(); 1278 1279 if (old_fdh != NULL) 1280 epoch_call(net_epoch_preempt, destroy_fdh_epoch, 1281 &old_fdh->fdh_epoch_ctx); 1282 } 1283 1284 /* 1285 * Grows per-AF arrays of datapath pointers for each supported family. 1286 * Called from fibs resize sysctl handler. 1287 */ 1288 void 1289 fib_grow_rtables(uint32_t new_num_tables) 1290 { 1291 1292 #ifdef INET 1293 grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); 1294 #endif 1295 #ifdef INET6 1296 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); 1297 #endif 1298 } 1299 1300 void 1301 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) 1302 { 1303 1304 bzero(rinfo, sizeof(struct rib_rtable_info)); 1305 rinfo->num_prefixes = rh->rnh_prefixes; 1306 rinfo->num_nhops = nhops_get_count(rh); 1307 #ifdef ROUTE_MPATH 1308 rinfo->num_nhgrp = nhgrp_get_count(rh); 1309 #endif 1310 } 1311 1312 /* 1313 * Accessor to get rib instance @fd is attached to. 1314 */ 1315 struct rib_head * 1316 fib_get_rh(struct fib_data *fd) 1317 { 1318 1319 return (fd->fd_rh); 1320 } 1321 1322 /* 1323 * Accessor to export idx->nhop array 1324 */ 1325 struct nhop_object ** 1326 fib_get_nhop_array(struct fib_data *fd) 1327 { 1328 1329 return (fd->nh_idx); 1330 } 1331 1332 static uint32_t 1333 get_nhop_idx(struct nhop_object *nh) 1334 { 1335 #ifdef ROUTE_MPATH 1336 if (NH_IS_NHGRP(nh)) 1337 return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); 1338 else 1339 return (nhop_get_idx(nh) * 2); 1340 #else 1341 return (nhop_get_idx(nh)); 1342 #endif 1343 } 1344 1345 uint32_t 1346 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) 1347 { 1348 1349 return (get_nhop_idx(nh)); 1350 } 1351 1352 static bool 1353 is_idx_free(struct fib_data *fd, uint32_t index) 1354 { 1355 1356 return (fd->nh_ref_table->refcnt[index] == 0); 1357 } 1358 1359 static uint32_t 1360 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) 1361 { 1362 uint32_t idx = get_nhop_idx(nh); 1363 1364 if (idx >= fd->number_nhops) { 1365 fd->hit_nhops = 1; 1366 return (0); 1367 } 1368 1369 if (is_idx_free(fd, idx)) { 1370 nhop_ref_any(nh); 1371 fd->nh_idx[idx] = nh; 1372 fd->nh_ref_table->count++; 1373 FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); 1374 } 1375 fd->nh_ref_table->refcnt[idx]++; 1376 1377 return (idx); 1378 } 1379 1380 struct nhop_release_data { 1381 struct nhop_object *nh; 1382 struct epoch_context ctx; 1383 }; 1384 1385 static void 1386 release_nhop_epoch(epoch_context_t ctx) 1387 { 1388 struct nhop_release_data *nrd; 1389 1390 nrd = __containerof(ctx, struct nhop_release_data, ctx); 1391 nhop_free_any(nrd->nh); 1392 free(nrd, M_TEMP); 1393 } 1394 1395 /* 1396 * Delays nexthop refcount release. 1397 * Datapath may have the datastructures not updated yet, so the old 1398 * nexthop may still be returned till the end of current epoch. 
Delay 1399 * refcount removal, as we may be removing the last instance, which will 1400 * trigger nexthop deletion, rendering returned nexthop invalid. 1401 */ 1402 static void 1403 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) 1404 { 1405 struct nhop_release_data *nrd; 1406 1407 nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); 1408 if (nrd != NULL) { 1409 nrd->nh = nh; 1410 epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); 1411 } else { 1412 /* 1413 * Unable to allocate memory. Leak nexthop to maintain guarantee 1414 * that each nhop can be referenced. 1415 */ 1416 FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); 1417 } 1418 } 1419 1420 static void 1421 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) 1422 { 1423 uint32_t idx = get_nhop_idx(nh); 1424 1425 KASSERT((idx < fd->number_nhops), ("invalid nhop index")); 1426 KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); 1427 1428 fd->nh_ref_table->refcnt[idx]--; 1429 if (fd->nh_ref_table->refcnt[idx] == 0) { 1430 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); 1431 fib_schedule_release_nhop(fd, fd->nh_idx[idx]); 1432 } 1433 } 1434 1435 static void 1436 set_algo_fixed(struct rib_head *rh) 1437 { 1438 switch (rh->rib_family) { 1439 #ifdef INET 1440 case AF_INET: 1441 V_algo_fixed_inet = true; 1442 break; 1443 #endif 1444 #ifdef INET6 1445 case AF_INET6: 1446 V_algo_fixed_inet6 = true; 1447 break; 1448 #endif 1449 } 1450 } 1451 1452 static bool 1453 is_algo_fixed(struct rib_head *rh) 1454 { 1455 1456 switch (rh->rib_family) { 1457 #ifdef INET 1458 case AF_INET: 1459 return (V_algo_fixed_inet); 1460 #endif 1461 #ifdef INET6 1462 case AF_INET6: 1463 return (V_algo_fixed_inet6); 1464 #endif 1465 } 1466 return (false); 1467 } 1468 1469 /* 1470 * Runs the check on what would be the best algo for rib @rh, assuming 1471 * that the current algo is the one specified by @orig_flm. Note that 1472 * it can be NULL for initial selection. 1473 * 1474 * Returns referenced new algo or NULL if the current one is the best. 1475 */ 1476 static struct fib_lookup_module * 1477 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) 1478 { 1479 uint8_t preference, curr_preference = 0, best_preference = 0; 1480 struct fib_lookup_module *flm, *best_flm = NULL; 1481 struct rib_rtable_info rinfo; 1482 int candidate_algos = 0; 1483 1484 fib_get_rtable_info(rh, &rinfo); 1485 1486 FIB_MOD_LOCK(); 1487 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1488 if (flm->flm_family != rh->rib_family) 1489 continue; 1490 candidate_algos++; 1491 preference = flm->flm_get_pref(&rinfo); 1492 if (preference > best_preference) { 1493 if (!flm_error_check(flm, rh->rib_fibnum)) { 1494 best_preference = preference; 1495 best_flm = flm; 1496 } 1497 } 1498 if (flm == orig_flm) 1499 curr_preference = preference; 1500 } 1501 if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference)) 1502 best_flm->flm_refcount++; 1503 else 1504 best_flm = NULL; 1505 FIB_MOD_UNLOCK(); 1506 1507 RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", 1508 candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, 1509 best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"), 1510 best_preference); 1511 1512 return (best_flm); 1513 } 1514 1515 /* 1516 * Called when new route table is created. 1517 * Selects, allocates and attaches fib algo for the table. 
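 *
 * Selection here, as in fib_check_best_algo(), is driven by flm_get_pref(),
 * which returns a preference in the 0..255 range. As a worked example of
 * the switch-over hysteresis: with BEST_DIFF_PERCENT = 5 * 256 / 100 = 12,
 * a candidate with preference 200 does not displace a current algo with
 * preference 190, since 190 + 12 < 200 is false; a candidate would need a
 * preference of at least 203 to win.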
1518 */ 1519 int 1520 fib_select_algo_initial(struct rib_head *rh) 1521 { 1522 struct fib_lookup_module *flm; 1523 struct fib_data *fd = NULL; 1524 enum flm_op_result result; 1525 int error = 0; 1526 1527 flm = fib_check_best_algo(rh, NULL); 1528 if (flm == NULL) { 1529 RH_PRINTF(LOG_CRIT, rh, "no algo selected"); 1530 return (ENOENT); 1531 } 1532 RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); 1533 1534 result = setup_fd_instance(flm, rh, NULL, &fd, false); 1535 RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); 1536 if (result == FLM_SUCCESS) { 1537 1538 /* 1539 * Attach datapath directly to avoid multiple reallocations 1540 * during fib growth 1541 */ 1542 struct fib_dp_header *fdp; 1543 struct fib_dp **pdp; 1544 1545 pdp = get_family_dp_ptr(rh->rib_family); 1546 if (pdp != NULL) { 1547 fdp = get_fib_dp_header(*pdp); 1548 fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp; 1549 FD_PRINTF(LOG_INFO, fd, "datapath attached"); 1550 } 1551 } else { 1552 error = EINVAL; 1553 RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); 1554 } 1555 1556 fib_unref_algo(flm); 1557 1558 return (error); 1559 } 1560 1561 /* 1562 * Registers fib lookup module within the subsystem. 1563 */ 1564 int 1565 fib_module_register(struct fib_lookup_module *flm) 1566 { 1567 1568 FIB_MOD_LOCK(); 1569 ALGO_PRINTF("attaching %s to %s", flm->flm_name, 1570 print_family(flm->flm_family)); 1571 TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); 1572 FIB_MOD_UNLOCK(); 1573 1574 return (0); 1575 } 1576 1577 /* 1578 * Tries to unregister fib lookup module. 1579 * 1580 * Returns 0 on success, EBUSY if module is still used 1581 * by some of the tables. 1582 */ 1583 int 1584 fib_module_unregister(struct fib_lookup_module *flm) 1585 { 1586 1587 FIB_MOD_LOCK(); 1588 if (flm->flm_refcount > 0) { 1589 FIB_MOD_UNLOCK(); 1590 return (EBUSY); 1591 } 1592 fib_error_clear_flm(flm); 1593 ALGO_PRINTF("detaching %s from %s", flm->flm_name, 1594 print_family(flm->flm_family)); 1595 TAILQ_REMOVE(&all_algo_list, flm, entries); 1596 FIB_MOD_UNLOCK(); 1597 1598 return (0); 1599 } 1600 1601 void 1602 vnet_fib_init(void) 1603 { 1604 1605 TAILQ_INIT(&V_fib_data_list); 1606 } 1607 1608 void 1609 vnet_fib_destroy(void) 1610 { 1611 1612 FIB_MOD_LOCK(); 1613 fib_error_clear(); 1614 FIB_MOD_UNLOCK(); 1615 } 1616
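
/*
 * Example of the registration side (illustrative sketch only, not part of
 * this file; "dummy4" and its callbacks are hypothetical, and the exact
 * struct fib_lookup_module/callback prototypes are defined in
 * net/route/fib_algo.h). A lookup module typically registers itself from
 * its module event handler:
 *
 *	static struct fib_lookup_module dummy_module = {
 *		.flm_name = "dummy4",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = dummy_init,
 *		.flm_destroy_cb = dummy_destroy,
 *		.flm_dump_rib_item_cb = dummy_dump_rib_item,
 *		.flm_dump_end_cb = dummy_dump_end,
 *		.flm_change_rib_item_cb = dummy_change_rib_item,
 *		.flm_get_pref = dummy_get_pref,
 *	};
 *
 *	case MOD_LOAD:
 *		error = fib_module_register(&dummy_module);
 *		break;
 *	case MOD_UNLOAD:
 *		error = fib_module_unregister(&dummy_module);
 *		break;
 */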