/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of particular lookup algorithm
 * fd - fib data - instance of an flm bound to specific routing table
 *
 * This file provides main framework functionality.
 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *    and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. This function will be called with a
 * family-specific lookup key, scope and provided argument. This array gets
 * re-created every time a new algo instance is created. Please take a look
 * at the replace_rtables_family() function for more details.
 */
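
/*
 * Illustrative sketch only (not code from this file): a family-specific
 * datapath consumer is expected to pick its fib_dp entry by fib number and
 * invoke the installed callback, roughly:
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	nh = dp->f(dp->arg, key, scope);
 *
 * The exact callback prototype and argument order come from fib_algo.h; the
 * snippet above only illustrates the array-of-{func, arg} layout described
 * in the comment above.
 */
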
SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

#ifdef INET6
bool algo_fixed_inet6 = false;
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
bool algo_fixed_inet = false;
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t	fd_force_eval:1;	/* true if forced re-evaluation scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_algo_mask;	/* bitmask for algo data */
	struct callout	fd_callout;	/* rebuild callout */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp	fd_dp;		/* fib datapath data */
	struct vnet	*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;	/* pointer to the lookup module */
	uint32_t	fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};
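
/*
 * Sizing example (illustrative): nhop_ref_table ends in a flexible array
 * member, so an instance covering N nexthop indexes is allocated as a single
 * chunk, e.g.
 *
 *	size = sizeof(struct nhop_ref_table) + N * sizeof(uint32_t);
 *	ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
 *
 * try_setup_fd_instance() below does exactly this with N = fd->number_nhops.
 */
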
static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50

/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
	_RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) {		\
	_ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
	    __func__, _fmt, ## __VA_ARGS__);				\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG()
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO()
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list;

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Datastructure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;	/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add new non-transient algorithm error to the list of
 * errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clear all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in current VNET.
 */
static void
fib_error_clear()
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}

static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return "success";
	case FLM_REBUILD:
		return "rebuild";
	case FLM_ERROR:
		return "error";
	}

	return "unknown";
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}

/*
 * Debug function used by lookup algorithms.
 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[128];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    func, "%s", buf);
}

/*
 * Outputs list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif
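
/*
 * Usage example (illustrative): the handlers above export the registered
 * algorithm names to userland, so e.g. `sysctl net.route.algo.inet.algo_list`
 * prints a comma-separated list such as "radix4, bsearch4"; the actual names
 * depend on which lookup modules are compiled in or loaded.
 */
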
/*
 * Calculate delay between repeated failures.
 * Returns current delay in milliseconds.
 */
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}

static void
schedule_callout(struct fib_data *fd, int delay_ms)
{

	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    rebuild_fd_callout, fd, 0);
}

static void
schedule_fd_rebuild(struct fib_data *fd)
{

	FIB_MOD_LOCK();
	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild");
		schedule_callout(fd, callout_calc_delay_ms(fd));
	}
	FIB_MOD_UNLOCK();
}

static void
schedule_algo_eval(struct fib_data *fd)
{

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		FIB_MOD_LOCK();
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
		FIB_MOD_UNLOCK();
	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
		/* Reset callout to exec immediately */
		FIB_MOD_LOCK();
		if (!fd->fd_need_rebuild) {
			fd->fd_force_eval = true;
			schedule_callout(fd, 1);
		}
		FIB_MOD_UNLOCK();
	}
}

/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;
	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild)
		return;

	/* Consider scheduling algorithm re-evaluation */
	schedule_algo_eval(fd);

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within
	 * the current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd);
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		schedule_fd_rebuild(fd);
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd);
	}
}

static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};

/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}

/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}

/*
 * Dump all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd, "initial dump completed, result: %s",
	    print_op_result(w.result));

	return (w.result);
}

/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assumes @fd is already unlinked from the datapath.
 */
static int
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return (0);

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 *
	 * There can be 2 possible scenarios here:
	 * 1) we're running inside a callout when we're deleting ourselves
	 *  due to migration to a newer fd
	 * 2) we're running from rt_table_destroy() and callout is scheduled
	 *  for execution OR is executing
	 *
	 * For (2) we need to wait for the callout termination, as the routing table
	 * will be destroyed after this function returns.
	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
	 */

	if (in_callout)
		callout_stop(&fd->fd_callout);
	else
		callout_drain(&fd->fd_callout);

	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
	    &fd->fd_epoch_ctx);

	return (0);
}

/*
 * Wipe all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		schedule_destroy_fd_instance(fd, in_callout);
	}
	NET_EPOCH_EXIT(et);
}

void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 * sync_algo() stage, preventing new entries from being added to the
	 * list of active algos. Remove all existing entries for the particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}

/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}

/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}

/*
 * Tries to set up fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 *
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	/* Allocate */
	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init(&fd->fd_callout, 1);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ?
	    old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS)
		return (result);

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
		if (fd->fd_rs == NULL)
			return (FLM_REBUILD);
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS)
		return (result);
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 * that first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}

/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 *
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	struct epoch_tracker et;
	enum flm_op_result result;

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		NET_EPOCH_ENTER(et);
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach)
			result = attach_datapath(new_fd);

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}
		NET_EPOCH_EXIT(et);

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		NET_EPOCH_ENTER(et);
		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
		NET_EPOCH_EXIT(et);
	}

	*pfd = new_fd;
	return (result);
}

/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
rebuild_fd_callout(void *_data)
{
	struct fib_data *fd, *fd_new, *fd_tmp;
	struct fib_lookup_module *flm_new;
	struct epoch_tracker et;
	enum flm_op_result result;
	bool need_rebuild = false;

	fd = (struct fib_data *)_data;

	FIB_MOD_LOCK();
	need_rebuild = fd->fd_need_rebuild;
	fd->fd_need_rebuild = false;
	fd->fd_force_eval = false;
	fd->fd_num_changes = 0;
	FIB_MOD_UNLOCK();

	CURVNET_SET(fd->fd_vnet);

	/* First, check if we're still OK to use this algo */
	flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if ((flm_new == NULL) && (!need_rebuild)) {
		/* Keep existing algo, no need to rebuild. */
		CURVNET_RESTORE();
		return;
	}

	if (flm_new == NULL) {
		flm_new = fd->fd_flm;
		fd_tmp = fd;
	} else {
		fd_tmp = NULL;
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
	}
	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (fd_tmp == NULL) {
		/* fd_new represents new algo */
		fib_unref_algo(flm_new);
	}
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		CURVNET_RESTORE();
		return;
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Schedule old instance removal */
	if (fd != NULL) {
		NET_EPOCH_ENTER(et);
		schedule_destroy_fd_instance(fd, true);
		NET_EPOCH_EXIT(et);
	}

	CURVNET_RESTORE();
}

/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}

static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}
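
/*
 * Usage example (illustrative): the handlers below wire set_fib_algo() to the
 * default fib, so the IPv4 algorithm can be pinned from userland with e.g.
 * `sysctl net.route.algo.inet.algo=radix4` ("radix4" is just an example
 * module name). A successful write also calls set_algo_fixed(), which
 * disables automatic algorithm switching for that family.
 */
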
#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(RT_DEFAULT_FIB, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif

static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}

/*
 * Replace per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns FLM_SUCCESS on success.
 */
static enum flm_op_result
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, fd->fd_dp.f, fd->fd_dp.arg);

	FIB_MOD_LOCK();
	old_fdh = get_fib_dp_header(*pdp);
	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FIB_MOD_UNLOCK();
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (FLM_REBUILD);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();
	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	epoch_call(net_epoch_preempt, destroy_fdh_epoch,
	    &old_fdh->fdh_epoch_ctx);

	return (FLM_SUCCESS);
}

static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}

/*
 * Make datapath use fib instance @fd
 */
static enum flm_op_result
attach_datapath(struct fib_data *fd)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd));
}
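
/*
 * Note on the update protocol above (explanatory, no additional code):
 * replace_rtables_family() publishes the fully-populated new fdh_idx[] array
 * with a release fence and frees the old one only via epoch_call(). A
 * datapath reader dereferencing *pdp inside the net epoch therefore sees
 * either the old or the new array, and whichever it sees stays valid until
 * the reader leaves the epoch.
 */
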
/*
 * Grow datapath pointers array.
 * Called from sysctl handler on growing number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Wait till all writes completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		epoch_call(net_epoch_preempt, destroy_fdh_epoch,
		    &old_fdh->fdh_epoch_ctx);
}

/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}

void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}

/*
 * Accessor to get rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}

static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}
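
/*
 * Worked example for get_nhop_idx() with ROUTE_MPATH (illustrative): plain
 * nexthops map to even slots and nexthop groups to odd slots, e.g. nhop
 * index 3 yields 6 while nhgrp index 3 yields 5. This keeps both object
 * types in the single nh_idx[]/refcnt[] namespace without collisions.
 */
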
static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}

struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}

/*
 * Delays nexthop refcount release.
 * The datapath may not have updated its data structures yet, so the old
 * nexthop may still be returned until the end of the current epoch. Delay
 * refcount removal, as we may be dropping the last reference, which would
 * trigger nexthop deletion, rendering the returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}

static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}

static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (algo_fixed_inet6);
#endif
	}
	return (false);
}

/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	if (is_algo_fixed(rh)) {
		FIB_MOD_UNLOCK();
		return (NULL);
	}

	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;
		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_preference);

	return (best_flm);
}
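
/*
 * Worked example (illustrative): BEST_DIFF_PERCENT is (5 * 256 / 100) == 12,
 * so if the current algo reports a preference of 100, a candidate is adopted
 * only when its preference exceeds 112; ties and marginal wins keep the
 * current algorithm and avoid needless rebuilds.
 */
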
/*
 * Called when new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {

		/*
		 * Attach datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}

/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
}

void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}

static void
fib_init(void)
{

	mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF);
	TAILQ_INIT(&all_algo_list);
}
SYSINIT(fib_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_init, NULL);
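
/*
 * Registration sketch (illustrative, not part of this file): a lookup
 * algorithm module describes itself with a struct fib_lookup_module and
 * registers it via fib_module_register(), roughly:
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example4",			(hypothetical name)
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,		(hypothetical callbacks)
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *
 *	fib_module_register(&flm_example);
 *
 * All callback names above are hypothetical; only the field names used
 * elsewhere in this file come from the framework.
 */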