/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <net/vnet.h>

#include <net/if.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>

/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup module - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *    and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument.
 * This function will be called with a family-specific lookup key, scope and
 * the provided argument. The array is re-created every time a new algo
 * instance is created. See the replace_rtables_family() function for more
 * details.
 *
 */

SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

VNET_DEFINE(int, fib_sync_limit) = 100;
#define	V_fib_sync_limit	VNET(fib_sync_limit)
SYSCTL_INT(_net_route_algo, OID_AUTO, fib_sync_limit, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_sync_limit), 0, "Guarantee synchronous fib till route limit");

#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif

/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t	count;
	int32_t		refcnt[0];
};

/*
 * Data structure for the fib lookup instance tied to a particular rib.
 */
struct fib_data {
	uint32_t	number_nhops;	/* current # of nhops */
	uint8_t		hit_nhops;	/* true if out of nhop limit */
	uint8_t		init_done;	/* true if init is completed */
	uint32_t	fd_dead:1;	/* Scheduled for deletion */
	uint32_t	fd_linked:1;	/* true if linked */
	uint32_t	fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t	fd_force_eval:1;	/* true if immediate algo re-evaluation scheduled */
	uint8_t		fd_family;	/* family */
	uint32_t	fd_fibnum;	/* fibnum */
	uint32_t	fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t	fd_gen;		/* instance gen# */
	struct callout	fd_callout;	/* rebuild callout */
	void		*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;	/* pointer to the lookup module */
	uint32_t	fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};

static bool rebuild_fd(struct fib_data *fd);
static void rebuild_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static enum flm_op_result attach_datapath(struct fib_data *fd);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);

struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50

/* Debug */
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);	\
}
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF


/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int	fe_family;
	uint32_t	fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)	entries;	/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list	VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};

/*
 * Tries to add a new non-transient algorithm error to the list of errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if the error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}

/*
 * True if a non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clears all errors of the algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in the current VNET.
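 * Must be called under FIB_MOD_LOCK().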
332 */ 333 static void 334 fib_error_clear() 335 { 336 struct fib_error *fe, *fe_tmp; 337 338 FIB_MOD_LOCK_ASSERT(); 339 340 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 341 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 342 free(fe, M_TEMP); 343 } 344 } 345 346 static const char * 347 print_op_result(enum flm_op_result result) 348 { 349 switch (result) { 350 case FLM_SUCCESS: 351 return "success"; 352 case FLM_REBUILD: 353 return "rebuild"; 354 case FLM_ERROR: 355 return "error"; 356 } 357 358 return "unknown"; 359 } 360 361 static const char * 362 print_family(int family) 363 { 364 365 if (family == AF_INET) 366 return ("inet"); 367 else if (family == AF_INET6) 368 return ("inet6"); 369 else 370 return ("unknown"); 371 } 372 373 /* 374 * Debug function used by lookup algorithms. 375 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " 376 */ 377 void 378 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) 379 { 380 char buf[128]; 381 va_list ap; 382 383 if (level > flm_debug_level) 384 return; 385 386 va_start(ap, fmt); 387 vsnprintf(buf, sizeof(buf), fmt, ap); 388 va_end(ap); 389 390 _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, 391 fd->fd_gen, func, "%s", buf); 392 } 393 394 /* 395 * Outputs list of algorithms supported by the provided address family. 396 */ 397 static int 398 print_algos_sysctl(struct sysctl_req *req, int family) 399 { 400 struct fib_lookup_module *flm; 401 struct sbuf sbuf; 402 int error, count = 0; 403 404 error = sysctl_wire_old_buffer(req, 0); 405 if (error == 0) { 406 sbuf_new_for_sysctl(&sbuf, NULL, 512, req); 407 TAILQ_FOREACH(flm, &all_algo_list, entries) { 408 if (flm->flm_family == family) { 409 if (count++ > 0) 410 sbuf_cat(&sbuf, ", "); 411 sbuf_cat(&sbuf, flm->flm_name); 412 } 413 } 414 error = sbuf_finish(&sbuf); 415 sbuf_delete(&sbuf); 416 } 417 return (error); 418 } 419 420 #ifdef INET6 421 static int 422 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) 423 { 424 425 return (print_algos_sysctl(req, AF_INET6)); 426 } 427 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, 428 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 429 print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); 430 #endif 431 432 #ifdef INET 433 static int 434 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) 435 { 436 437 return (print_algos_sysctl(req, AF_INET)); 438 } 439 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, 440 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 441 print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); 442 #endif 443 444 /* 445 * Calculate delay between repeated failures. 446 * Returns current delay in milliseconds. 447 */ 448 static uint32_t 449 callout_calc_delay_ms(struct fib_data *fd) 450 { 451 uint32_t shift; 452 453 if (fd->fd_failed_rebuilds > 10) 454 shift = 10; 455 else 456 shift = fd->fd_failed_rebuilds; 457 458 return ((1 << shift) * FIB_CALLOUT_DELAY_MS); 459 } 460 461 static void 462 schedule_callout(struct fib_data *fd, int delay_ms) 463 { 464 465 callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, 466 rebuild_fd_callout, fd, 0); 467 } 468 469 static void 470 schedule_fd_rebuild(struct fib_data *fd, const char *reason) 471 { 472 473 RIB_WLOCK_ASSERT(fd->fd_rh); 474 475 if (!fd->fd_need_rebuild) { 476 fd->fd_need_rebuild = true; 477 478 /* 479 * Potentially re-schedules pending callout 480 * initiated by schedule_algo_eval. 
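 * The rebuild delay doubles with each failed rebuild (see
 * callout_calc_delay_ms()), so repeated failures back off instead of
 * retrying immediately.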
481 */ 482 FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)", 483 reason, fd->fd_failed_rebuilds); 484 schedule_callout(fd, callout_calc_delay_ms(fd)); 485 } 486 } 487 488 static void 489 schedule_algo_eval(struct fib_data *fd) 490 { 491 492 RIB_WLOCK_ASSERT(fd->fd_rh); 493 494 if (fd->fd_num_changes++ == 0) { 495 /* Start callout to consider switch */ 496 if (!callout_pending(&fd->fd_callout)) 497 schedule_callout(fd, ALGO_EVAL_DELAY_MS); 498 } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { 499 /* Reset callout to exec immediately */ 500 if (!fd->fd_need_rebuild) { 501 fd->fd_force_eval = true; 502 schedule_callout(fd, 1); 503 } 504 } 505 } 506 507 static bool 508 need_immediate_rebuild(struct fib_data *fd, struct rib_cmd_info *rc) 509 { 510 struct nhop_object *nh; 511 512 if ((V_fib_sync_limit == 0) || (fd->fd_rh->rnh_prefixes <= V_fib_sync_limit)) 513 return (true); 514 515 /* Sync addition/removal of interface routes */ 516 switch (rc->rc_cmd) { 517 case RTM_ADD: 518 nh = rc->rc_nh_new; 519 if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY))) 520 return (true); 521 break; 522 case RTM_DELETE: 523 nh = rc->rc_nh_old; 524 if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY))) 525 return (true); 526 break; 527 } 528 529 return (false); 530 } 531 532 /* 533 * Rib subscription handler. Checks if the algorithm is ready to 534 * receive updates, handles nexthop refcounting and passes change 535 * data to the algorithm callback. 536 */ 537 static void 538 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, 539 void *_data) 540 { 541 struct fib_data *fd = (struct fib_data *)_data; 542 enum flm_op_result result; 543 544 RIB_WLOCK_ASSERT(rnh); 545 546 /* 547 * There is a small gap between subscribing for route changes 548 * and initiating rtable dump. Avoid receiving route changes 549 * prior to finishing rtable dump by checking `init_done`. 550 */ 551 if (!fd->init_done) 552 return; 553 /* 554 * If algo requested rebuild, stop sending updates by default. 555 * This simplifies nexthop refcount handling logic. 556 */ 557 if (fd->fd_need_rebuild) 558 return; 559 560 /* Consider scheduling algorithm re-evaluation */ 561 schedule_algo_eval(fd); 562 563 /* 564 * Maintain guarantee that every nexthop returned by the dataplane 565 * lookup has > 0 refcount, so can be safely referenced within current 566 * epoch. 567 */ 568 if (rc->rc_nh_new != NULL) { 569 if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { 570 /* ran out of indexes */ 571 schedule_fd_rebuild(fd, "ran out of nhop indexes"); 572 return; 573 } 574 } 575 576 result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); 577 578 switch (result) { 579 case FLM_SUCCESS: 580 /* Unref old nexthop on success */ 581 if (rc->rc_nh_old != NULL) 582 fib_unref_nhop(fd, rc->rc_nh_old); 583 break; 584 case FLM_REBUILD: 585 586 /* 587 * Algo is not able to apply the update. 588 * Schedule algo rebuild. 589 */ 590 if (!need_immediate_rebuild(fd, rc)) { 591 schedule_fd_rebuild(fd, "algo requested rebuild"); 592 break; 593 } 594 595 fd->fd_need_rebuild = true; 596 FD_PRINTF(LOG_INFO, fd, "running sync rebuild"); 597 if (!rebuild_fd(fd)) 598 schedule_fd_rebuild(fd, "sync rebuild failed"); 599 break; 600 case FLM_ERROR: 601 602 /* 603 * Algo reported a non-recoverable error. 604 * Record the error and schedule rebuild, which will 605 * trigger best algo selection. 
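 * The banned algo is excluded from that selection by fib_check_best_algo(),
 * which skips modules recorded in the error list.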
606 */ 607 FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); 608 if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) 609 FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); 610 schedule_fd_rebuild(fd, "algo reported non-recoverable error"); 611 } 612 } 613 614 static void 615 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) 616 { 617 618 if (old_fd == NULL) { 619 // TODO: read from rtable 620 fd->number_nhops = 16; 621 return; 622 } 623 624 if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) 625 fd->number_nhops = 2 * old_fd->number_nhops; 626 else 627 fd->number_nhops = old_fd->number_nhops; 628 } 629 630 struct walk_cbdata { 631 struct fib_data *fd; 632 flm_dump_t *func; 633 enum flm_op_result result; 634 }; 635 636 /* 637 * Handler called after all rtenties have been dumped. 638 * Performs post-dump framework checks and calls 639 * algo:flm_dump_end_cb(). 640 * 641 * Updates walk_cbdata result. 642 */ 643 static void 644 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) 645 { 646 struct walk_cbdata *w = (struct walk_cbdata *)_data; 647 struct fib_data *fd = w->fd; 648 649 RIB_WLOCK_ASSERT(w->fd->fd_rh); 650 651 if (rnh->rib_dying) { 652 w->result = FLM_ERROR; 653 return; 654 } 655 656 if (fd->hit_nhops) { 657 FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", 658 fd->nh_ref_table->count); 659 if (w->result == FLM_SUCCESS) 660 w->result = FLM_REBUILD; 661 return; 662 } 663 664 if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) 665 return; 666 667 /* Post-dump hook, dump successful */ 668 w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); 669 670 if (w->result == FLM_SUCCESS) { 671 /* Mark init as done to allow routing updates */ 672 fd->init_done = 1; 673 } 674 } 675 676 /* 677 * Callback for each entry in rib. 678 * Calls algo:flm_dump_rib_item_cb func as a part of initial 679 * route table synchronisation. 680 */ 681 static int 682 sync_algo_cb(struct rtentry *rt, void *_data) 683 { 684 struct walk_cbdata *w = (struct walk_cbdata *)_data; 685 686 RIB_WLOCK_ASSERT(w->fd->fd_rh); 687 688 if (w->result == FLM_SUCCESS && w->func) { 689 690 /* 691 * Reference nexthops to maintain guarantee that 692 * each nexthop returned by datapath has > 0 references 693 * and can be safely referenced within current epoch. 694 */ 695 struct nhop_object *nh = rt_get_raw_nhop(rt); 696 if (fib_ref_nhop(w->fd, nh) != 0) 697 w->result = w->func(rt, w->fd->fd_algo_data); 698 else 699 w->result = FLM_REBUILD; 700 } 701 702 return (0); 703 } 704 705 /* 706 * Dump all routing table state to the algo instance. 707 */ 708 static enum flm_op_result 709 sync_algo(struct fib_data *fd) 710 { 711 struct walk_cbdata w = { 712 .fd = fd, 713 .func = fd->fd_flm->flm_dump_rib_item_cb, 714 .result = FLM_SUCCESS, 715 }; 716 717 rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w); 718 719 FD_PRINTF(LOG_INFO, fd, 720 "initial dump completed (rtable version: %d), result: %s", 721 fd->fd_rh->rnh_gen, print_op_result(w.result)); 722 723 return (w.result); 724 } 725 726 /* 727 * Schedules epoch-backed @fd instance deletion. 728 * * Unlinks @fd from the list of active algo instances. 729 * * Removes rib subscription. 730 * * Stops callout. 731 * * Schedules actual deletion. 732 * 733 * Assume @fd is already unlinked from the datapath. 
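 * Safe to call multiple times: once @fd is marked dead, subsequent calls
 * return early without scheduling the deletion again.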
734 */ 735 static int 736 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) 737 { 738 bool is_dead; 739 740 NET_EPOCH_ASSERT(); 741 RIB_WLOCK_ASSERT(fd->fd_rh); 742 743 FIB_MOD_LOCK(); 744 is_dead = fd->fd_dead; 745 if (!is_dead) 746 fd->fd_dead = true; 747 if (fd->fd_linked) { 748 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 749 fd->fd_linked = false; 750 } 751 FIB_MOD_UNLOCK(); 752 if (is_dead) 753 return (0); 754 755 FD_PRINTF(LOG_INFO, fd, "DETACH"); 756 757 if (fd->fd_rs != NULL) 758 rib_unsibscribe_locked(fd->fd_rs); 759 760 /* 761 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls 762 * will be executed, hence no _new_ callout schedules will happen. 763 */ 764 callout_stop(&fd->fd_callout); 765 766 epoch_call(net_epoch_preempt, destroy_fd_instance_epoch, 767 &fd->fd_epoch_ctx); 768 769 return (0); 770 } 771 772 /* 773 * Wipe all fd instances from the list matching rib specified by @rh. 774 * If @keep_first is set, remove all but the first record. 775 */ 776 static void 777 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) 778 { 779 struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); 780 struct fib_data *fd, *fd_tmp; 781 struct epoch_tracker et; 782 783 FIB_MOD_LOCK(); 784 TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { 785 if (fd->fd_rh == rh) { 786 if (keep_first) { 787 keep_first = false; 788 continue; 789 } 790 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 791 fd->fd_linked = false; 792 TAILQ_INSERT_TAIL(&tmp_head, fd, entries); 793 } 794 } 795 FIB_MOD_UNLOCK(); 796 797 /* Pass 2: remove each entry */ 798 NET_EPOCH_ENTER(et); 799 TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { 800 if (!in_callout) 801 RIB_WLOCK(fd->fd_rh); 802 schedule_destroy_fd_instance(fd, in_callout); 803 if (!in_callout) 804 RIB_WUNLOCK(fd->fd_rh); 805 } 806 NET_EPOCH_EXIT(et); 807 } 808 809 void 810 fib_destroy_rib(struct rib_head *rh) 811 { 812 813 /* 814 * rnh has `is_dying` flag set, so setup of new fd's will fail at 815 * sync_algo() stage, preventing new entries to be added to the list 816 * of active algos. Remove all existing entries for the particular rib. 817 */ 818 fib_cleanup_algo(rh, false, false); 819 } 820 821 /* 822 * Finalises fd destruction by freeing all fd resources. 823 */ 824 static void 825 destroy_fd_instance(struct fib_data *fd) 826 { 827 828 FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); 829 830 /* Call destroy callback first */ 831 if (fd->fd_algo_data != NULL) 832 fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); 833 834 /* Nhop table */ 835 if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { 836 for (int i = 0; i < fd->number_nhops; i++) { 837 if (!is_idx_free(fd, i)) { 838 FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p", 839 i, fd->nh_idx[i]); 840 nhop_free_any(fd->nh_idx[i]); 841 } 842 } 843 free(fd->nh_idx, M_RTABLE); 844 } 845 if (fd->nh_ref_table != NULL) 846 free(fd->nh_ref_table, M_RTABLE); 847 848 fib_unref_algo(fd->fd_flm); 849 850 free(fd, M_RTABLE); 851 } 852 853 /* 854 * Epoch callback indicating fd is safe to destroy 855 */ 856 static void 857 destroy_fd_instance_epoch(epoch_context_t ctx) 858 { 859 struct fib_data *fd; 860 861 fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); 862 863 destroy_fd_instance(fd); 864 } 865 866 /* 867 * Tries to setup fd instance. 868 * - Allocates fd/nhop table 869 * - Runs algo:flm_init_cb algo init 870 * - Subscribes fd to the rib 871 * - Runs rtable dump 872 * - Adds instance to the list of active instances. 873 * 874 * Returns: operation result. 
Fills in @pfd with resulting fd on success. 875 * 876 */ 877 static enum flm_op_result 878 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 879 struct fib_data *old_fd, struct fib_data **pfd) 880 { 881 struct fib_data *fd; 882 size_t size; 883 enum flm_op_result result; 884 885 /* Allocate */ 886 fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); 887 if (fd == NULL) { 888 *pfd = NULL; 889 RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure"); 890 return (FLM_REBUILD); 891 } 892 *pfd = fd; 893 894 estimate_nhop_scale(old_fd, fd); 895 896 fd->fd_rh = rh; 897 fd->fd_gen = ++fib_gen; 898 fd->fd_family = rh->rib_family; 899 fd->fd_fibnum = rh->rib_fibnum; 900 callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0); 901 fd->fd_vnet = curvnet; 902 fd->fd_flm = flm; 903 904 FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd); 905 906 FIB_MOD_LOCK(); 907 flm->flm_refcount++; 908 FIB_MOD_UNLOCK(); 909 910 /* Allocate nhidx -> nhop_ptr table */ 911 size = fd->number_nhops * sizeof(void *); 912 fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 913 if (fd->nh_idx == NULL) { 914 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); 915 return (FLM_REBUILD); 916 } 917 918 /* Allocate nhop index refcount table */ 919 size = sizeof(struct nhop_ref_table); 920 size += fd->number_nhops * sizeof(uint32_t); 921 fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 922 if (fd->nh_ref_table == NULL) { 923 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); 924 return (FLM_REBUILD); 925 } 926 FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); 927 928 /* Okay, we're ready for algo init */ 929 void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; 930 result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); 931 if (result != FLM_SUCCESS) { 932 FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name); 933 return (result); 934 } 935 936 /* Try to subscribe */ 937 if (flm->flm_change_rib_item_cb != NULL) { 938 fd->fd_rs = rib_subscribe_locked(fd->fd_rh, 939 handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE); 940 if (fd->fd_rs == NULL) { 941 FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes"); 942 return (FLM_REBUILD); 943 } 944 } 945 946 /* Dump */ 947 result = sync_algo(fd); 948 if (result != FLM_SUCCESS) { 949 FD_PRINTF(LOG_INFO, fd, "rib sync failed"); 950 return (result); 951 } 952 FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); 953 954 FIB_MOD_LOCK(); 955 /* 956 * Insert fd in the beginning of a list, to maintain invariant 957 * that first matching entry for the AF/fib is always the active 958 * one. 959 */ 960 TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); 961 fd->fd_linked = true; 962 FIB_MOD_UNLOCK(); 963 964 return (FLM_SUCCESS); 965 } 966 967 /* 968 * Sets up algo @flm for table @rh and links it to the datapath. 
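 * Retries the setup up to FIB_MAX_TRIES times while the algo keeps
 * requesting a rebuild; a non-recoverable FLM_ERROR result bans the algo
 * for this table via flm_error_add().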
969 * 970 */ 971 static enum flm_op_result 972 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 973 struct fib_data *orig_fd, struct fib_data **pfd, bool attach) 974 { 975 struct fib_data *prev_fd, *new_fd; 976 enum flm_op_result result; 977 978 NET_EPOCH_ASSERT(); 979 RIB_WLOCK_ASSERT(rh); 980 981 prev_fd = orig_fd; 982 new_fd = NULL; 983 for (int i = 0; i < FIB_MAX_TRIES; i++) { 984 result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); 985 986 if ((result == FLM_SUCCESS) && attach) 987 result = attach_datapath(new_fd); 988 989 if ((prev_fd != NULL) && (prev_fd != orig_fd)) { 990 schedule_destroy_fd_instance(prev_fd, false); 991 prev_fd = NULL; 992 } 993 994 RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, 995 print_op_result(result)); 996 997 if (result == FLM_REBUILD) { 998 prev_fd = new_fd; 999 new_fd = NULL; 1000 continue; 1001 } 1002 1003 break; 1004 } 1005 1006 if (result != FLM_SUCCESS) { 1007 RH_PRINTF(LOG_WARNING, rh, 1008 "%s algo instance setup failed, failures=%d", flm->flm_name, 1009 orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0); 1010 /* update failure count */ 1011 FIB_MOD_LOCK(); 1012 if (orig_fd != NULL) 1013 orig_fd->fd_failed_rebuilds++; 1014 FIB_MOD_UNLOCK(); 1015 1016 /* Ban algo on non-recoverable error */ 1017 if (result == FLM_ERROR) 1018 flm_error_add(flm, rh->rib_fibnum); 1019 1020 if ((prev_fd != NULL) && (prev_fd != orig_fd)) 1021 schedule_destroy_fd_instance(prev_fd, false); 1022 if (new_fd != NULL) { 1023 schedule_destroy_fd_instance(new_fd, false); 1024 new_fd = NULL; 1025 } 1026 } 1027 1028 *pfd = new_fd; 1029 return (result); 1030 } 1031 1032 /* 1033 * Callout for all scheduled fd-related work. 1034 * - Checks if the current algo is still the best algo 1035 * - Creates a new instance of an algo for af/fib if desired. 1036 */ 1037 static void 1038 rebuild_fd_callout(void *_data) 1039 { 1040 struct fib_data *fd = (struct fib_data *)_data; 1041 struct epoch_tracker et; 1042 1043 FD_PRINTF(LOG_INFO, fd, "running callout rebuild"); 1044 1045 NET_EPOCH_ENTER(et); 1046 CURVNET_SET(fd->fd_vnet); 1047 rebuild_fd(fd); 1048 CURVNET_RESTORE(); 1049 NET_EPOCH_EXIT(et); 1050 } 1051 1052 /* 1053 * Tries to create new algo instance based on @fd data. 1054 * Returns true on success. 1055 */ 1056 static bool 1057 rebuild_fd(struct fib_data *fd) 1058 { 1059 struct fib_data *fd_new, *fd_tmp; 1060 struct fib_lookup_module *flm_new = NULL; 1061 enum flm_op_result result; 1062 bool need_rebuild = false; 1063 1064 NET_EPOCH_ASSERT(); 1065 RIB_WLOCK_ASSERT(fd->fd_rh); 1066 1067 need_rebuild = fd->fd_need_rebuild; 1068 fd->fd_need_rebuild = false; 1069 fd->fd_force_eval = false; 1070 fd->fd_num_changes = 0; 1071 1072 /* First, check if we're still OK to use this algo */ 1073 if (!is_algo_fixed(fd->fd_rh)) 1074 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1075 if ((flm_new == NULL) && (!need_rebuild)) { 1076 /* Keep existing algo, no need to rebuild. 
*/ 1077 return (true); 1078 } 1079 1080 if (flm_new == NULL) { 1081 flm_new = fd->fd_flm; 1082 fd_tmp = fd; 1083 } else { 1084 fd_tmp = NULL; 1085 FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); 1086 } 1087 result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); 1088 if (fd_tmp == NULL) { 1089 /* fd_new represents new algo */ 1090 fib_unref_algo(flm_new); 1091 } 1092 if (result != FLM_SUCCESS) { 1093 FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); 1094 return (false); 1095 } 1096 FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); 1097 1098 /* Remove old instance */ 1099 schedule_destroy_fd_instance(fd, true); 1100 1101 return (true); 1102 } 1103 1104 /* 1105 * Finds algo by name/family. 1106 * Returns referenced algo or NULL. 1107 */ 1108 static struct fib_lookup_module * 1109 fib_find_algo(const char *algo_name, int family) 1110 { 1111 struct fib_lookup_module *flm; 1112 1113 FIB_MOD_LOCK(); 1114 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1115 if ((strcmp(flm->flm_name, algo_name) == 0) && 1116 (family == flm->flm_family)) { 1117 flm->flm_refcount++; 1118 FIB_MOD_UNLOCK(); 1119 return (flm); 1120 } 1121 } 1122 FIB_MOD_UNLOCK(); 1123 1124 return (NULL); 1125 } 1126 1127 static void 1128 fib_unref_algo(struct fib_lookup_module *flm) 1129 { 1130 1131 FIB_MOD_LOCK(); 1132 flm->flm_refcount--; 1133 FIB_MOD_UNLOCK(); 1134 } 1135 1136 static int 1137 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) 1138 { 1139 struct fib_lookup_module *flm = NULL; 1140 struct fib_data *fd = NULL; 1141 char old_algo_name[32], algo_name[32]; 1142 struct rib_head *rh = NULL; 1143 enum flm_op_result result; 1144 struct epoch_tracker et; 1145 int error; 1146 1147 /* Fetch current algo/rib for af/family */ 1148 FIB_MOD_LOCK(); 1149 TAILQ_FOREACH(fd, &V_fib_data_list, entries) { 1150 if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) 1151 break; 1152 } 1153 if (fd == NULL) { 1154 FIB_MOD_UNLOCK(); 1155 return (ENOENT); 1156 } 1157 rh = fd->fd_rh; 1158 strlcpy(old_algo_name, fd->fd_flm->flm_name, 1159 sizeof(old_algo_name)); 1160 FIB_MOD_UNLOCK(); 1161 1162 strlcpy(algo_name, old_algo_name, sizeof(algo_name)); 1163 error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); 1164 if (error != 0 || req->newptr == NULL) 1165 return (error); 1166 1167 if (strcmp(algo_name, old_algo_name) == 0) 1168 return (0); 1169 1170 /* New algorithm name is different */ 1171 flm = fib_find_algo(algo_name, family); 1172 if (flm == NULL) { 1173 RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); 1174 return (ESRCH); 1175 } 1176 1177 fd = NULL; 1178 NET_EPOCH_ENTER(et); 1179 RIB_WLOCK(rh); 1180 result = setup_fd_instance(flm, rh, NULL, &fd, true); 1181 RIB_WUNLOCK(rh); 1182 NET_EPOCH_EXIT(et); 1183 fib_unref_algo(flm); 1184 if (result != FLM_SUCCESS) 1185 return (EINVAL); 1186 1187 /* Disable automated jumping between algos */ 1188 FIB_MOD_LOCK(); 1189 set_algo_fixed(rh); 1190 FIB_MOD_UNLOCK(); 1191 /* Remove old instance(s) */ 1192 fib_cleanup_algo(rh, true, false); 1193 1194 /* Drain cb so user can unload the module after userret if so desired */ 1195 epoch_drain_callbacks(net_epoch_preempt); 1196 1197 return (0); 1198 } 1199 1200 #ifdef INET 1201 static int 1202 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) 1203 { 1204 1205 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req)); 1206 } 1207 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, 1208 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | 
CTLFLAG_MPSAFE, NULL, 0, 1209 set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); 1210 #endif 1211 1212 #ifdef INET6 1213 static int 1214 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) 1215 { 1216 1217 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); 1218 } 1219 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, 1220 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1221 set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); 1222 #endif 1223 1224 static void 1225 destroy_fdh_epoch(epoch_context_t ctx) 1226 { 1227 struct fib_dp_header *fdh; 1228 1229 fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); 1230 free(fdh, M_RTABLE); 1231 } 1232 1233 static struct fib_dp_header * 1234 alloc_fib_dp_array(uint32_t num_tables, bool waitok) 1235 { 1236 size_t sz; 1237 struct fib_dp_header *fdh; 1238 1239 sz = sizeof(struct fib_dp_header); 1240 sz += sizeof(struct fib_dp) * num_tables; 1241 fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); 1242 if (fdh != NULL) 1243 fdh->fdh_num_tables = num_tables; 1244 return (fdh); 1245 } 1246 1247 static struct fib_dp_header * 1248 get_fib_dp_header(struct fib_dp *dp) 1249 { 1250 1251 return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); 1252 } 1253 1254 /* 1255 * Replace per-family index pool @pdp with a new one which 1256 * contains updated callback/algo data from @fd. 1257 * Returns 0 on success. 1258 */ 1259 static enum flm_op_result 1260 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd) 1261 { 1262 struct fib_dp_header *new_fdh, *old_fdh; 1263 1264 NET_EPOCH_ASSERT(); 1265 1266 FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", 1267 curvnet, fd->fd_dp.f, fd->fd_dp.arg); 1268 1269 FIB_MOD_LOCK(); 1270 old_fdh = get_fib_dp_header(*pdp); 1271 new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); 1272 FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); 1273 if (new_fdh == NULL) { 1274 FIB_MOD_UNLOCK(); 1275 FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); 1276 return (FLM_REBUILD); 1277 } 1278 1279 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1280 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1281 /* Update relevant data structure for @fd */ 1282 new_fdh->fdh_idx[fd->fd_fibnum] = fd->fd_dp; 1283 1284 /* Ensure memcpy() writes have completed */ 1285 atomic_thread_fence_rel(); 1286 /* Set new datapath pointer */ 1287 *pdp = &new_fdh->fdh_idx[0]; 1288 FIB_MOD_UNLOCK(); 1289 FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); 1290 1291 epoch_call(net_epoch_preempt, destroy_fdh_epoch, 1292 &old_fdh->fdh_epoch_ctx); 1293 1294 return (FLM_SUCCESS); 1295 } 1296 1297 static struct fib_dp ** 1298 get_family_dp_ptr(int family) 1299 { 1300 switch (family) { 1301 case AF_INET: 1302 return (&V_inet_dp); 1303 case AF_INET6: 1304 return (&V_inet6_dp); 1305 } 1306 return (NULL); 1307 } 1308 1309 /* 1310 * Make datapath use fib instance @fd 1311 */ 1312 static enum flm_op_result 1313 attach_datapath(struct fib_data *fd) 1314 { 1315 struct fib_dp **pdp; 1316 1317 pdp = get_family_dp_ptr(fd->fd_family); 1318 return (replace_rtables_family(pdp, fd)); 1319 } 1320 1321 /* 1322 * Grow datapath pointers array. 1323 * Called from sysctl handler on growing number of routing tables. 
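 * Copies the existing per-fib entries into the new array, so the new size
 * is expected to be no smaller than the current number of tables.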
1324 */ 1325 static void 1326 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) 1327 { 1328 struct fib_dp_header *new_fdh, *old_fdh = NULL; 1329 1330 new_fdh = alloc_fib_dp_array(new_num_tables, true); 1331 1332 FIB_MOD_LOCK(); 1333 if (*pdp != NULL) { 1334 old_fdh = get_fib_dp_header(*pdp); 1335 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1336 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1337 } 1338 1339 /* Wait till all writes completed */ 1340 atomic_thread_fence_rel(); 1341 1342 *pdp = &new_fdh->fdh_idx[0]; 1343 FIB_MOD_UNLOCK(); 1344 1345 if (old_fdh != NULL) 1346 epoch_call(net_epoch_preempt, destroy_fdh_epoch, 1347 &old_fdh->fdh_epoch_ctx); 1348 } 1349 1350 /* 1351 * Grows per-AF arrays of datapath pointers for each supported family. 1352 * Called from fibs resize sysctl handler. 1353 */ 1354 void 1355 fib_grow_rtables(uint32_t new_num_tables) 1356 { 1357 1358 #ifdef INET 1359 grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); 1360 #endif 1361 #ifdef INET6 1362 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); 1363 #endif 1364 } 1365 1366 void 1367 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) 1368 { 1369 1370 bzero(rinfo, sizeof(struct rib_rtable_info)); 1371 rinfo->num_prefixes = rh->rnh_prefixes; 1372 rinfo->num_nhops = nhops_get_count(rh); 1373 #ifdef ROUTE_MPATH 1374 rinfo->num_nhgrp = nhgrp_get_count(rh); 1375 #endif 1376 } 1377 1378 /* 1379 * Accessor to get rib instance @fd is attached to. 1380 */ 1381 struct rib_head * 1382 fib_get_rh(struct fib_data *fd) 1383 { 1384 1385 return (fd->fd_rh); 1386 } 1387 1388 /* 1389 * Accessor to export idx->nhop array 1390 */ 1391 struct nhop_object ** 1392 fib_get_nhop_array(struct fib_data *fd) 1393 { 1394 1395 return (fd->nh_idx); 1396 } 1397 1398 static uint32_t 1399 get_nhop_idx(struct nhop_object *nh) 1400 { 1401 #ifdef ROUTE_MPATH 1402 if (NH_IS_NHGRP(nh)) 1403 return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); 1404 else 1405 return (nhop_get_idx(nh) * 2); 1406 #else 1407 return (nhop_get_idx(nh)); 1408 #endif 1409 } 1410 1411 uint32_t 1412 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) 1413 { 1414 1415 return (get_nhop_idx(nh)); 1416 } 1417 1418 static bool 1419 is_idx_free(struct fib_data *fd, uint32_t index) 1420 { 1421 1422 return (fd->nh_ref_table->refcnt[index] == 0); 1423 } 1424 1425 static uint32_t 1426 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) 1427 { 1428 uint32_t idx = get_nhop_idx(nh); 1429 1430 if (idx >= fd->number_nhops) { 1431 fd->hit_nhops = 1; 1432 return (0); 1433 } 1434 1435 if (is_idx_free(fd, idx)) { 1436 nhop_ref_any(nh); 1437 fd->nh_idx[idx] = nh; 1438 fd->nh_ref_table->count++; 1439 FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); 1440 } 1441 fd->nh_ref_table->refcnt[idx]++; 1442 1443 return (idx); 1444 } 1445 1446 struct nhop_release_data { 1447 struct nhop_object *nh; 1448 struct epoch_context ctx; 1449 }; 1450 1451 static void 1452 release_nhop_epoch(epoch_context_t ctx) 1453 { 1454 struct nhop_release_data *nrd; 1455 1456 nrd = __containerof(ctx, struct nhop_release_data, ctx); 1457 nhop_free_any(nrd->nh); 1458 free(nrd, M_TEMP); 1459 } 1460 1461 /* 1462 * Delays nexthop refcount release. 1463 * Datapath may have the datastructures not updated yet, so the old 1464 * nexthop may still be returned till the end of current epoch. 
Delay 1465 * refcount removal, as we may be removing the last instance, which will 1466 * trigger nexthop deletion, rendering returned nexthop invalid. 1467 */ 1468 static void 1469 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) 1470 { 1471 struct nhop_release_data *nrd; 1472 1473 nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); 1474 if (nrd != NULL) { 1475 nrd->nh = nh; 1476 epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx); 1477 } else { 1478 /* 1479 * Unable to allocate memory. Leak nexthop to maintain guarantee 1480 * that each nhop can be referenced. 1481 */ 1482 FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); 1483 } 1484 } 1485 1486 static void 1487 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) 1488 { 1489 uint32_t idx = get_nhop_idx(nh); 1490 1491 KASSERT((idx < fd->number_nhops), ("invalid nhop index")); 1492 KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); 1493 1494 fd->nh_ref_table->refcnt[idx]--; 1495 if (fd->nh_ref_table->refcnt[idx] == 0) { 1496 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); 1497 fib_schedule_release_nhop(fd, fd->nh_idx[idx]); 1498 } 1499 } 1500 1501 static void 1502 set_algo_fixed(struct rib_head *rh) 1503 { 1504 switch (rh->rib_family) { 1505 #ifdef INET 1506 case AF_INET: 1507 V_algo_fixed_inet = true; 1508 break; 1509 #endif 1510 #ifdef INET6 1511 case AF_INET6: 1512 V_algo_fixed_inet6 = true; 1513 break; 1514 #endif 1515 } 1516 } 1517 1518 static bool 1519 is_algo_fixed(struct rib_head *rh) 1520 { 1521 1522 switch (rh->rib_family) { 1523 #ifdef INET 1524 case AF_INET: 1525 return (V_algo_fixed_inet); 1526 #endif 1527 #ifdef INET6 1528 case AF_INET6: 1529 return (V_algo_fixed_inet6); 1530 #endif 1531 } 1532 return (false); 1533 } 1534 1535 /* 1536 * Runs the check on what would be the best algo for rib @rh, assuming 1537 * that the current algo is the one specified by @orig_flm. Note that 1538 * it can be NULL for initial selection. 1539 * 1540 * Returns referenced new algo or NULL if the current one is the best. 1541 */ 1542 static struct fib_lookup_module * 1543 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) 1544 { 1545 uint8_t preference, curr_preference = 0, best_preference = 0; 1546 struct fib_lookup_module *flm, *best_flm = NULL; 1547 struct rib_rtable_info rinfo; 1548 int candidate_algos = 0; 1549 1550 fib_get_rtable_info(rh, &rinfo); 1551 1552 FIB_MOD_LOCK(); 1553 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1554 if (flm->flm_family != rh->rib_family) 1555 continue; 1556 candidate_algos++; 1557 preference = flm->flm_get_pref(&rinfo); 1558 if (preference > best_preference) { 1559 if (!flm_error_check(flm, rh->rib_fibnum)) { 1560 best_preference = preference; 1561 best_flm = flm; 1562 } 1563 } 1564 if (flm == orig_flm) 1565 curr_preference = preference; 1566 } 1567 if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference)) 1568 best_flm->flm_refcount++; 1569 else 1570 best_flm = NULL; 1571 FIB_MOD_UNLOCK(); 1572 1573 RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", 1574 candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, 1575 best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"), 1576 best_preference); 1577 1578 return (best_flm); 1579 } 1580 1581 /* 1582 * Called when new route table is created. 1583 * Selects, allocates and attaches fib algo for the table. 
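 * Returns 0 on success, ENOENT if no algo is available for the family, or
 * EINVAL if the selected algo failed to set up.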
1584 */ 1585 int 1586 fib_select_algo_initial(struct rib_head *rh) 1587 { 1588 struct fib_lookup_module *flm; 1589 struct fib_data *fd = NULL; 1590 enum flm_op_result result; 1591 struct epoch_tracker et; 1592 int error = 0; 1593 1594 flm = fib_check_best_algo(rh, NULL); 1595 if (flm == NULL) { 1596 RH_PRINTF(LOG_CRIT, rh, "no algo selected"); 1597 return (ENOENT); 1598 } 1599 RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); 1600 1601 NET_EPOCH_ENTER(et); 1602 RIB_WLOCK(rh); 1603 result = setup_fd_instance(flm, rh, NULL, &fd, false); 1604 RIB_WUNLOCK(rh); 1605 NET_EPOCH_EXIT(et); 1606 1607 RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); 1608 if (result == FLM_SUCCESS) { 1609 1610 /* 1611 * Attach datapath directly to avoid multiple reallocations 1612 * during fib growth 1613 */ 1614 struct fib_dp_header *fdp; 1615 struct fib_dp **pdp; 1616 1617 pdp = get_family_dp_ptr(rh->rib_family); 1618 if (pdp != NULL) { 1619 fdp = get_fib_dp_header(*pdp); 1620 fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp; 1621 FD_PRINTF(LOG_INFO, fd, "datapath attached"); 1622 } 1623 } else { 1624 error = EINVAL; 1625 RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); 1626 } 1627 1628 fib_unref_algo(flm); 1629 1630 return (error); 1631 } 1632 1633 /* 1634 * Registers fib lookup module within the subsystem. 1635 */ 1636 int 1637 fib_module_register(struct fib_lookup_module *flm) 1638 { 1639 1640 FIB_MOD_LOCK(); 1641 ALGO_PRINTF("attaching %s to %s", flm->flm_name, 1642 print_family(flm->flm_family)); 1643 TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); 1644 FIB_MOD_UNLOCK(); 1645 1646 return (0); 1647 } 1648 1649 /* 1650 * Tries to unregister fib lookup module. 1651 * 1652 * Returns 0 on success, EBUSY if module is still used 1653 * by some of the tables. 1654 */ 1655 int 1656 fib_module_unregister(struct fib_lookup_module *flm) 1657 { 1658 1659 FIB_MOD_LOCK(); 1660 if (flm->flm_refcount > 0) { 1661 FIB_MOD_UNLOCK(); 1662 return (EBUSY); 1663 } 1664 fib_error_clear_flm(flm); 1665 ALGO_PRINTF("detaching %s from %s", flm->flm_name, 1666 print_family(flm->flm_family)); 1667 TAILQ_REMOVE(&all_algo_list, flm, entries); 1668 FIB_MOD_UNLOCK(); 1669 1670 return (0); 1671 } 1672 1673 void 1674 vnet_fib_init(void) 1675 { 1676 1677 TAILQ_INIT(&V_fib_data_list); 1678 } 1679 1680 void 1681 vnet_fib_destroy(void) 1682 { 1683 1684 FIB_MOD_LOCK(); 1685 fib_error_clear(); 1686 FIB_MOD_UNLOCK(); 1687 } 1688