1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_route.h" 33 34 #include <sys/param.h> 35 #include <sys/eventhandler.h> 36 #include <sys/kernel.h> 37 #include <sys/sbuf.h> 38 #include <sys/lock.h> 39 #include <sys/rmlock.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/module.h> 43 #include <sys/kernel.h> 44 #include <sys/priv.h> 45 #include <sys/proc.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <sys/syslog.h> 50 #include <sys/queue.h> 51 #include <net/vnet.h> 52 53 #include <net/if.h> 54 #include <net/if_var.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_var.h> 58 #include <netinet/ip.h> 59 #include <netinet/ip_var.h> 60 #ifdef INET6 61 #include <netinet/ip6.h> 62 #include <netinet6/ip6_var.h> 63 #endif 64 65 #include <net/route.h> 66 #include <net/route/nhop.h> 67 #include <net/route/route_ctl.h> 68 #include <net/route/route_var.h> 69 #include <net/route/fib_algo.h> 70 71 #include <machine/stdarg.h> 72 73 /* 74 * Fib lookup framework. 75 * 76 * This framework enables accelerated longest-prefix-match lookups for the 77 * routing tables by adding the ability to dynamically attach/detach lookup 78 * algorithms implementation to/from the datapath. 79 * 80 * flm - fib lookup modules - implementation of particular lookup algorithm 81 * fd - fib data - instance of an flm bound to specific routing table 82 * 83 * This file provides main framework functionality. 84 * 85 * The following are the features provided by the framework 86 * 87 * 1) nexhops abstraction -> provides transparent referencing, indexing 88 * and efficient idx->ptr mappings for nexthop and nexthop groups. 89 * 2) Routing table synchronisation 90 * 3) dataplane attachment points 91 * 4) automatic algorithm selection based on the provided preference. 92 * 93 * 94 * DATAPATH 95 * For each supported address family, there is a an allocated array of fib_dp 96 * structures, indexed by fib number. Each array entry contains callback function 97 * and its argument. This function will be called with a family-specific lookup key, 98 * scope and provided argument. This array gets re-created every time when new algo 99 * instance gets created. Please take a look at the replace_rtables_family() function 100 * for more details. 101 * 102 */ 103 104 SYSCTL_DECL(_net_route); 105 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 106 "Fib algorithm lookups"); 107 108 /* Algorithm sync policy */ 109 110 /* Time interval to bucket updates */ 111 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50; 112 #define V_update_bucket_time_ms VNET(update_bucket_time_ms) 113 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET, 114 &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate"); 115 116 /* Minimum update rate to delay sync */ 117 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500; 118 #define V_bucket_change_threshold_rate VNET(bucket_change_threshold_rate) 119 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET, 120 &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync"); 121 122 /* Max allowed delay to sync */ 123 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000; 124 #define V_fib_max_sync_delay_ms VNET(fib_max_sync_delay_ms) 125 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET, 126 &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)"); 127 128 129 #ifdef INET6 130 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false; 131 #define V_algo_fixed_inet6 VNET(algo_fixed_inet6) 132 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 133 "IPv6 longest prefix match lookups"); 134 #endif 135 #ifdef INET 136 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false; 137 #define V_algo_fixed_inet VNET(algo_fixed_inet) 138 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 139 "IPv4 longest prefix match lookups"); 140 #endif 141 142 /* Fib instance counter */ 143 static uint32_t fib_gen = 0; 144 145 struct nhop_ref_table { 146 uint32_t count; 147 int32_t refcnt[0]; 148 }; 149 150 enum fib_callout_action { 151 FDA_NONE, /* No callout scheduled */ 152 FDA_REBUILD, /* Asks to rebuild algo instance */ 153 FDA_EVAL, /* Asks to evaluate if the current algo is still be best */ 154 FDA_BATCH, /* Asks to submit batch of updates to the algo */ 155 }; 156 157 struct fib_sync_status { 158 struct timeval diverge_time; /* ts when diverged */ 159 uint32_t num_changes; /* number of changes since sync */ 160 uint32_t bucket_changes; /* num changes within the current bucket */ 161 uint64_t bucket_id; /* 50ms bucket # */ 162 struct fib_change_queue fd_change_queue;/* list of scheduled entries */ 163 }; 164 165 /* 166 * Data structure for the fib lookup instance tied to the particular rib. 167 */ 168 struct fib_data { 169 uint32_t number_nhops; /* current # of nhops */ 170 uint8_t hit_nhops; /* true if out of nhop limit */ 171 uint8_t init_done; /* true if init is competed */ 172 uint32_t fd_dead:1; /* Scheduled for deletion */ 173 uint32_t fd_linked:1; /* true if linked */ 174 uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ 175 uint32_t fd_batch:1; /* true if batched notification scheduled */ 176 uint8_t fd_family; /* family */ 177 uint32_t fd_fibnum; /* fibnum */ 178 uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ 179 uint32_t fd_gen; /* instance gen# */ 180 struct callout fd_callout; /* rebuild callout */ 181 enum fib_callout_action fd_callout_action; /* Callout action to take */ 182 void *fd_algo_data; /* algorithm data */ 183 struct nhop_object **nh_idx; /* nhop idx->ptr array */ 184 struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ 185 struct rib_head *fd_rh; /* RIB table we're attached to */ 186 struct rib_subscription *fd_rs; /* storing table subscription */ 187 struct fib_dp fd_dp; /* fib datapath data */ 188 struct vnet *fd_vnet; /* vnet fib belongs to */ 189 struct epoch_context fd_epoch_ctx; /* epoch context for deletion */ 190 struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ 191 struct fib_sync_status fd_ss; /* State relevant to the rib sync */ 192 uint32_t fd_num_changes; /* number of changes since last callout */ 193 TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ 194 }; 195 196 static bool rebuild_fd(struct fib_data *fd, const char *reason); 197 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new); 198 static void handle_fd_callout(void *_data); 199 static void destroy_fd_instance_epoch(epoch_context_t ctx); 200 static bool is_idx_free(struct fib_data *fd, uint32_t index); 201 static void set_algo_fixed(struct rib_head *rh); 202 static bool is_algo_fixed(struct rib_head *rh); 203 204 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); 205 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); 206 207 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, 208 struct fib_lookup_module *orig_flm); 209 static void fib_unref_algo(struct fib_lookup_module *flm); 210 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum); 211 212 struct mtx fib_mtx; 213 #define FIB_MOD_LOCK() mtx_lock(&fib_mtx) 214 #define FIB_MOD_UNLOCK() mtx_unlock(&fib_mtx) 215 #define FIB_MOD_LOCK_ASSERT() mtx_assert(&fib_mtx, MA_OWNED) 216 217 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF); 218 219 /* Algorithm has to be this percent better than the current to switch */ 220 #define BEST_DIFF_PERCENT (5 * 256 / 100) 221 /* Schedule algo re-evaluation X seconds after a change */ 222 #define ALGO_EVAL_DELAY_MS 30000 223 /* Force algo re-evaluation after X changes */ 224 #define ALGO_EVAL_NUM_ROUTES 100 225 /* Try to setup algorithm X times */ 226 #define FIB_MAX_TRIES 32 227 /* Max amount of supported nexthops */ 228 #define FIB_MAX_NHOPS 262144 229 #define FIB_CALLOUT_DELAY_MS 50 230 231 232 /* Debug */ 233 static int flm_debug_level = LOG_NOTICE; 234 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, 235 &flm_debug_level, 0, "debuglevel"); 236 #define FLM_MAX_DEBUG_LEVEL LOG_DEBUG 237 #ifndef LOG_DEBUG2 238 #define LOG_DEBUG2 8 239 #endif 240 241 #define _PASS_MSG(_l) (flm_debug_level >= (_l)) 242 #define ALGO_PRINTF(_l, _fmt, ...) if (_PASS_MSG(_l)) { \ 243 printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__); \ 244 } 245 #define _ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \ 246 printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\ 247 print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__) 248 #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ 249 printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) 250 #define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ 251 _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ 252 } 253 #define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) 254 #define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ 255 _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ 256 _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__); \ 257 } 258 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2 259 #define FD_PRINTF_LOG_DEBUG2 _FD_PRINTF 260 #else 261 #define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...) 262 #endif 263 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG 264 #define FD_PRINTF_LOG_DEBUG _FD_PRINTF 265 #else 266 #define FD_PRINTF_LOG_DEBUG() 267 #endif 268 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO 269 #define FD_PRINTF_LOG_INFO _FD_PRINTF 270 #else 271 #define FD_PRINTF_LOG_INFO() 272 #endif 273 #define FD_PRINTF_LOG_NOTICE _FD_PRINTF 274 #define FD_PRINTF_LOG_ERR _FD_PRINTF 275 #define FD_PRINTF_LOG_WARNING _FD_PRINTF 276 277 278 /* List of all registered lookup algorithms */ 279 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); 280 281 /* List of all fib lookup instances in the vnet */ 282 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); 283 #define V_fib_data_list VNET(fib_data_list) 284 285 /* Datastructure for storing non-transient fib lookup module failures */ 286 struct fib_error { 287 int fe_family; 288 uint32_t fe_fibnum; /* failed rtable */ 289 struct fib_lookup_module *fe_flm; /* failed module */ 290 TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ 291 }; 292 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); 293 #define V_fib_error_list VNET(fib_error_list) 294 295 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */ 296 struct fib_dp_header { 297 struct epoch_context fdh_epoch_ctx; 298 uint32_t fdh_num_tables; 299 struct fib_dp fdh_idx[0]; 300 }; 301 302 /* 303 * Tries to add new non-transient algorithm error to the list of 304 * errors. 305 * Returns true on success. 306 */ 307 static bool 308 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) 309 { 310 struct fib_error *fe; 311 312 fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); 313 if (fe == NULL) 314 return (false); 315 fe->fe_flm = flm; 316 fe->fe_family = flm->flm_family; 317 fe->fe_fibnum = fibnum; 318 319 FIB_MOD_LOCK(); 320 /* Avoid duplicates by checking if error already exists first */ 321 if (flm_error_check(flm, fibnum)) { 322 FIB_MOD_UNLOCK(); 323 free(fe, M_TEMP); 324 return (true); 325 } 326 TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); 327 FIB_MOD_UNLOCK(); 328 329 return (true); 330 } 331 332 /* 333 * True if non-transient error has been registered for @flm in @fibnum. 334 */ 335 static bool 336 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) 337 { 338 const struct fib_error *fe; 339 340 TAILQ_FOREACH(fe, &V_fib_error_list, entries) { 341 if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) 342 return (true); 343 } 344 345 return (false); 346 } 347 348 /* 349 * Clear all errors of algo specified by @flm. 350 */ 351 static void 352 fib_error_clear_flm(struct fib_lookup_module *flm) 353 { 354 struct fib_error *fe, *fe_tmp; 355 356 FIB_MOD_LOCK_ASSERT(); 357 358 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 359 if (fe->fe_flm == flm) { 360 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 361 free(fe, M_TEMP); 362 } 363 } 364 } 365 366 /* 367 * Clears all errors in current VNET. 368 */ 369 static void 370 fib_error_clear(void) 371 { 372 struct fib_error *fe, *fe_tmp; 373 374 FIB_MOD_LOCK_ASSERT(); 375 376 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 377 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 378 free(fe, M_TEMP); 379 } 380 } 381 382 static const char * 383 print_op_result(enum flm_op_result result) 384 { 385 switch (result) { 386 case FLM_SUCCESS: 387 return "success"; 388 case FLM_REBUILD: 389 return "rebuild"; 390 case FLM_BATCH: 391 return "batch"; 392 case FLM_ERROR: 393 return "error"; 394 } 395 396 return "unknown"; 397 } 398 399 static const char * 400 print_family(int family) 401 { 402 403 if (family == AF_INET) 404 return ("inet"); 405 else if (family == AF_INET6) 406 return ("inet6"); 407 else 408 return ("unknown"); 409 } 410 411 /* 412 * Debug function used by lookup algorithms. 413 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " 414 */ 415 void 416 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) 417 { 418 char buf[128]; 419 va_list ap; 420 421 if (level > flm_debug_level) 422 return; 423 424 va_start(ap, fmt); 425 vsnprintf(buf, sizeof(buf), fmt, ap); 426 va_end(ap); 427 428 _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, 429 fd->fd_gen, func, "%s", buf); 430 } 431 432 /* 433 * Outputs list of algorithms supported by the provided address family. 434 */ 435 static int 436 print_algos_sysctl(struct sysctl_req *req, int family) 437 { 438 struct fib_lookup_module *flm; 439 struct sbuf sbuf; 440 int error, count = 0; 441 442 error = sysctl_wire_old_buffer(req, 0); 443 if (error == 0) { 444 sbuf_new_for_sysctl(&sbuf, NULL, 512, req); 445 TAILQ_FOREACH(flm, &all_algo_list, entries) { 446 if (flm->flm_family == family) { 447 if (count++ > 0) 448 sbuf_cat(&sbuf, ", "); 449 sbuf_cat(&sbuf, flm->flm_name); 450 } 451 } 452 error = sbuf_finish(&sbuf); 453 sbuf_delete(&sbuf); 454 } 455 return (error); 456 } 457 458 #ifdef INET6 459 static int 460 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) 461 { 462 463 return (print_algos_sysctl(req, AF_INET6)); 464 } 465 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, 466 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 467 print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); 468 #endif 469 470 #ifdef INET 471 static int 472 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) 473 { 474 475 return (print_algos_sysctl(req, AF_INET)); 476 } 477 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, 478 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 479 print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); 480 #endif 481 482 /* 483 * Calculate delay between repeated failures. 484 * Returns current delay in milliseconds. 485 */ 486 static uint32_t 487 callout_calc_delay_ms(struct fib_data *fd) 488 { 489 uint32_t shift; 490 491 if (fd->fd_failed_rebuilds > 10) 492 shift = 10; 493 else 494 shift = fd->fd_failed_rebuilds; 495 496 return ((1 << shift) * FIB_CALLOUT_DELAY_MS); 497 } 498 499 static void 500 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms) 501 { 502 503 FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action); 504 fd->fd_callout_action = action; 505 callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0, 506 handle_fd_callout, fd, 0); 507 } 508 509 static void 510 schedule_fd_rebuild(struct fib_data *fd, const char *reason) 511 { 512 513 RIB_WLOCK_ASSERT(fd->fd_rh); 514 515 if (!fd->fd_need_rebuild) { 516 fd->fd_need_rebuild = true; 517 /* Stop batch updates */ 518 fd->fd_batch = false; 519 520 /* 521 * Potentially re-schedules pending callout 522 * initiated by schedule_algo_eval. 523 */ 524 FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)", 525 reason, fd->fd_failed_rebuilds); 526 schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd)); 527 } 528 } 529 530 static void 531 sync_rib_gen(struct fib_data *fd) 532 { 533 FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib); 534 fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib; 535 } 536 537 static int64_t 538 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv) 539 { 540 int64_t diff = 0; 541 542 diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000; 543 diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000; 544 545 return (diff); 546 } 547 548 static void 549 add_tv_diff_ms(struct timeval *tv, int ms) 550 { 551 tv->tv_sec += ms / 1000; 552 ms = ms % 1000; 553 if (ms * 1000 + tv->tv_usec < 1000000) 554 tv->tv_usec += ms * 1000; 555 else { 556 tv->tv_sec += 1; 557 tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000; 558 } 559 } 560 561 /* 562 * Marks the time when algo state diverges from the rib state. 563 */ 564 static void 565 mark_diverge_time(struct fib_data *fd) 566 { 567 struct fib_sync_status *fd_ss = &fd->fd_ss; 568 569 getmicrouptime(&fd_ss->diverge_time); 570 fd_ss->bucket_id = 0; 571 fd_ss->bucket_changes = 0; 572 } 573 574 /* 575 * Calculates and updates the next algorithm sync time, based on the current activity. 576 * 577 * The intent is to provide reasonable balance between the update 578 * latency and efficient batching when changing large amount of routes. 579 * 580 * High-level algorithm looks the following: 581 * 1) all changes are bucketed in 50ms intervals 582 * 2) If amount of changes within the bucket is greater than the threshold, 583 * the update gets delayed, up to maximum delay threshold. 584 */ 585 static void 586 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action) 587 { 588 uint32_t bucket_id, new_delay = 0; 589 struct timeval tv; 590 591 /* Fetch all variables at once to ensure consistent reads */ 592 uint32_t bucket_time_ms = V_update_bucket_time_ms; 593 uint32_t threshold_rate = V_bucket_change_threshold_rate; 594 uint32_t max_delay_ms = V_fib_max_sync_delay_ms; 595 596 if (bucket_time_ms == 0) 597 bucket_time_ms = 50; 598 /* calculate per-bucket threshold rate */ 599 threshold_rate = threshold_rate * bucket_time_ms / 1000; 600 601 getmicrouptime(&tv); 602 603 struct fib_sync_status *fd_ss = &fd->fd_ss; 604 605 bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms; 606 607 if (fd_ss->bucket_id == bucket_id) { 608 fd_ss->bucket_changes++; 609 if (fd_ss->bucket_changes == threshold_rate) { 610 new_delay = (bucket_id + 2) * bucket_time_ms; 611 if (new_delay <= max_delay_ms) { 612 FD_PRINTF(LOG_DEBUG, fd, 613 "hit threshold of %u routes, delay update," 614 "bucket: %u, total delay: %u", 615 threshold_rate, bucket_id + 1, new_delay); 616 } else { 617 new_delay = 0; 618 FD_PRINTF(LOG_DEBUG, fd, 619 "maximum sync delay (%u ms) reached", max_delay_ms); 620 } 621 } else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1)) 622 new_delay = bucket_time_ms; 623 } else { 624 fd_ss->bucket_id = bucket_id; 625 fd_ss->bucket_changes = 1; 626 } 627 628 if (new_delay > 0) { 629 /* Calculated time has been updated */ 630 struct timeval new_tv = fd_ss->diverge_time; 631 add_tv_diff_ms(&new_tv, new_delay); 632 633 int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv); 634 schedule_callout(fd, action, delay_ms); 635 } 636 } 637 638 static void 639 update_algo_state(struct fib_data *fd) 640 { 641 642 RIB_WLOCK_ASSERT(fd->fd_rh); 643 644 if (fd->fd_batch || fd->fd_need_rebuild) { 645 enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH; 646 update_rebuild_delay(fd, action); 647 return; 648 } 649 650 if (fd->fd_num_changes++ == 0) { 651 /* Start callout to consider switch */ 652 if (!callout_pending(&fd->fd_callout)) 653 schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS); 654 } else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) { 655 /* Reset callout to exec immediately */ 656 if (fd->fd_callout_action == FDA_EVAL) 657 schedule_callout(fd, FDA_EVAL, 1); 658 } 659 } 660 661 static bool 662 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc) 663 { 664 struct nhop_object *nh; 665 666 /* Sync addition/removal of interface routes */ 667 switch (rc->rc_cmd) { 668 case RTM_ADD: 669 nh = rc->rc_nh_new; 670 if (!NH_IS_NHGRP(nh)) { 671 if (!(nh->nh_flags & NHF_GATEWAY)) 672 return (true); 673 if (nhop_get_rtflags(nh) & RTF_STATIC) 674 return (true); 675 } 676 break; 677 case RTM_DELETE: 678 nh = rc->rc_nh_old; 679 if (!NH_IS_NHGRP(nh)) { 680 if (!(nh->nh_flags & NHF_GATEWAY)) 681 return (true); 682 if (nhop_get_rtflags(nh) & RTF_STATIC) 683 return (true); 684 } 685 break; 686 } 687 688 return (false); 689 } 690 691 static bool 692 apply_rtable_changes(struct fib_data *fd) 693 { 694 enum flm_op_result result; 695 struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; 696 697 result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data); 698 699 if (result == FLM_SUCCESS) { 700 sync_rib_gen(fd); 701 for (int i = 0; i < q->count; i++) 702 if (q->entries[i].nh_old) 703 fib_unref_nhop(fd, q->entries[i].nh_old); 704 q->count = 0; 705 } 706 fd->fd_batch = false; 707 708 return (result == FLM_SUCCESS); 709 } 710 711 static bool 712 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc) 713 { 714 int plen = 0; 715 716 switch (fd->fd_family) { 717 #ifdef INET 718 case AF_INET: 719 rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid); 720 break; 721 #endif 722 #ifdef INET6 723 case AF_INET6: 724 rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid); 725 break; 726 #endif 727 } 728 729 ce->plen = plen; 730 ce->nh_old = rc->rc_nh_old; 731 ce->nh_new = rc->rc_nh_new; 732 if (ce->nh_new != NULL) { 733 if (fib_ref_nhop(fd, ce->nh_new) == 0) 734 return (false); 735 } 736 737 return (true); 738 } 739 740 static bool 741 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc) 742 { 743 struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; 744 745 if (q->count >= q->size) { 746 uint32_t q_size; 747 748 if (q->size == 0) 749 q_size = 256; /* ~18k memory */ 750 else 751 q_size = q->size * 2; 752 753 size_t size = q_size * sizeof(struct fib_change_entry); 754 void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO); 755 if (a == NULL) { 756 FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements", 757 q_size); 758 return (false); 759 } 760 q->entries = a; 761 q->size = q_size; 762 } 763 764 return (fill_change_entry(fd, &q->entries[q->count++], rc)); 765 } 766 767 /* 768 * Rib subscription handler. Checks if the algorithm is ready to 769 * receive updates, handles nexthop refcounting and passes change 770 * data to the algorithm callback. 771 */ 772 static void 773 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, 774 void *_data) 775 { 776 struct fib_data *fd = (struct fib_data *)_data; 777 enum flm_op_result result; 778 779 RIB_WLOCK_ASSERT(rnh); 780 781 /* 782 * There is a small gap between subscribing for route changes 783 * and initiating rtable dump. Avoid receiving route changes 784 * prior to finishing rtable dump by checking `init_done`. 785 */ 786 if (!fd->init_done) 787 return; 788 789 bool immediate_sync = need_immediate_sync(fd, rc); 790 791 /* Consider scheduling algorithm re-evaluation */ 792 update_algo_state(fd); 793 794 /* 795 * If algo requested rebuild, stop sending updates by default. 796 * This simplifies nexthop refcount handling logic. 797 */ 798 if (fd->fd_need_rebuild) { 799 if (immediate_sync) 800 rebuild_fd(fd, "rtable change type enforced sync"); 801 return; 802 } 803 804 /* 805 * Algo requested updates to be delivered in batches. 806 * Add the current change to the queue and return. 807 */ 808 if (fd->fd_batch) { 809 if (immediate_sync) { 810 if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd)) 811 rebuild_fd(fd, "batch sync failed"); 812 } else { 813 if (!queue_rtable_change(fd, rc)) 814 schedule_fd_rebuild(fd, "batch queue failed"); 815 } 816 return; 817 } 818 819 /* 820 * Maintain guarantee that every nexthop returned by the dataplane 821 * lookup has > 0 refcount, so can be safely referenced within current 822 * epoch. 823 */ 824 if (rc->rc_nh_new != NULL) { 825 if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { 826 /* ran out of indexes */ 827 schedule_fd_rebuild(fd, "ran out of nhop indexes"); 828 return; 829 } 830 } 831 832 result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); 833 834 switch (result) { 835 case FLM_SUCCESS: 836 sync_rib_gen(fd); 837 /* Unref old nexthop on success */ 838 if (rc->rc_nh_old != NULL) 839 fib_unref_nhop(fd, rc->rc_nh_old); 840 break; 841 case FLM_BATCH: 842 843 /* 844 * Algo asks to batch the changes. 845 */ 846 if (queue_rtable_change(fd, rc)) { 847 if (!immediate_sync) { 848 fd->fd_batch = true; 849 mark_diverge_time(fd); 850 update_rebuild_delay(fd, FDA_BATCH); 851 break; 852 } 853 if (apply_rtable_changes(fd)) 854 break; 855 } 856 FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild"); 857 858 case FLM_REBUILD: 859 860 /* 861 * Algo is not able to apply the update. 862 * Schedule algo rebuild. 863 */ 864 if (!immediate_sync) { 865 mark_diverge_time(fd); 866 schedule_fd_rebuild(fd, "algo requested rebuild"); 867 break; 868 } 869 870 FD_PRINTF(LOG_INFO, fd, "running sync rebuild"); 871 rebuild_fd(fd, "rtable change type enforced sync"); 872 break; 873 case FLM_ERROR: 874 875 /* 876 * Algo reported a non-recoverable error. 877 * Record the error and schedule rebuild, which will 878 * trigger best algo selection. 879 */ 880 FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); 881 if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) 882 FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); 883 schedule_fd_rebuild(fd, "algo reported non-recoverable error"); 884 } 885 } 886 887 static void 888 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) 889 { 890 891 if (old_fd == NULL) { 892 // TODO: read from rtable 893 fd->number_nhops = 16; 894 return; 895 } 896 897 if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) 898 fd->number_nhops = 2 * old_fd->number_nhops; 899 else 900 fd->number_nhops = old_fd->number_nhops; 901 } 902 903 struct walk_cbdata { 904 struct fib_data *fd; 905 flm_dump_t *func; 906 enum flm_op_result result; 907 }; 908 909 /* 910 * Handler called after all rtenties have been dumped. 911 * Performs post-dump framework checks and calls 912 * algo:flm_dump_end_cb(). 913 * 914 * Updates walk_cbdata result. 915 */ 916 static void 917 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) 918 { 919 struct walk_cbdata *w = (struct walk_cbdata *)_data; 920 struct fib_data *fd = w->fd; 921 922 RIB_WLOCK_ASSERT(w->fd->fd_rh); 923 924 if (rnh->rib_dying) { 925 w->result = FLM_ERROR; 926 return; 927 } 928 929 if (fd->hit_nhops) { 930 FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", 931 fd->nh_ref_table->count); 932 if (w->result == FLM_SUCCESS) 933 w->result = FLM_REBUILD; 934 return; 935 } 936 937 if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) 938 return; 939 940 /* Post-dump hook, dump successful */ 941 w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); 942 943 if (w->result == FLM_SUCCESS) { 944 /* Mark init as done to allow routing updates */ 945 fd->init_done = 1; 946 } 947 } 948 949 /* 950 * Callback for each entry in rib. 951 * Calls algo:flm_dump_rib_item_cb func as a part of initial 952 * route table synchronisation. 953 */ 954 static int 955 sync_algo_cb(struct rtentry *rt, void *_data) 956 { 957 struct walk_cbdata *w = (struct walk_cbdata *)_data; 958 959 RIB_WLOCK_ASSERT(w->fd->fd_rh); 960 961 if (w->result == FLM_SUCCESS && w->func) { 962 963 /* 964 * Reference nexthops to maintain guarantee that 965 * each nexthop returned by datapath has > 0 references 966 * and can be safely referenced within current epoch. 967 */ 968 struct nhop_object *nh = rt_get_raw_nhop(rt); 969 if (fib_ref_nhop(w->fd, nh) != 0) 970 w->result = w->func(rt, w->fd->fd_algo_data); 971 else 972 w->result = FLM_REBUILD; 973 } 974 975 return (0); 976 } 977 978 /* 979 * Dump all routing table state to the algo instance. 980 */ 981 static enum flm_op_result 982 sync_algo(struct fib_data *fd) 983 { 984 struct walk_cbdata w = { 985 .fd = fd, 986 .func = fd->fd_flm->flm_dump_rib_item_cb, 987 .result = FLM_SUCCESS, 988 }; 989 990 rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w); 991 992 FD_PRINTF(LOG_INFO, fd, 993 "initial dump completed (rtable version: %d), result: %s", 994 fd->fd_rh->rnh_gen, print_op_result(w.result)); 995 996 return (w.result); 997 } 998 999 /* 1000 * Schedules epoch-backed @fd instance deletion. 1001 * * Unlinks @fd from the list of active algo instances. 1002 * * Removes rib subscription. 1003 * * Stops callout. 1004 * * Schedules actual deletion. 1005 * 1006 * Assume @fd is already unlinked from the datapath. 1007 */ 1008 static int 1009 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) 1010 { 1011 bool is_dead; 1012 1013 NET_EPOCH_ASSERT(); 1014 RIB_WLOCK_ASSERT(fd->fd_rh); 1015 1016 FIB_MOD_LOCK(); 1017 is_dead = fd->fd_dead; 1018 if (!is_dead) 1019 fd->fd_dead = true; 1020 if (fd->fd_linked) { 1021 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 1022 fd->fd_linked = false; 1023 } 1024 FIB_MOD_UNLOCK(); 1025 if (is_dead) 1026 return (0); 1027 1028 FD_PRINTF(LOG_INFO, fd, "DETACH"); 1029 1030 if (fd->fd_rs != NULL) 1031 rib_unsubscribe_locked(fd->fd_rs); 1032 1033 /* 1034 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls 1035 * will be executed, hence no _new_ callout schedules will happen. 1036 */ 1037 callout_stop(&fd->fd_callout); 1038 1039 fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx); 1040 1041 return (0); 1042 } 1043 1044 /* 1045 * Wipe all fd instances from the list matching rib specified by @rh. 1046 * If @keep_first is set, remove all but the first record. 1047 */ 1048 static void 1049 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) 1050 { 1051 struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); 1052 struct fib_data *fd, *fd_tmp; 1053 struct epoch_tracker et; 1054 1055 FIB_MOD_LOCK(); 1056 TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { 1057 if (fd->fd_rh == rh) { 1058 if (keep_first) { 1059 keep_first = false; 1060 continue; 1061 } 1062 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 1063 fd->fd_linked = false; 1064 TAILQ_INSERT_TAIL(&tmp_head, fd, entries); 1065 } 1066 } 1067 FIB_MOD_UNLOCK(); 1068 1069 /* Pass 2: remove each entry */ 1070 NET_EPOCH_ENTER(et); 1071 TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { 1072 if (!in_callout) 1073 RIB_WLOCK(fd->fd_rh); 1074 schedule_destroy_fd_instance(fd, in_callout); 1075 if (!in_callout) 1076 RIB_WUNLOCK(fd->fd_rh); 1077 } 1078 NET_EPOCH_EXIT(et); 1079 } 1080 1081 void 1082 fib_destroy_rib(struct rib_head *rh) 1083 { 1084 1085 /* 1086 * rnh has `is_dying` flag set, so setup of new fd's will fail at 1087 * sync_algo() stage, preventing new entries to be added to the list 1088 * of active algos. Remove all existing entries for the particular rib. 1089 */ 1090 fib_cleanup_algo(rh, false, false); 1091 } 1092 1093 /* 1094 * Finalises fd destruction by freeing all fd resources. 1095 */ 1096 static void 1097 destroy_fd_instance(struct fib_data *fd) 1098 { 1099 1100 FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); 1101 1102 /* Call destroy callback first */ 1103 if (fd->fd_algo_data != NULL) 1104 fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); 1105 1106 /* Nhop table */ 1107 if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { 1108 for (int i = 0; i < fd->number_nhops; i++) { 1109 if (!is_idx_free(fd, i)) { 1110 FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p", 1111 i, fd->nh_idx[i]); 1112 nhop_free_any(fd->nh_idx[i]); 1113 } 1114 } 1115 free(fd->nh_idx, M_RTABLE); 1116 } 1117 if (fd->nh_ref_table != NULL) 1118 free(fd->nh_ref_table, M_RTABLE); 1119 1120 if (fd->fd_ss.fd_change_queue.entries != NULL) 1121 free(fd->fd_ss.fd_change_queue.entries, M_TEMP); 1122 1123 fib_unref_algo(fd->fd_flm); 1124 1125 free(fd, M_RTABLE); 1126 } 1127 1128 /* 1129 * Epoch callback indicating fd is safe to destroy 1130 */ 1131 static void 1132 destroy_fd_instance_epoch(epoch_context_t ctx) 1133 { 1134 struct fib_data *fd; 1135 1136 fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); 1137 1138 destroy_fd_instance(fd); 1139 } 1140 1141 /* 1142 * Tries to setup fd instance. 1143 * - Allocates fd/nhop table 1144 * - Runs algo:flm_init_cb algo init 1145 * - Subscribes fd to the rib 1146 * - Runs rtable dump 1147 * - Adds instance to the list of active instances. 1148 * 1149 * Returns: operation result. Fills in @pfd with resulting fd on success. 1150 * 1151 */ 1152 static enum flm_op_result 1153 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 1154 struct fib_data *old_fd, struct fib_data **pfd) 1155 { 1156 struct fib_data *fd; 1157 size_t size; 1158 enum flm_op_result result; 1159 1160 /* Allocate */ 1161 fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); 1162 if (fd == NULL) { 1163 *pfd = NULL; 1164 RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure"); 1165 return (FLM_REBUILD); 1166 } 1167 *pfd = fd; 1168 1169 estimate_nhop_scale(old_fd, fd); 1170 1171 fd->fd_rh = rh; 1172 fd->fd_family = rh->rib_family; 1173 fd->fd_fibnum = rh->rib_fibnum; 1174 callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0); 1175 fd->fd_vnet = curvnet; 1176 fd->fd_flm = flm; 1177 1178 FIB_MOD_LOCK(); 1179 flm->flm_refcount++; 1180 fd->fd_gen = ++fib_gen; 1181 FIB_MOD_UNLOCK(); 1182 1183 FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd); 1184 1185 /* Allocate nhidx -> nhop_ptr table */ 1186 size = fd->number_nhops * sizeof(void *); 1187 fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 1188 if (fd->nh_idx == NULL) { 1189 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); 1190 return (FLM_REBUILD); 1191 } 1192 1193 /* Allocate nhop index refcount table */ 1194 size = sizeof(struct nhop_ref_table); 1195 size += fd->number_nhops * sizeof(uint32_t); 1196 fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 1197 if (fd->nh_ref_table == NULL) { 1198 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); 1199 return (FLM_REBUILD); 1200 } 1201 FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); 1202 1203 /* Okay, we're ready for algo init */ 1204 void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; 1205 result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); 1206 if (result != FLM_SUCCESS) { 1207 FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name); 1208 return (result); 1209 } 1210 1211 /* Try to subscribe */ 1212 if (flm->flm_change_rib_item_cb != NULL) { 1213 fd->fd_rs = rib_subscribe_locked(fd->fd_rh, 1214 handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE); 1215 if (fd->fd_rs == NULL) { 1216 FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes"); 1217 return (FLM_REBUILD); 1218 } 1219 } 1220 1221 /* Dump */ 1222 result = sync_algo(fd); 1223 if (result != FLM_SUCCESS) { 1224 FD_PRINTF(LOG_INFO, fd, "rib sync failed"); 1225 return (result); 1226 } 1227 FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); 1228 1229 FIB_MOD_LOCK(); 1230 /* 1231 * Insert fd in the beginning of a list, to maintain invariant 1232 * that first matching entry for the AF/fib is always the active 1233 * one. 1234 */ 1235 TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); 1236 fd->fd_linked = true; 1237 FIB_MOD_UNLOCK(); 1238 1239 return (FLM_SUCCESS); 1240 } 1241 1242 /* 1243 * Sets up algo @flm for table @rh and links it to the datapath. 1244 * 1245 */ 1246 static enum flm_op_result 1247 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 1248 struct fib_data *orig_fd, struct fib_data **pfd, bool attach) 1249 { 1250 struct fib_data *prev_fd, *new_fd; 1251 enum flm_op_result result; 1252 1253 NET_EPOCH_ASSERT(); 1254 RIB_WLOCK_ASSERT(rh); 1255 1256 prev_fd = orig_fd; 1257 new_fd = NULL; 1258 for (int i = 0; i < FIB_MAX_TRIES; i++) { 1259 result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); 1260 1261 if ((result == FLM_SUCCESS) && attach) { 1262 if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp)) 1263 sync_rib_gen(new_fd); 1264 else 1265 result = FLM_REBUILD; 1266 } 1267 1268 if ((prev_fd != NULL) && (prev_fd != orig_fd)) { 1269 schedule_destroy_fd_instance(prev_fd, false); 1270 prev_fd = NULL; 1271 } 1272 1273 RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, 1274 print_op_result(result)); 1275 1276 if (result == FLM_REBUILD) { 1277 prev_fd = new_fd; 1278 new_fd = NULL; 1279 continue; 1280 } 1281 1282 break; 1283 } 1284 1285 if (result != FLM_SUCCESS) { 1286 RH_PRINTF(LOG_WARNING, rh, 1287 "%s algo instance setup failed, failures=%d", flm->flm_name, 1288 orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0); 1289 /* update failure count */ 1290 FIB_MOD_LOCK(); 1291 if (orig_fd != NULL) 1292 orig_fd->fd_failed_rebuilds++; 1293 FIB_MOD_UNLOCK(); 1294 1295 /* Ban algo on non-recoverable error */ 1296 if (result == FLM_ERROR) 1297 flm_error_add(flm, rh->rib_fibnum); 1298 1299 if ((prev_fd != NULL) && (prev_fd != orig_fd)) 1300 schedule_destroy_fd_instance(prev_fd, false); 1301 if (new_fd != NULL) { 1302 schedule_destroy_fd_instance(new_fd, false); 1303 new_fd = NULL; 1304 } 1305 } 1306 1307 *pfd = new_fd; 1308 return (result); 1309 } 1310 1311 /* 1312 * Tries to sync algo with the current rtable state, either 1313 * by executing batch update or rebuilding. 1314 * Returns true on success. 1315 */ 1316 static bool 1317 execute_callout_action(struct fib_data *fd) 1318 { 1319 enum fib_callout_action action = fd->fd_callout_action; 1320 struct fib_lookup_module *flm_new = NULL; 1321 bool result = true; 1322 1323 NET_EPOCH_ASSERT(); 1324 RIB_WLOCK_ASSERT(fd->fd_rh); 1325 1326 fd->fd_need_rebuild = false; 1327 fd->fd_batch = false; 1328 fd->fd_num_changes = 0; 1329 1330 /* First, check if we're still OK to use this algo */ 1331 if (!is_algo_fixed(fd->fd_rh)) 1332 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1333 if (flm_new != NULL) 1334 action = FDA_REBUILD; 1335 1336 if (action == FDA_BATCH) { 1337 /* Try to sync */ 1338 if (!apply_rtable_changes(fd)) 1339 action = FDA_REBUILD; 1340 } 1341 1342 if (action == FDA_REBUILD) 1343 result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); 1344 if (flm_new != NULL) 1345 fib_unref_algo(flm_new); 1346 1347 return (result); 1348 } 1349 1350 /* 1351 * Callout for all scheduled fd-related work. 1352 * - Checks if the current algo is still the best algo 1353 * - Synchronises algo instance to the rtable (batch usecase) 1354 * - Creates a new instance of an algo for af/fib if desired. 1355 */ 1356 static void 1357 handle_fd_callout(void *_data) 1358 { 1359 struct fib_data *fd = (struct fib_data *)_data; 1360 struct epoch_tracker et; 1361 1362 FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action); 1363 1364 NET_EPOCH_ENTER(et); 1365 CURVNET_SET(fd->fd_vnet); 1366 execute_callout_action(fd); 1367 CURVNET_RESTORE(); 1368 NET_EPOCH_EXIT(et); 1369 } 1370 1371 /* 1372 * Tries to create new algo instance based on @fd data. 1373 * Returns true on success. 1374 */ 1375 static bool 1376 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new) 1377 { 1378 struct fib_data *fd_new, *fd_tmp = NULL; 1379 bool result; 1380 1381 if (flm_new == fd->fd_flm) 1382 fd_tmp = fd; 1383 else 1384 FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name); 1385 1386 result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); 1387 if (result != FLM_SUCCESS) { 1388 FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); 1389 return (false); 1390 } 1391 FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); 1392 1393 /* Remove old instance */ 1394 schedule_destroy_fd_instance(fd, true); 1395 1396 return (true); 1397 } 1398 1399 static bool 1400 rebuild_fd(struct fib_data *fd, const char *reason) 1401 { 1402 struct fib_lookup_module *flm_new = NULL; 1403 bool result; 1404 1405 if (!is_algo_fixed(fd->fd_rh)) 1406 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1407 1408 FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason); 1409 result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); 1410 if (flm_new != NULL) 1411 fib_unref_algo(flm_new); 1412 1413 if (!result) { 1414 FD_PRINTF(LOG_ERR, fd, "sync rebuild failed"); 1415 schedule_fd_rebuild(fd, "sync rebuild failed"); 1416 } 1417 1418 return (result); 1419 } 1420 1421 /* 1422 * Finds algo by name/family. 1423 * Returns referenced algo or NULL. 1424 */ 1425 static struct fib_lookup_module * 1426 fib_find_algo(const char *algo_name, int family) 1427 { 1428 struct fib_lookup_module *flm; 1429 1430 FIB_MOD_LOCK(); 1431 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1432 if ((strcmp(flm->flm_name, algo_name) == 0) && 1433 (family == flm->flm_family)) { 1434 flm->flm_refcount++; 1435 FIB_MOD_UNLOCK(); 1436 return (flm); 1437 } 1438 } 1439 FIB_MOD_UNLOCK(); 1440 1441 return (NULL); 1442 } 1443 1444 static void 1445 fib_unref_algo(struct fib_lookup_module *flm) 1446 { 1447 1448 FIB_MOD_LOCK(); 1449 flm->flm_refcount--; 1450 FIB_MOD_UNLOCK(); 1451 } 1452 1453 static int 1454 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) 1455 { 1456 struct fib_lookup_module *flm = NULL; 1457 struct fib_data *fd = NULL; 1458 char old_algo_name[32], algo_name[32]; 1459 struct rib_head *rh = NULL; 1460 enum flm_op_result result; 1461 struct epoch_tracker et; 1462 int error; 1463 1464 /* Fetch current algo/rib for af/family */ 1465 FIB_MOD_LOCK(); 1466 TAILQ_FOREACH(fd, &V_fib_data_list, entries) { 1467 if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) 1468 break; 1469 } 1470 if (fd == NULL) { 1471 FIB_MOD_UNLOCK(); 1472 return (ENOENT); 1473 } 1474 rh = fd->fd_rh; 1475 strlcpy(old_algo_name, fd->fd_flm->flm_name, 1476 sizeof(old_algo_name)); 1477 FIB_MOD_UNLOCK(); 1478 1479 strlcpy(algo_name, old_algo_name, sizeof(algo_name)); 1480 error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); 1481 if (error != 0 || req->newptr == NULL) 1482 return (error); 1483 1484 if (strcmp(algo_name, old_algo_name) == 0) 1485 return (0); 1486 1487 /* New algorithm name is different */ 1488 flm = fib_find_algo(algo_name, family); 1489 if (flm == NULL) { 1490 RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); 1491 return (ESRCH); 1492 } 1493 1494 fd = NULL; 1495 NET_EPOCH_ENTER(et); 1496 RIB_WLOCK(rh); 1497 result = setup_fd_instance(flm, rh, NULL, &fd, true); 1498 RIB_WUNLOCK(rh); 1499 NET_EPOCH_EXIT(et); 1500 fib_unref_algo(flm); 1501 if (result != FLM_SUCCESS) 1502 return (EINVAL); 1503 1504 /* Disable automated jumping between algos */ 1505 FIB_MOD_LOCK(); 1506 set_algo_fixed(rh); 1507 FIB_MOD_UNLOCK(); 1508 /* Remove old instance(s) */ 1509 fib_cleanup_algo(rh, true, false); 1510 1511 /* Drain cb so user can unload the module after userret if so desired */ 1512 epoch_drain_callbacks(net_epoch_preempt); 1513 1514 return (0); 1515 } 1516 1517 #ifdef INET 1518 static int 1519 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) 1520 { 1521 1522 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req)); 1523 } 1524 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, 1525 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1526 set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); 1527 #endif 1528 1529 #ifdef INET6 1530 static int 1531 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) 1532 { 1533 1534 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); 1535 } 1536 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, 1537 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1538 set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); 1539 #endif 1540 1541 static struct nhop_object * 1542 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) 1543 { 1544 return (NULL); 1545 } 1546 1547 static void 1548 destroy_fdh_epoch(epoch_context_t ctx) 1549 { 1550 struct fib_dp_header *fdh; 1551 1552 fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); 1553 free(fdh, M_RTABLE); 1554 } 1555 1556 static struct fib_dp_header * 1557 alloc_fib_dp_array(uint32_t num_tables, bool waitok) 1558 { 1559 size_t sz; 1560 struct fib_dp_header *fdh; 1561 1562 sz = sizeof(struct fib_dp_header); 1563 sz += sizeof(struct fib_dp) * num_tables; 1564 fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); 1565 if (fdh != NULL) { 1566 fdh->fdh_num_tables = num_tables; 1567 /* 1568 * Set dummy lookup function ptr always returning NULL, so 1569 * we can delay algo init. 1570 */ 1571 for (uint32_t i = 0; i < num_tables; i++) 1572 fdh->fdh_idx[i].f = dummy_lookup; 1573 } 1574 return (fdh); 1575 } 1576 1577 static struct fib_dp_header * 1578 get_fib_dp_header(struct fib_dp *dp) 1579 { 1580 1581 return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); 1582 } 1583 1584 /* 1585 * Replace per-family index pool @pdp with a new one which 1586 * contains updated callback/algo data from @fd. 1587 * Returns true on success. 1588 */ 1589 static bool 1590 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp) 1591 { 1592 struct fib_dp_header *new_fdh, *old_fdh; 1593 1594 NET_EPOCH_ASSERT(); 1595 1596 FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", 1597 curvnet, dp->f, dp->arg); 1598 1599 FIB_MOD_LOCK(); 1600 old_fdh = get_fib_dp_header(*pdp); 1601 1602 if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) { 1603 /* 1604 * Function is the same, data pointer needs update. 1605 * Perform in-line replace without reallocation. 1606 */ 1607 old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg; 1608 FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh); 1609 FIB_MOD_UNLOCK(); 1610 return (true); 1611 } 1612 1613 new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); 1614 FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); 1615 if (new_fdh == NULL) { 1616 FIB_MOD_UNLOCK(); 1617 FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); 1618 return (false); 1619 } 1620 1621 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1622 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1623 /* Update relevant data structure for @fd */ 1624 new_fdh->fdh_idx[fd->fd_fibnum] = *dp; 1625 1626 /* Ensure memcpy() writes have completed */ 1627 atomic_thread_fence_rel(); 1628 /* Set new datapath pointer */ 1629 *pdp = &new_fdh->fdh_idx[0]; 1630 FIB_MOD_UNLOCK(); 1631 FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); 1632 1633 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 1634 1635 return (true); 1636 } 1637 1638 static struct fib_dp ** 1639 get_family_dp_ptr(int family) 1640 { 1641 switch (family) { 1642 #ifdef INET 1643 case AF_INET: 1644 return (&V_inet_dp); 1645 #endif 1646 #ifdef INET6 1647 case AF_INET6: 1648 return (&V_inet6_dp); 1649 #endif 1650 } 1651 return (NULL); 1652 } 1653 1654 /* 1655 * Make datapath use fib instance @fd 1656 */ 1657 bool 1658 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp) 1659 { 1660 struct fib_dp **pdp; 1661 1662 pdp = get_family_dp_ptr(fd->fd_family); 1663 return (replace_rtables_family(pdp, fd, dp)); 1664 } 1665 1666 /* 1667 * Grow datapath pointers array. 1668 * Called from sysctl handler on growing number of routing tables. 1669 */ 1670 static void 1671 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) 1672 { 1673 struct fib_dp_header *new_fdh, *old_fdh = NULL; 1674 1675 new_fdh = alloc_fib_dp_array(new_num_tables, true); 1676 1677 FIB_MOD_LOCK(); 1678 if (*pdp != NULL) { 1679 old_fdh = get_fib_dp_header(*pdp); 1680 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1681 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1682 } 1683 1684 /* Wait till all writes completed */ 1685 atomic_thread_fence_rel(); 1686 1687 *pdp = &new_fdh->fdh_idx[0]; 1688 FIB_MOD_UNLOCK(); 1689 1690 if (old_fdh != NULL) 1691 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 1692 } 1693 1694 /* 1695 * Grows per-AF arrays of datapath pointers for each supported family. 1696 * Called from fibs resize sysctl handler. 1697 */ 1698 void 1699 fib_grow_rtables(uint32_t new_num_tables) 1700 { 1701 1702 #ifdef INET 1703 grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); 1704 #endif 1705 #ifdef INET6 1706 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); 1707 #endif 1708 } 1709 1710 void 1711 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) 1712 { 1713 1714 bzero(rinfo, sizeof(struct rib_rtable_info)); 1715 rinfo->num_prefixes = rh->rnh_prefixes; 1716 rinfo->num_nhops = nhops_get_count(rh); 1717 #ifdef ROUTE_MPATH 1718 rinfo->num_nhgrp = nhgrp_get_count(rh); 1719 #endif 1720 } 1721 1722 /* 1723 * Updates pointer to the algo data for the @fd. 1724 */ 1725 void 1726 fib_set_algo_ptr(struct fib_data *fd, void *algo_data) 1727 { 1728 RIB_WLOCK_ASSERT(fd->fd_rh); 1729 1730 fd->fd_algo_data = algo_data; 1731 } 1732 1733 /* 1734 * Calls @callback with @ctx after the end of a current epoch. 1735 */ 1736 void 1737 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx) 1738 { 1739 epoch_call(net_epoch_preempt, callback, ctx); 1740 } 1741 1742 /* 1743 * Accessor to get rib instance @fd is attached to. 1744 */ 1745 struct rib_head * 1746 fib_get_rh(struct fib_data *fd) 1747 { 1748 1749 return (fd->fd_rh); 1750 } 1751 1752 /* 1753 * Accessor to export idx->nhop array 1754 */ 1755 struct nhop_object ** 1756 fib_get_nhop_array(struct fib_data *fd) 1757 { 1758 1759 return (fd->nh_idx); 1760 } 1761 1762 static uint32_t 1763 get_nhop_idx(struct nhop_object *nh) 1764 { 1765 #ifdef ROUTE_MPATH 1766 if (NH_IS_NHGRP(nh)) 1767 return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1); 1768 else 1769 return (nhop_get_idx(nh) * 2); 1770 #else 1771 return (nhop_get_idx(nh)); 1772 #endif 1773 } 1774 1775 uint32_t 1776 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) 1777 { 1778 1779 return (get_nhop_idx(nh)); 1780 } 1781 1782 static bool 1783 is_idx_free(struct fib_data *fd, uint32_t index) 1784 { 1785 1786 return (fd->nh_ref_table->refcnt[index] == 0); 1787 } 1788 1789 static uint32_t 1790 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) 1791 { 1792 uint32_t idx = get_nhop_idx(nh); 1793 1794 if (idx >= fd->number_nhops) { 1795 fd->hit_nhops = 1; 1796 return (0); 1797 } 1798 1799 if (is_idx_free(fd, idx)) { 1800 nhop_ref_any(nh); 1801 fd->nh_idx[idx] = nh; 1802 fd->nh_ref_table->count++; 1803 FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); 1804 } 1805 fd->nh_ref_table->refcnt[idx]++; 1806 1807 return (idx); 1808 } 1809 1810 struct nhop_release_data { 1811 struct nhop_object *nh; 1812 struct epoch_context ctx; 1813 }; 1814 1815 static void 1816 release_nhop_epoch(epoch_context_t ctx) 1817 { 1818 struct nhop_release_data *nrd; 1819 1820 nrd = __containerof(ctx, struct nhop_release_data, ctx); 1821 nhop_free_any(nrd->nh); 1822 free(nrd, M_TEMP); 1823 } 1824 1825 /* 1826 * Delays nexthop refcount release. 1827 * Datapath may have the datastructures not updated yet, so the old 1828 * nexthop may still be returned till the end of current epoch. Delay 1829 * refcount removal, as we may be removing the last instance, which will 1830 * trigger nexthop deletion, rendering returned nexthop invalid. 1831 */ 1832 static void 1833 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) 1834 { 1835 struct nhop_release_data *nrd; 1836 1837 nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); 1838 if (nrd != NULL) { 1839 nrd->nh = nh; 1840 fib_epoch_call(release_nhop_epoch, &nrd->ctx); 1841 } else { 1842 /* 1843 * Unable to allocate memory. Leak nexthop to maintain guarantee 1844 * that each nhop can be referenced. 1845 */ 1846 FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); 1847 } 1848 } 1849 1850 static void 1851 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) 1852 { 1853 uint32_t idx = get_nhop_idx(nh); 1854 1855 KASSERT((idx < fd->number_nhops), ("invalid nhop index")); 1856 KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); 1857 1858 fd->nh_ref_table->refcnt[idx]--; 1859 if (fd->nh_ref_table->refcnt[idx] == 0) { 1860 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); 1861 fib_schedule_release_nhop(fd, fd->nh_idx[idx]); 1862 } 1863 } 1864 1865 static void 1866 set_algo_fixed(struct rib_head *rh) 1867 { 1868 switch (rh->rib_family) { 1869 #ifdef INET 1870 case AF_INET: 1871 V_algo_fixed_inet = true; 1872 break; 1873 #endif 1874 #ifdef INET6 1875 case AF_INET6: 1876 V_algo_fixed_inet6 = true; 1877 break; 1878 #endif 1879 } 1880 } 1881 1882 static bool 1883 is_algo_fixed(struct rib_head *rh) 1884 { 1885 1886 switch (rh->rib_family) { 1887 #ifdef INET 1888 case AF_INET: 1889 return (V_algo_fixed_inet); 1890 #endif 1891 #ifdef INET6 1892 case AF_INET6: 1893 return (V_algo_fixed_inet6); 1894 #endif 1895 } 1896 return (false); 1897 } 1898 1899 /* 1900 * Runs the check on what would be the best algo for rib @rh, assuming 1901 * that the current algo is the one specified by @orig_flm. Note that 1902 * it can be NULL for initial selection. 1903 * 1904 * Returns referenced new algo or NULL if the current one is the best. 1905 */ 1906 static struct fib_lookup_module * 1907 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) 1908 { 1909 uint8_t preference, curr_preference = 0, best_preference = 0; 1910 struct fib_lookup_module *flm, *best_flm = NULL; 1911 struct rib_rtable_info rinfo; 1912 int candidate_algos = 0; 1913 1914 fib_get_rtable_info(rh, &rinfo); 1915 1916 FIB_MOD_LOCK(); 1917 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1918 if (flm->flm_family != rh->rib_family) 1919 continue; 1920 candidate_algos++; 1921 preference = flm->flm_get_pref(&rinfo); 1922 if (preference > best_preference) { 1923 if (!flm_error_check(flm, rh->rib_fibnum)) { 1924 best_preference = preference; 1925 best_flm = flm; 1926 } 1927 } 1928 if (flm == orig_flm) 1929 curr_preference = preference; 1930 } 1931 if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference)) 1932 best_flm->flm_refcount++; 1933 else 1934 best_flm = NULL; 1935 FIB_MOD_UNLOCK(); 1936 1937 RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", 1938 candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, 1939 best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"), 1940 best_preference); 1941 1942 return (best_flm); 1943 } 1944 1945 /* 1946 * Called when new route table is created. 1947 * Selects, allocates and attaches fib algo for the table. 1948 */ 1949 static bool 1950 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp) 1951 { 1952 struct fib_lookup_module *flm; 1953 struct fib_data *fd = NULL; 1954 enum flm_op_result result; 1955 struct epoch_tracker et; 1956 1957 flm = fib_check_best_algo(rh, NULL); 1958 if (flm == NULL) { 1959 RH_PRINTF(LOG_CRIT, rh, "no algo selected"); 1960 return (false); 1961 } 1962 RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); 1963 1964 NET_EPOCH_ENTER(et); 1965 RIB_WLOCK(rh); 1966 result = setup_fd_instance(flm, rh, NULL, &fd, false); 1967 RIB_WUNLOCK(rh); 1968 NET_EPOCH_EXIT(et); 1969 1970 RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); 1971 if (result == FLM_SUCCESS) 1972 *dp = fd->fd_dp; 1973 else 1974 RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); 1975 1976 fib_unref_algo(flm); 1977 1978 return (result == FLM_SUCCESS); 1979 } 1980 1981 /* 1982 * Sets up fib algo instances for the non-initialized RIBs in the @family. 1983 * Allocates temporary datapath index to amortize datapaint index updates 1984 * with large @num_tables. 1985 */ 1986 void 1987 fib_setup_family(int family, uint32_t num_tables) 1988 { 1989 struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false); 1990 if (new_fdh == NULL) { 1991 ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family)); 1992 return; 1993 } 1994 1995 for (int i = 0; i < num_tables; i++) { 1996 struct rib_head *rh = rt_tables_get_rnh(i, family); 1997 if (rh->rib_algo_init) 1998 continue; 1999 if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i])) 2000 continue; 2001 2002 rh->rib_algo_init = true; 2003 } 2004 2005 FIB_MOD_LOCK(); 2006 struct fib_dp **pdp = get_family_dp_ptr(family); 2007 struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp); 2008 2009 /* Update the items not touched by the new init, from the old data pointer */ 2010 for (int i = 0; i < num_tables; i++) { 2011 if (new_fdh->fdh_idx[i].f == dummy_lookup) 2012 new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i]; 2013 } 2014 2015 /* Ensure all index writes have completed */ 2016 atomic_thread_fence_rel(); 2017 /* Set new datapath pointer */ 2018 *pdp = &new_fdh->fdh_idx[0]; 2019 2020 FIB_MOD_UNLOCK(); 2021 2022 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 2023 } 2024 2025 /* 2026 * Registers fib lookup module within the subsystem. 2027 */ 2028 int 2029 fib_module_register(struct fib_lookup_module *flm) 2030 { 2031 2032 FIB_MOD_LOCK(); 2033 ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name, 2034 print_family(flm->flm_family)); 2035 TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); 2036 FIB_MOD_UNLOCK(); 2037 2038 return (0); 2039 } 2040 2041 /* 2042 * Tries to unregister fib lookup module. 2043 * 2044 * Returns 0 on success, EBUSY if module is still used 2045 * by some of the tables. 2046 */ 2047 int 2048 fib_module_unregister(struct fib_lookup_module *flm) 2049 { 2050 2051 FIB_MOD_LOCK(); 2052 if (flm->flm_refcount > 0) { 2053 FIB_MOD_UNLOCK(); 2054 return (EBUSY); 2055 } 2056 fib_error_clear_flm(flm); 2057 ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name, 2058 print_family(flm->flm_family)); 2059 TAILQ_REMOVE(&all_algo_list, flm, entries); 2060 FIB_MOD_UNLOCK(); 2061 2062 return (0); 2063 } 2064 2065 void 2066 vnet_fib_init(void) 2067 { 2068 2069 TAILQ_INIT(&V_fib_data_list); 2070 } 2071 2072 void 2073 vnet_fib_destroy(void) 2074 { 2075 2076 FIB_MOD_LOCK(); 2077 fib_error_clear(); 2078 FIB_MOD_UNLOCK(); 2079 } 2080