1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2020 Alexander V. Chernikov 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 #include "opt_inet.h" 30 #include "opt_inet6.h" 31 #include "opt_route.h" 32 33 #include <sys/param.h> 34 #include <sys/eventhandler.h> 35 #include <sys/kernel.h> 36 #include <sys/sbuf.h> 37 #include <sys/lock.h> 38 #include <sys/rmlock.h> 39 #include <sys/malloc.h> 40 #include <sys/mbuf.h> 41 #include <sys/module.h> 42 #include <sys/kernel.h> 43 #include <sys/priv.h> 44 #include <sys/proc.h> 45 #include <sys/socket.h> 46 #include <sys/socketvar.h> 47 #include <sys/sysctl.h> 48 #include <sys/syslog.h> 49 #include <sys/queue.h> 50 #include <net/vnet.h> 51 52 #include <net/if.h> 53 #include <net/if_var.h> 54 55 #include <netinet/in.h> 56 #include <netinet/in_var.h> 57 #include <netinet/ip.h> 58 #include <netinet/ip_var.h> 59 #ifdef INET6 60 #include <netinet/ip6.h> 61 #include <netinet6/ip6_var.h> 62 #endif 63 64 #include <net/route.h> 65 #include <net/route/nhop.h> 66 #include <net/route/route_ctl.h> 67 #include <net/route/route_var.h> 68 #include <net/route/fib_algo.h> 69 70 #include <machine/stdarg.h> 71 72 /* 73 * Fib lookup framework. 74 * 75 * This framework enables accelerated longest-prefix-match lookups for the 76 * routing tables by adding the ability to dynamically attach/detach lookup 77 * algorithms implementation to/from the datapath. 78 * 79 * flm - fib lookup modules - implementation of particular lookup algorithm 80 * fd - fib data - instance of an flm bound to specific routing table 81 * 82 * This file provides main framework functionality. 83 * 84 * The following are the features provided by the framework 85 * 86 * 1) nexhops abstraction -> provides transparent referencing, indexing 87 * and efficient idx->ptr mappings for nexthop and nexthop groups. 88 * 2) Routing table synchronisation 89 * 3) dataplane attachment points 90 * 4) automatic algorithm selection based on the provided preference. 91 * 92 * 93 * DATAPATH 94 * For each supported address family, there is a an allocated array of fib_dp 95 * structures, indexed by fib number. Each array entry contains callback function 96 * and its argument. This function will be called with a family-specific lookup key, 97 * scope and provided argument. This array gets re-created every time when new algo 98 * instance gets created. Please take a look at the replace_rtables_family() function 99 * for more details. 100 * 101 */ 102 103 SYSCTL_DECL(_net_route); 104 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 105 "Fib algorithm lookups"); 106 107 /* Algorithm sync policy */ 108 109 /* Time interval to bucket updates */ 110 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50; 111 #define V_update_bucket_time_ms VNET(update_bucket_time_ms) 112 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET, 113 &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate"); 114 115 /* Minimum update rate to delay sync */ 116 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500; 117 #define V_bucket_change_threshold_rate VNET(bucket_change_threshold_rate) 118 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET, 119 &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync"); 120 121 /* Max allowed delay to sync */ 122 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000; 123 #define V_fib_max_sync_delay_ms VNET(fib_max_sync_delay_ms) 124 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET, 125 &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)"); 126 127 128 #ifdef INET6 129 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false; 130 #define V_algo_fixed_inet6 VNET(algo_fixed_inet6) 131 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 132 "IPv6 longest prefix match lookups"); 133 #endif 134 #ifdef INET 135 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false; 136 #define V_algo_fixed_inet VNET(algo_fixed_inet) 137 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 138 "IPv4 longest prefix match lookups"); 139 #endif 140 141 /* Fib instance counter */ 142 static uint32_t fib_gen = 0; 143 144 struct nhop_ref_table { 145 uint32_t count; 146 int32_t refcnt[0]; 147 }; 148 149 enum fib_callout_action { 150 FDA_NONE, /* No callout scheduled */ 151 FDA_REBUILD, /* Asks to rebuild algo instance */ 152 FDA_EVAL, /* Asks to evaluate if the current algo is still be best */ 153 FDA_BATCH, /* Asks to submit batch of updates to the algo */ 154 }; 155 156 struct fib_sync_status { 157 struct timeval diverge_time; /* ts when diverged */ 158 uint32_t num_changes; /* number of changes since sync */ 159 uint32_t bucket_changes; /* num changes within the current bucket */ 160 uint64_t bucket_id; /* 50ms bucket # */ 161 struct fib_change_queue fd_change_queue;/* list of scheduled entries */ 162 }; 163 164 /* 165 * Data structure for the fib lookup instance tied to the particular rib. 166 */ 167 struct fib_data { 168 uint32_t number_nhops; /* current # of nhops */ 169 uint8_t hit_nhops; /* true if out of nhop limit */ 170 uint8_t init_done; /* true if init is competed */ 171 uint32_t fd_dead:1; /* Scheduled for deletion */ 172 uint32_t fd_linked:1; /* true if linked */ 173 uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ 174 uint32_t fd_batch:1; /* true if batched notification scheduled */ 175 uint8_t fd_family; /* family */ 176 uint32_t fd_fibnum; /* fibnum */ 177 uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ 178 uint32_t fd_gen; /* instance gen# */ 179 struct callout fd_callout; /* rebuild callout */ 180 enum fib_callout_action fd_callout_action; /* Callout action to take */ 181 void *fd_algo_data; /* algorithm data */ 182 struct nhop_object **nh_idx; /* nhop idx->ptr array */ 183 struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ 184 struct rib_head *fd_rh; /* RIB table we're attached to */ 185 struct rib_subscription *fd_rs; /* storing table subscription */ 186 struct fib_dp fd_dp; /* fib datapath data */ 187 struct vnet *fd_vnet; /* vnet fib belongs to */ 188 struct epoch_context fd_epoch_ctx; /* epoch context for deletion */ 189 struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ 190 struct fib_sync_status fd_ss; /* State relevant to the rib sync */ 191 uint32_t fd_num_changes; /* number of changes since last callout */ 192 TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ 193 }; 194 195 static bool rebuild_fd(struct fib_data *fd, const char *reason); 196 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new); 197 static void handle_fd_callout(void *_data); 198 static void destroy_fd_instance_epoch(epoch_context_t ctx); 199 static bool is_idx_free(struct fib_data *fd, uint32_t index); 200 static void set_algo_fixed(struct rib_head *rh); 201 static bool is_algo_fixed(struct rib_head *rh); 202 203 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); 204 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); 205 206 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, 207 struct fib_lookup_module *orig_flm); 208 static void fib_unref_algo(struct fib_lookup_module *flm); 209 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum); 210 211 struct mtx fib_mtx; 212 #define FIB_MOD_LOCK() mtx_lock(&fib_mtx) 213 #define FIB_MOD_UNLOCK() mtx_unlock(&fib_mtx) 214 #define FIB_MOD_LOCK_ASSERT() mtx_assert(&fib_mtx, MA_OWNED) 215 216 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF); 217 218 /* Algorithm has to be this percent better than the current to switch */ 219 #define BEST_DIFF_PERCENT (5 * 256 / 100) 220 /* Schedule algo re-evaluation X seconds after a change */ 221 #define ALGO_EVAL_DELAY_MS 30000 222 /* Force algo re-evaluation after X changes */ 223 #define ALGO_EVAL_NUM_ROUTES 100 224 /* Try to setup algorithm X times */ 225 #define FIB_MAX_TRIES 32 226 /* Max amount of supported nexthops */ 227 #define FIB_MAX_NHOPS 262144 228 #define FIB_CALLOUT_DELAY_MS 50 229 230 231 /* Debug */ 232 static int flm_debug_level = LOG_NOTICE; 233 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, 234 &flm_debug_level, 0, "debuglevel"); 235 #define FLM_MAX_DEBUG_LEVEL LOG_DEBUG 236 #ifndef LOG_DEBUG2 237 #define LOG_DEBUG2 8 238 #endif 239 240 #define _PASS_MSG(_l) (flm_debug_level >= (_l)) 241 #define ALGO_PRINTF(_l, _fmt, ...) if (_PASS_MSG(_l)) { \ 242 printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__); \ 243 } 244 #define _ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \ 245 printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\ 246 print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__) 247 #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ 248 printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) 249 #define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ 250 _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ 251 } 252 #define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) 253 #define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ 254 _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ 255 _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__); \ 256 } 257 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2 258 #define FD_PRINTF_LOG_DEBUG2 _FD_PRINTF 259 #else 260 #define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...) 261 #endif 262 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG 263 #define FD_PRINTF_LOG_DEBUG _FD_PRINTF 264 #else 265 #define FD_PRINTF_LOG_DEBUG() 266 #endif 267 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO 268 #define FD_PRINTF_LOG_INFO _FD_PRINTF 269 #else 270 #define FD_PRINTF_LOG_INFO() 271 #endif 272 #define FD_PRINTF_LOG_NOTICE _FD_PRINTF 273 #define FD_PRINTF_LOG_ERR _FD_PRINTF 274 #define FD_PRINTF_LOG_WARNING _FD_PRINTF 275 276 277 /* List of all registered lookup algorithms */ 278 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list); 279 280 /* List of all fib lookup instances in the vnet */ 281 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); 282 #define V_fib_data_list VNET(fib_data_list) 283 284 /* Datastructure for storing non-transient fib lookup module failures */ 285 struct fib_error { 286 int fe_family; 287 uint32_t fe_fibnum; /* failed rtable */ 288 struct fib_lookup_module *fe_flm; /* failed module */ 289 TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ 290 }; 291 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); 292 #define V_fib_error_list VNET(fib_error_list) 293 294 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */ 295 struct fib_dp_header { 296 struct epoch_context fdh_epoch_ctx; 297 uint32_t fdh_num_tables; 298 struct fib_dp fdh_idx[0]; 299 }; 300 301 /* 302 * Tries to add new non-transient algorithm error to the list of 303 * errors. 304 * Returns true on success. 305 */ 306 static bool 307 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) 308 { 309 struct fib_error *fe; 310 311 fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); 312 if (fe == NULL) 313 return (false); 314 fe->fe_flm = flm; 315 fe->fe_family = flm->flm_family; 316 fe->fe_fibnum = fibnum; 317 318 FIB_MOD_LOCK(); 319 /* Avoid duplicates by checking if error already exists first */ 320 if (flm_error_check(flm, fibnum)) { 321 FIB_MOD_UNLOCK(); 322 free(fe, M_TEMP); 323 return (true); 324 } 325 TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); 326 FIB_MOD_UNLOCK(); 327 328 return (true); 329 } 330 331 /* 332 * True if non-transient error has been registered for @flm in @fibnum. 333 */ 334 static bool 335 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) 336 { 337 const struct fib_error *fe; 338 339 TAILQ_FOREACH(fe, &V_fib_error_list, entries) { 340 if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) 341 return (true); 342 } 343 344 return (false); 345 } 346 347 /* 348 * Clear all errors of algo specified by @flm. 349 */ 350 static void 351 fib_error_clear_flm(struct fib_lookup_module *flm) 352 { 353 struct fib_error *fe, *fe_tmp; 354 355 FIB_MOD_LOCK_ASSERT(); 356 357 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 358 if (fe->fe_flm == flm) { 359 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 360 free(fe, M_TEMP); 361 } 362 } 363 } 364 365 /* 366 * Clears all errors in current VNET. 367 */ 368 static void 369 fib_error_clear(void) 370 { 371 struct fib_error *fe, *fe_tmp; 372 373 FIB_MOD_LOCK_ASSERT(); 374 375 TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { 376 TAILQ_REMOVE(&V_fib_error_list, fe, entries); 377 free(fe, M_TEMP); 378 } 379 } 380 381 static const char * 382 print_op_result(enum flm_op_result result) 383 { 384 switch (result) { 385 case FLM_SUCCESS: 386 return "success"; 387 case FLM_REBUILD: 388 return "rebuild"; 389 case FLM_BATCH: 390 return "batch"; 391 case FLM_ERROR: 392 return "error"; 393 } 394 395 return "unknown"; 396 } 397 398 static const char * 399 print_family(int family) 400 { 401 402 if (family == AF_INET) 403 return ("inet"); 404 else if (family == AF_INET6) 405 return ("inet6"); 406 else 407 return ("unknown"); 408 } 409 410 /* 411 * Debug function used by lookup algorithms. 412 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " 413 */ 414 void 415 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) 416 { 417 char buf[128]; 418 va_list ap; 419 420 if (level > flm_debug_level) 421 return; 422 423 va_start(ap, fmt); 424 vsnprintf(buf, sizeof(buf), fmt, ap); 425 va_end(ap); 426 427 _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, 428 fd->fd_gen, func, "%s", buf); 429 } 430 431 /* 432 * Outputs list of algorithms supported by the provided address family. 433 */ 434 static int 435 print_algos_sysctl(struct sysctl_req *req, int family) 436 { 437 struct fib_lookup_module *flm; 438 struct sbuf sbuf; 439 int error, count = 0; 440 441 error = sysctl_wire_old_buffer(req, 0); 442 if (error == 0) { 443 sbuf_new_for_sysctl(&sbuf, NULL, 512, req); 444 TAILQ_FOREACH(flm, &all_algo_list, entries) { 445 if (flm->flm_family == family) { 446 if (count++ > 0) 447 sbuf_cat(&sbuf, ", "); 448 sbuf_cat(&sbuf, flm->flm_name); 449 } 450 } 451 error = sbuf_finish(&sbuf); 452 sbuf_delete(&sbuf); 453 } 454 return (error); 455 } 456 457 #ifdef INET6 458 static int 459 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) 460 { 461 462 return (print_algos_sysctl(req, AF_INET6)); 463 } 464 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, 465 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 466 print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); 467 #endif 468 469 #ifdef INET 470 static int 471 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) 472 { 473 474 return (print_algos_sysctl(req, AF_INET)); 475 } 476 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, 477 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 478 print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); 479 #endif 480 481 /* 482 * Calculate delay between repeated failures. 483 * Returns current delay in milliseconds. 484 */ 485 static uint32_t 486 callout_calc_delay_ms(struct fib_data *fd) 487 { 488 uint32_t shift; 489 490 if (fd->fd_failed_rebuilds > 10) 491 shift = 10; 492 else 493 shift = fd->fd_failed_rebuilds; 494 495 return ((1 << shift) * FIB_CALLOUT_DELAY_MS); 496 } 497 498 static void 499 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms) 500 { 501 502 FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action); 503 fd->fd_callout_action = action; 504 callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0, 505 handle_fd_callout, fd, 0); 506 } 507 508 static void 509 schedule_fd_rebuild(struct fib_data *fd, const char *reason) 510 { 511 512 RIB_WLOCK_ASSERT(fd->fd_rh); 513 514 if (!fd->fd_need_rebuild) { 515 fd->fd_need_rebuild = true; 516 /* Stop batch updates */ 517 fd->fd_batch = false; 518 519 /* 520 * Potentially re-schedules pending callout 521 * initiated by schedule_algo_eval. 522 */ 523 FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)", 524 reason, fd->fd_failed_rebuilds); 525 schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd)); 526 } 527 } 528 529 static void 530 sync_rib_gen(struct fib_data *fd) 531 { 532 FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib); 533 fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib; 534 } 535 536 static int64_t 537 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv) 538 { 539 int64_t diff = 0; 540 541 diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000; 542 diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000; 543 544 return (diff); 545 } 546 547 static void 548 add_tv_diff_ms(struct timeval *tv, int ms) 549 { 550 tv->tv_sec += ms / 1000; 551 ms = ms % 1000; 552 if (ms * 1000 + tv->tv_usec < 1000000) 553 tv->tv_usec += ms * 1000; 554 else { 555 tv->tv_sec += 1; 556 tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000; 557 } 558 } 559 560 /* 561 * Marks the time when algo state diverges from the rib state. 562 */ 563 static void 564 mark_diverge_time(struct fib_data *fd) 565 { 566 struct fib_sync_status *fd_ss = &fd->fd_ss; 567 568 getmicrouptime(&fd_ss->diverge_time); 569 fd_ss->bucket_id = 0; 570 fd_ss->bucket_changes = 0; 571 } 572 573 /* 574 * Calculates and updates the next algorithm sync time, based on the current activity. 575 * 576 * The intent is to provide reasonable balance between the update 577 * latency and efficient batching when changing large amount of routes. 578 * 579 * High-level algorithm looks the following: 580 * 1) all changes are bucketed in 50ms intervals 581 * 2) If amount of changes within the bucket is greater than the threshold, 582 * the update gets delayed, up to maximum delay threshold. 583 */ 584 static void 585 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action) 586 { 587 uint32_t bucket_id, new_delay = 0; 588 struct timeval tv; 589 590 /* Fetch all variables at once to ensure consistent reads */ 591 uint32_t bucket_time_ms = V_update_bucket_time_ms; 592 uint32_t threshold_rate = V_bucket_change_threshold_rate; 593 uint32_t max_delay_ms = V_fib_max_sync_delay_ms; 594 595 if (bucket_time_ms == 0) 596 bucket_time_ms = 50; 597 /* calculate per-bucket threshold rate */ 598 threshold_rate = threshold_rate * bucket_time_ms / 1000; 599 600 getmicrouptime(&tv); 601 602 struct fib_sync_status *fd_ss = &fd->fd_ss; 603 604 bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms; 605 606 if (fd_ss->bucket_id == bucket_id) { 607 fd_ss->bucket_changes++; 608 if (fd_ss->bucket_changes == threshold_rate) { 609 new_delay = (bucket_id + 2) * bucket_time_ms; 610 if (new_delay <= max_delay_ms) { 611 FD_PRINTF(LOG_DEBUG, fd, 612 "hit threshold of %u routes, delay update," 613 "bucket: %u, total delay: %u", 614 threshold_rate, bucket_id + 1, new_delay); 615 } else { 616 new_delay = 0; 617 FD_PRINTF(LOG_DEBUG, fd, 618 "maximum sync delay (%u ms) reached", max_delay_ms); 619 } 620 } else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1)) 621 new_delay = bucket_time_ms; 622 } else { 623 fd_ss->bucket_id = bucket_id; 624 fd_ss->bucket_changes = 1; 625 } 626 627 if (new_delay > 0) { 628 /* Calculated time has been updated */ 629 struct timeval new_tv = fd_ss->diverge_time; 630 add_tv_diff_ms(&new_tv, new_delay); 631 632 int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv); 633 schedule_callout(fd, action, delay_ms); 634 } 635 } 636 637 static void 638 update_algo_state(struct fib_data *fd) 639 { 640 641 RIB_WLOCK_ASSERT(fd->fd_rh); 642 643 if (fd->fd_batch || fd->fd_need_rebuild) { 644 enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH; 645 update_rebuild_delay(fd, action); 646 return; 647 } 648 649 if (fd->fd_num_changes++ == 0) { 650 /* Start callout to consider switch */ 651 if (!callout_pending(&fd->fd_callout)) 652 schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS); 653 } else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) { 654 /* Reset callout to exec immediately */ 655 if (fd->fd_callout_action == FDA_EVAL) 656 schedule_callout(fd, FDA_EVAL, 1); 657 } 658 } 659 660 static bool 661 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc) 662 { 663 struct nhop_object *nh; 664 665 /* Sync addition/removal of interface routes */ 666 switch (rc->rc_cmd) { 667 case RTM_ADD: 668 nh = rc->rc_nh_new; 669 if (!NH_IS_NHGRP(nh)) { 670 if (!(nh->nh_flags & NHF_GATEWAY)) 671 return (true); 672 if (nhop_get_rtflags(nh) & RTF_STATIC) 673 return (true); 674 } 675 break; 676 case RTM_DELETE: 677 nh = rc->rc_nh_old; 678 if (!NH_IS_NHGRP(nh)) { 679 if (!(nh->nh_flags & NHF_GATEWAY)) 680 return (true); 681 if (nhop_get_rtflags(nh) & RTF_STATIC) 682 return (true); 683 } 684 break; 685 } 686 687 return (false); 688 } 689 690 static bool 691 apply_rtable_changes(struct fib_data *fd) 692 { 693 enum flm_op_result result; 694 struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; 695 696 result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data); 697 698 if (result == FLM_SUCCESS) { 699 sync_rib_gen(fd); 700 for (int i = 0; i < q->count; i++) 701 if (q->entries[i].nh_old) 702 fib_unref_nhop(fd, q->entries[i].nh_old); 703 q->count = 0; 704 } 705 fd->fd_batch = false; 706 707 return (result == FLM_SUCCESS); 708 } 709 710 static bool 711 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc) 712 { 713 int plen = 0; 714 715 switch (fd->fd_family) { 716 #ifdef INET 717 case AF_INET: 718 rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid); 719 break; 720 #endif 721 #ifdef INET6 722 case AF_INET6: 723 rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid); 724 break; 725 #endif 726 } 727 728 ce->plen = plen; 729 ce->nh_old = rc->rc_nh_old; 730 ce->nh_new = rc->rc_nh_new; 731 if (ce->nh_new != NULL) { 732 if (fib_ref_nhop(fd, ce->nh_new) == 0) 733 return (false); 734 } 735 736 return (true); 737 } 738 739 static bool 740 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc) 741 { 742 struct fib_change_queue *q = &fd->fd_ss.fd_change_queue; 743 744 if (q->count >= q->size) { 745 uint32_t q_size; 746 747 if (q->size == 0) 748 q_size = 256; /* ~18k memory */ 749 else 750 q_size = q->size * 2; 751 752 size_t size = q_size * sizeof(struct fib_change_entry); 753 void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO); 754 if (a == NULL) { 755 FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements", 756 q_size); 757 return (false); 758 } 759 q->entries = a; 760 q->size = q_size; 761 } 762 763 return (fill_change_entry(fd, &q->entries[q->count++], rc)); 764 } 765 766 /* 767 * Rib subscription handler. Checks if the algorithm is ready to 768 * receive updates, handles nexthop refcounting and passes change 769 * data to the algorithm callback. 770 */ 771 static void 772 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, 773 void *_data) 774 { 775 struct fib_data *fd = (struct fib_data *)_data; 776 enum flm_op_result result; 777 778 RIB_WLOCK_ASSERT(rnh); 779 780 /* 781 * There is a small gap between subscribing for route changes 782 * and initiating rtable dump. Avoid receiving route changes 783 * prior to finishing rtable dump by checking `init_done`. 784 */ 785 if (!fd->init_done) 786 return; 787 788 bool immediate_sync = need_immediate_sync(fd, rc); 789 790 /* Consider scheduling algorithm re-evaluation */ 791 update_algo_state(fd); 792 793 /* 794 * If algo requested rebuild, stop sending updates by default. 795 * This simplifies nexthop refcount handling logic. 796 */ 797 if (fd->fd_need_rebuild) { 798 if (immediate_sync) 799 rebuild_fd(fd, "rtable change type enforced sync"); 800 return; 801 } 802 803 /* 804 * Algo requested updates to be delivered in batches. 805 * Add the current change to the queue and return. 806 */ 807 if (fd->fd_batch) { 808 if (immediate_sync) { 809 if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd)) 810 rebuild_fd(fd, "batch sync failed"); 811 } else { 812 if (!queue_rtable_change(fd, rc)) 813 schedule_fd_rebuild(fd, "batch queue failed"); 814 } 815 return; 816 } 817 818 /* 819 * Maintain guarantee that every nexthop returned by the dataplane 820 * lookup has > 0 refcount, so can be safely referenced within current 821 * epoch. 822 */ 823 if (rc->rc_nh_new != NULL) { 824 if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { 825 /* ran out of indexes */ 826 schedule_fd_rebuild(fd, "ran out of nhop indexes"); 827 return; 828 } 829 } 830 831 result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); 832 833 switch (result) { 834 case FLM_SUCCESS: 835 sync_rib_gen(fd); 836 /* Unref old nexthop on success */ 837 if (rc->rc_nh_old != NULL) 838 fib_unref_nhop(fd, rc->rc_nh_old); 839 break; 840 case FLM_BATCH: 841 842 /* 843 * Algo asks to batch the changes. 844 */ 845 if (queue_rtable_change(fd, rc)) { 846 if (!immediate_sync) { 847 fd->fd_batch = true; 848 mark_diverge_time(fd); 849 update_rebuild_delay(fd, FDA_BATCH); 850 break; 851 } 852 if (apply_rtable_changes(fd)) 853 break; 854 } 855 FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild"); 856 857 case FLM_REBUILD: 858 859 /* 860 * Algo is not able to apply the update. 861 * Schedule algo rebuild. 862 */ 863 if (!immediate_sync) { 864 mark_diverge_time(fd); 865 schedule_fd_rebuild(fd, "algo requested rebuild"); 866 break; 867 } 868 869 FD_PRINTF(LOG_INFO, fd, "running sync rebuild"); 870 rebuild_fd(fd, "rtable change type enforced sync"); 871 break; 872 case FLM_ERROR: 873 874 /* 875 * Algo reported a non-recoverable error. 876 * Record the error and schedule rebuild, which will 877 * trigger best algo selection. 878 */ 879 FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); 880 if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) 881 FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); 882 schedule_fd_rebuild(fd, "algo reported non-recoverable error"); 883 } 884 } 885 886 static void 887 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) 888 { 889 890 if (old_fd == NULL) { 891 // TODO: read from rtable 892 fd->number_nhops = 16; 893 return; 894 } 895 896 if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) 897 fd->number_nhops = 2 * old_fd->number_nhops; 898 else 899 fd->number_nhops = old_fd->number_nhops; 900 } 901 902 struct walk_cbdata { 903 struct fib_data *fd; 904 flm_dump_t *func; 905 enum flm_op_result result; 906 }; 907 908 /* 909 * Handler called after all rtenties have been dumped. 910 * Performs post-dump framework checks and calls 911 * algo:flm_dump_end_cb(). 912 * 913 * Updates walk_cbdata result. 914 */ 915 static void 916 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) 917 { 918 struct walk_cbdata *w = (struct walk_cbdata *)_data; 919 struct fib_data *fd = w->fd; 920 921 RIB_WLOCK_ASSERT(w->fd->fd_rh); 922 923 if (rnh->rib_dying) { 924 w->result = FLM_ERROR; 925 return; 926 } 927 928 if (fd->hit_nhops) { 929 FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", 930 fd->nh_ref_table->count); 931 if (w->result == FLM_SUCCESS) 932 w->result = FLM_REBUILD; 933 return; 934 } 935 936 if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) 937 return; 938 939 /* Post-dump hook, dump successful */ 940 w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); 941 942 if (w->result == FLM_SUCCESS) { 943 /* Mark init as done to allow routing updates */ 944 fd->init_done = 1; 945 } 946 } 947 948 /* 949 * Callback for each entry in rib. 950 * Calls algo:flm_dump_rib_item_cb func as a part of initial 951 * route table synchronisation. 952 */ 953 static int 954 sync_algo_cb(struct rtentry *rt, void *_data) 955 { 956 struct walk_cbdata *w = (struct walk_cbdata *)_data; 957 958 RIB_WLOCK_ASSERT(w->fd->fd_rh); 959 960 if (w->result == FLM_SUCCESS && w->func) { 961 962 /* 963 * Reference nexthops to maintain guarantee that 964 * each nexthop returned by datapath has > 0 references 965 * and can be safely referenced within current epoch. 966 */ 967 struct nhop_object *nh = rt_get_raw_nhop(rt); 968 if (fib_ref_nhop(w->fd, nh) != 0) 969 w->result = w->func(rt, w->fd->fd_algo_data); 970 else 971 w->result = FLM_REBUILD; 972 } 973 974 return (0); 975 } 976 977 /* 978 * Dump all routing table state to the algo instance. 979 */ 980 static enum flm_op_result 981 sync_algo(struct fib_data *fd) 982 { 983 struct walk_cbdata w = { 984 .fd = fd, 985 .func = fd->fd_flm->flm_dump_rib_item_cb, 986 .result = FLM_SUCCESS, 987 }; 988 989 rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w); 990 991 FD_PRINTF(LOG_INFO, fd, 992 "initial dump completed (rtable version: %d), result: %s", 993 fd->fd_rh->rnh_gen, print_op_result(w.result)); 994 995 return (w.result); 996 } 997 998 /* 999 * Schedules epoch-backed @fd instance deletion. 1000 * * Unlinks @fd from the list of active algo instances. 1001 * * Removes rib subscription. 1002 * * Stops callout. 1003 * * Schedules actual deletion. 1004 * 1005 * Assume @fd is already unlinked from the datapath. 1006 */ 1007 static int 1008 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) 1009 { 1010 bool is_dead; 1011 1012 NET_EPOCH_ASSERT(); 1013 RIB_WLOCK_ASSERT(fd->fd_rh); 1014 1015 FIB_MOD_LOCK(); 1016 is_dead = fd->fd_dead; 1017 if (!is_dead) 1018 fd->fd_dead = true; 1019 if (fd->fd_linked) { 1020 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 1021 fd->fd_linked = false; 1022 } 1023 FIB_MOD_UNLOCK(); 1024 if (is_dead) 1025 return (0); 1026 1027 FD_PRINTF(LOG_INFO, fd, "DETACH"); 1028 1029 if (fd->fd_rs != NULL) 1030 rib_unsubscribe_locked(fd->fd_rs); 1031 1032 /* 1033 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls 1034 * will be executed, hence no _new_ callout schedules will happen. 1035 */ 1036 callout_stop(&fd->fd_callout); 1037 1038 fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx); 1039 1040 return (0); 1041 } 1042 1043 /* 1044 * Wipe all fd instances from the list matching rib specified by @rh. 1045 * If @keep_first is set, remove all but the first record. 1046 */ 1047 static void 1048 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) 1049 { 1050 struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); 1051 struct fib_data *fd, *fd_tmp; 1052 struct epoch_tracker et; 1053 1054 FIB_MOD_LOCK(); 1055 TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { 1056 if (fd->fd_rh == rh) { 1057 if (keep_first) { 1058 keep_first = false; 1059 continue; 1060 } 1061 TAILQ_REMOVE(&V_fib_data_list, fd, entries); 1062 fd->fd_linked = false; 1063 TAILQ_INSERT_TAIL(&tmp_head, fd, entries); 1064 } 1065 } 1066 FIB_MOD_UNLOCK(); 1067 1068 /* Pass 2: remove each entry */ 1069 NET_EPOCH_ENTER(et); 1070 TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { 1071 if (!in_callout) 1072 RIB_WLOCK(fd->fd_rh); 1073 schedule_destroy_fd_instance(fd, in_callout); 1074 if (!in_callout) 1075 RIB_WUNLOCK(fd->fd_rh); 1076 } 1077 NET_EPOCH_EXIT(et); 1078 } 1079 1080 void 1081 fib_destroy_rib(struct rib_head *rh) 1082 { 1083 1084 /* 1085 * rnh has `is_dying` flag set, so setup of new fd's will fail at 1086 * sync_algo() stage, preventing new entries to be added to the list 1087 * of active algos. Remove all existing entries for the particular rib. 1088 */ 1089 fib_cleanup_algo(rh, false, false); 1090 } 1091 1092 /* 1093 * Finalises fd destruction by freeing all fd resources. 1094 */ 1095 static void 1096 destroy_fd_instance(struct fib_data *fd) 1097 { 1098 1099 FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); 1100 1101 /* Call destroy callback first */ 1102 if (fd->fd_algo_data != NULL) 1103 fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); 1104 1105 /* Nhop table */ 1106 if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { 1107 for (int i = 0; i < fd->number_nhops; i++) { 1108 if (!is_idx_free(fd, i)) { 1109 FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p", 1110 i, fd->nh_idx[i]); 1111 nhop_free_any(fd->nh_idx[i]); 1112 } 1113 } 1114 free(fd->nh_idx, M_RTABLE); 1115 } 1116 if (fd->nh_ref_table != NULL) 1117 free(fd->nh_ref_table, M_RTABLE); 1118 1119 if (fd->fd_ss.fd_change_queue.entries != NULL) 1120 free(fd->fd_ss.fd_change_queue.entries, M_TEMP); 1121 1122 fib_unref_algo(fd->fd_flm); 1123 1124 free(fd, M_RTABLE); 1125 } 1126 1127 /* 1128 * Epoch callback indicating fd is safe to destroy 1129 */ 1130 static void 1131 destroy_fd_instance_epoch(epoch_context_t ctx) 1132 { 1133 struct fib_data *fd; 1134 1135 fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); 1136 1137 CURVNET_SET(fd->fd_vnet); 1138 destroy_fd_instance(fd); 1139 CURVNET_RESTORE(); 1140 } 1141 1142 /* 1143 * Tries to setup fd instance. 1144 * - Allocates fd/nhop table 1145 * - Runs algo:flm_init_cb algo init 1146 * - Subscribes fd to the rib 1147 * - Runs rtable dump 1148 * - Adds instance to the list of active instances. 1149 * 1150 * Returns: operation result. Fills in @pfd with resulting fd on success. 1151 * 1152 */ 1153 static enum flm_op_result 1154 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 1155 struct fib_data *old_fd, struct fib_data **pfd) 1156 { 1157 struct fib_data *fd; 1158 size_t size; 1159 enum flm_op_result result; 1160 1161 /* Allocate */ 1162 fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO); 1163 if (fd == NULL) { 1164 *pfd = NULL; 1165 RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure"); 1166 return (FLM_REBUILD); 1167 } 1168 *pfd = fd; 1169 1170 estimate_nhop_scale(old_fd, fd); 1171 1172 fd->fd_rh = rh; 1173 fd->fd_family = rh->rib_family; 1174 fd->fd_fibnum = rh->rib_fibnum; 1175 callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0); 1176 fd->fd_vnet = curvnet; 1177 fd->fd_flm = flm; 1178 1179 FIB_MOD_LOCK(); 1180 flm->flm_refcount++; 1181 fd->fd_gen = ++fib_gen; 1182 FIB_MOD_UNLOCK(); 1183 1184 FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd); 1185 1186 /* Allocate nhidx -> nhop_ptr table */ 1187 size = fd->number_nhops * sizeof(void *); 1188 fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 1189 if (fd->nh_idx == NULL) { 1190 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size); 1191 return (FLM_REBUILD); 1192 } 1193 1194 /* Allocate nhop index refcount table */ 1195 size = sizeof(struct nhop_ref_table); 1196 size += fd->number_nhops * sizeof(uint32_t); 1197 fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO); 1198 if (fd->nh_ref_table == NULL) { 1199 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size); 1200 return (FLM_REBUILD); 1201 } 1202 FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops); 1203 1204 /* Okay, we're ready for algo init */ 1205 void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL; 1206 result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data); 1207 if (result != FLM_SUCCESS) { 1208 FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name); 1209 return (result); 1210 } 1211 1212 /* Try to subscribe */ 1213 if (flm->flm_change_rib_item_cb != NULL) { 1214 fd->fd_rs = rib_subscribe_locked(fd->fd_rh, 1215 handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE); 1216 if (fd->fd_rs == NULL) { 1217 FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes"); 1218 return (FLM_REBUILD); 1219 } 1220 } 1221 1222 /* Dump */ 1223 result = sync_algo(fd); 1224 if (result != FLM_SUCCESS) { 1225 FD_PRINTF(LOG_INFO, fd, "rib sync failed"); 1226 return (result); 1227 } 1228 FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully."); 1229 1230 FIB_MOD_LOCK(); 1231 /* 1232 * Insert fd in the beginning of a list, to maintain invariant 1233 * that first matching entry for the AF/fib is always the active 1234 * one. 1235 */ 1236 TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries); 1237 fd->fd_linked = true; 1238 FIB_MOD_UNLOCK(); 1239 1240 return (FLM_SUCCESS); 1241 } 1242 1243 /* 1244 * Sets up algo @flm for table @rh and links it to the datapath. 1245 * 1246 */ 1247 static enum flm_op_result 1248 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, 1249 struct fib_data *orig_fd, struct fib_data **pfd, bool attach) 1250 { 1251 struct fib_data *prev_fd, *new_fd; 1252 enum flm_op_result result; 1253 1254 NET_EPOCH_ASSERT(); 1255 RIB_WLOCK_ASSERT(rh); 1256 1257 prev_fd = orig_fd; 1258 new_fd = NULL; 1259 for (int i = 0; i < FIB_MAX_TRIES; i++) { 1260 result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd); 1261 1262 if ((result == FLM_SUCCESS) && attach) { 1263 if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp)) 1264 sync_rib_gen(new_fd); 1265 else 1266 result = FLM_REBUILD; 1267 } 1268 1269 if ((prev_fd != NULL) && (prev_fd != orig_fd)) { 1270 schedule_destroy_fd_instance(prev_fd, false); 1271 prev_fd = NULL; 1272 } 1273 1274 RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i, 1275 print_op_result(result)); 1276 1277 if (result == FLM_REBUILD) { 1278 prev_fd = new_fd; 1279 new_fd = NULL; 1280 continue; 1281 } 1282 1283 break; 1284 } 1285 1286 if (result != FLM_SUCCESS) { 1287 RH_PRINTF(LOG_WARNING, rh, 1288 "%s algo instance setup failed, failures=%d", flm->flm_name, 1289 orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0); 1290 /* update failure count */ 1291 FIB_MOD_LOCK(); 1292 if (orig_fd != NULL) 1293 orig_fd->fd_failed_rebuilds++; 1294 FIB_MOD_UNLOCK(); 1295 1296 /* Ban algo on non-recoverable error */ 1297 if (result == FLM_ERROR) 1298 flm_error_add(flm, rh->rib_fibnum); 1299 1300 if ((prev_fd != NULL) && (prev_fd != orig_fd)) 1301 schedule_destroy_fd_instance(prev_fd, false); 1302 if (new_fd != NULL) { 1303 schedule_destroy_fd_instance(new_fd, false); 1304 new_fd = NULL; 1305 } 1306 } 1307 1308 *pfd = new_fd; 1309 return (result); 1310 } 1311 1312 /* 1313 * Tries to sync algo with the current rtable state, either 1314 * by executing batch update or rebuilding. 1315 * Returns true on success. 1316 */ 1317 static bool 1318 execute_callout_action(struct fib_data *fd) 1319 { 1320 enum fib_callout_action action = fd->fd_callout_action; 1321 struct fib_lookup_module *flm_new = NULL; 1322 bool result = true; 1323 1324 NET_EPOCH_ASSERT(); 1325 RIB_WLOCK_ASSERT(fd->fd_rh); 1326 1327 fd->fd_need_rebuild = false; 1328 fd->fd_batch = false; 1329 fd->fd_num_changes = 0; 1330 1331 /* First, check if we're still OK to use this algo */ 1332 if (!is_algo_fixed(fd->fd_rh)) 1333 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1334 if (flm_new != NULL) 1335 action = FDA_REBUILD; 1336 1337 if (action == FDA_BATCH) { 1338 /* Try to sync */ 1339 if (!apply_rtable_changes(fd)) 1340 action = FDA_REBUILD; 1341 } 1342 1343 if (action == FDA_REBUILD) 1344 result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); 1345 if (flm_new != NULL) 1346 fib_unref_algo(flm_new); 1347 1348 return (result); 1349 } 1350 1351 /* 1352 * Callout for all scheduled fd-related work. 1353 * - Checks if the current algo is still the best algo 1354 * - Synchronises algo instance to the rtable (batch usecase) 1355 * - Creates a new instance of an algo for af/fib if desired. 1356 */ 1357 static void 1358 handle_fd_callout(void *_data) 1359 { 1360 struct fib_data *fd = (struct fib_data *)_data; 1361 struct epoch_tracker et; 1362 1363 FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action); 1364 1365 NET_EPOCH_ENTER(et); 1366 CURVNET_SET(fd->fd_vnet); 1367 execute_callout_action(fd); 1368 CURVNET_RESTORE(); 1369 NET_EPOCH_EXIT(et); 1370 } 1371 1372 /* 1373 * Tries to create new algo instance based on @fd data. 1374 * Returns true on success. 1375 */ 1376 static bool 1377 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new) 1378 { 1379 struct fib_data *fd_new, *fd_tmp = NULL; 1380 bool result; 1381 1382 if (flm_new == fd->fd_flm) 1383 fd_tmp = fd; 1384 else 1385 FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name); 1386 1387 result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true); 1388 if (result != FLM_SUCCESS) { 1389 FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed"); 1390 return (false); 1391 } 1392 FD_PRINTF(LOG_INFO, fd_new, "switched to new instance"); 1393 1394 /* Remove old instance */ 1395 schedule_destroy_fd_instance(fd, true); 1396 1397 return (true); 1398 } 1399 1400 static bool 1401 rebuild_fd(struct fib_data *fd, const char *reason) 1402 { 1403 struct fib_lookup_module *flm_new = NULL; 1404 bool result; 1405 1406 if (!is_algo_fixed(fd->fd_rh)) 1407 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm); 1408 1409 FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason); 1410 result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm); 1411 if (flm_new != NULL) 1412 fib_unref_algo(flm_new); 1413 1414 if (!result) { 1415 FD_PRINTF(LOG_ERR, fd, "sync rebuild failed"); 1416 schedule_fd_rebuild(fd, "sync rebuild failed"); 1417 } 1418 1419 return (result); 1420 } 1421 1422 /* 1423 * Finds algo by name/family. 1424 * Returns referenced algo or NULL. 1425 */ 1426 static struct fib_lookup_module * 1427 fib_find_algo(const char *algo_name, int family) 1428 { 1429 struct fib_lookup_module *flm; 1430 1431 FIB_MOD_LOCK(); 1432 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1433 if ((strcmp(flm->flm_name, algo_name) == 0) && 1434 (family == flm->flm_family)) { 1435 flm->flm_refcount++; 1436 FIB_MOD_UNLOCK(); 1437 return (flm); 1438 } 1439 } 1440 FIB_MOD_UNLOCK(); 1441 1442 return (NULL); 1443 } 1444 1445 static void 1446 fib_unref_algo(struct fib_lookup_module *flm) 1447 { 1448 1449 FIB_MOD_LOCK(); 1450 flm->flm_refcount--; 1451 FIB_MOD_UNLOCK(); 1452 } 1453 1454 static int 1455 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req) 1456 { 1457 struct fib_lookup_module *flm = NULL; 1458 struct fib_data *fd = NULL; 1459 char old_algo_name[32], algo_name[32]; 1460 struct rib_head *rh = NULL; 1461 enum flm_op_result result; 1462 struct epoch_tracker et; 1463 int error; 1464 1465 /* Fetch current algo/rib for af/family */ 1466 FIB_MOD_LOCK(); 1467 TAILQ_FOREACH(fd, &V_fib_data_list, entries) { 1468 if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum)) 1469 break; 1470 } 1471 if (fd == NULL) { 1472 FIB_MOD_UNLOCK(); 1473 return (ENOENT); 1474 } 1475 rh = fd->fd_rh; 1476 strlcpy(old_algo_name, fd->fd_flm->flm_name, 1477 sizeof(old_algo_name)); 1478 FIB_MOD_UNLOCK(); 1479 1480 strlcpy(algo_name, old_algo_name, sizeof(algo_name)); 1481 error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req); 1482 if (error != 0 || req->newptr == NULL) 1483 return (error); 1484 1485 if (strcmp(algo_name, old_algo_name) == 0) 1486 return (0); 1487 1488 /* New algorithm name is different */ 1489 flm = fib_find_algo(algo_name, family); 1490 if (flm == NULL) { 1491 RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name); 1492 return (ESRCH); 1493 } 1494 1495 fd = NULL; 1496 NET_EPOCH_ENTER(et); 1497 RIB_WLOCK(rh); 1498 result = setup_fd_instance(flm, rh, NULL, &fd, true); 1499 RIB_WUNLOCK(rh); 1500 NET_EPOCH_EXIT(et); 1501 fib_unref_algo(flm); 1502 if (result != FLM_SUCCESS) 1503 return (EINVAL); 1504 1505 /* Disable automated jumping between algos */ 1506 FIB_MOD_LOCK(); 1507 set_algo_fixed(rh); 1508 FIB_MOD_UNLOCK(); 1509 /* Remove old instance(s) */ 1510 fib_cleanup_algo(rh, true, false); 1511 1512 /* Drain cb so user can unload the module after userret if so desired */ 1513 NET_EPOCH_DRAIN_CALLBACKS(); 1514 1515 return (0); 1516 } 1517 1518 #ifdef INET 1519 static int 1520 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS) 1521 { 1522 1523 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req)); 1524 } 1525 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo, 1526 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1527 set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo"); 1528 #endif 1529 1530 #ifdef INET6 1531 static int 1532 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS) 1533 { 1534 1535 return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req)); 1536 } 1537 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo, 1538 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 1539 set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo"); 1540 #endif 1541 1542 static struct nhop_object * 1543 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid) 1544 { 1545 return (NULL); 1546 } 1547 1548 static void 1549 destroy_fdh_epoch(epoch_context_t ctx) 1550 { 1551 struct fib_dp_header *fdh; 1552 1553 fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx); 1554 free(fdh, M_RTABLE); 1555 } 1556 1557 static struct fib_dp_header * 1558 alloc_fib_dp_array(uint32_t num_tables, bool waitok) 1559 { 1560 size_t sz; 1561 struct fib_dp_header *fdh; 1562 1563 sz = sizeof(struct fib_dp_header); 1564 sz += sizeof(struct fib_dp) * num_tables; 1565 fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO); 1566 if (fdh != NULL) { 1567 fdh->fdh_num_tables = num_tables; 1568 /* 1569 * Set dummy lookup function ptr always returning NULL, so 1570 * we can delay algo init. 1571 */ 1572 for (uint32_t i = 0; i < num_tables; i++) 1573 fdh->fdh_idx[i].f = dummy_lookup; 1574 } 1575 return (fdh); 1576 } 1577 1578 static struct fib_dp_header * 1579 get_fib_dp_header(struct fib_dp *dp) 1580 { 1581 1582 return (__containerof((void *)dp, struct fib_dp_header, fdh_idx)); 1583 } 1584 1585 /* 1586 * Replace per-family index pool @pdp with a new one which 1587 * contains updated callback/algo data from @fd. 1588 * Returns true on success. 1589 */ 1590 static bool 1591 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp) 1592 { 1593 struct fib_dp_header *new_fdh, *old_fdh; 1594 1595 NET_EPOCH_ASSERT(); 1596 1597 FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p", 1598 curvnet, dp->f, dp->arg); 1599 1600 FIB_MOD_LOCK(); 1601 old_fdh = get_fib_dp_header(*pdp); 1602 1603 if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) { 1604 /* 1605 * Function is the same, data pointer needs update. 1606 * Perform in-line replace without reallocation. 1607 */ 1608 old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg; 1609 FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh); 1610 FIB_MOD_UNLOCK(); 1611 return (true); 1612 } 1613 1614 new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false); 1615 FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh); 1616 if (new_fdh == NULL) { 1617 FIB_MOD_UNLOCK(); 1618 FD_PRINTF(LOG_WARNING, fd, "error attaching datapath"); 1619 return (false); 1620 } 1621 1622 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1623 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1624 /* Update relevant data structure for @fd */ 1625 new_fdh->fdh_idx[fd->fd_fibnum] = *dp; 1626 1627 /* Ensure memcpy() writes have completed */ 1628 atomic_thread_fence_rel(); 1629 /* Set new datapath pointer */ 1630 *pdp = &new_fdh->fdh_idx[0]; 1631 FIB_MOD_UNLOCK(); 1632 FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh); 1633 1634 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 1635 1636 return (true); 1637 } 1638 1639 static struct fib_dp ** 1640 get_family_dp_ptr(int family) 1641 { 1642 switch (family) { 1643 #ifdef INET 1644 case AF_INET: 1645 return (&V_inet_dp); 1646 #endif 1647 #ifdef INET6 1648 case AF_INET6: 1649 return (&V_inet6_dp); 1650 #endif 1651 } 1652 return (NULL); 1653 } 1654 1655 /* 1656 * Make datapath use fib instance @fd 1657 */ 1658 bool 1659 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp) 1660 { 1661 struct fib_dp **pdp; 1662 1663 pdp = get_family_dp_ptr(fd->fd_family); 1664 return (replace_rtables_family(pdp, fd, dp)); 1665 } 1666 1667 /* 1668 * Grow datapath pointers array. 1669 * Called from sysctl handler on growing number of routing tables. 1670 */ 1671 static void 1672 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables) 1673 { 1674 struct fib_dp_header *new_fdh, *old_fdh = NULL; 1675 1676 new_fdh = alloc_fib_dp_array(new_num_tables, true); 1677 1678 FIB_MOD_LOCK(); 1679 if (*pdp != NULL) { 1680 old_fdh = get_fib_dp_header(*pdp); 1681 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0], 1682 old_fdh->fdh_num_tables * sizeof(struct fib_dp)); 1683 } 1684 1685 /* Wait till all writes completed */ 1686 atomic_thread_fence_rel(); 1687 1688 *pdp = &new_fdh->fdh_idx[0]; 1689 FIB_MOD_UNLOCK(); 1690 1691 if (old_fdh != NULL) 1692 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 1693 } 1694 1695 /* 1696 * Grows per-AF arrays of datapath pointers for each supported family. 1697 * Called from fibs resize sysctl handler. 1698 */ 1699 void 1700 fib_grow_rtables(uint32_t new_num_tables) 1701 { 1702 1703 #ifdef INET 1704 grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables); 1705 #endif 1706 #ifdef INET6 1707 grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables); 1708 #endif 1709 } 1710 1711 void 1712 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo) 1713 { 1714 1715 bzero(rinfo, sizeof(struct rib_rtable_info)); 1716 rinfo->num_prefixes = rh->rnh_prefixes; 1717 rinfo->num_nhops = nhops_get_count(rh); 1718 #ifdef ROUTE_MPATH 1719 rinfo->num_nhgrp = nhgrp_get_count(rh); 1720 #endif 1721 } 1722 1723 /* 1724 * Updates pointer to the algo data for the @fd. 1725 */ 1726 void 1727 fib_set_algo_ptr(struct fib_data *fd, void *algo_data) 1728 { 1729 RIB_WLOCK_ASSERT(fd->fd_rh); 1730 1731 fd->fd_algo_data = algo_data; 1732 } 1733 1734 /* 1735 * Calls @callback with @ctx after the end of a current epoch. 1736 */ 1737 void 1738 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx) 1739 { 1740 NET_EPOCH_CALL(callback, ctx); 1741 } 1742 1743 /* 1744 * Accessor to get rib instance @fd is attached to. 1745 */ 1746 struct rib_head * 1747 fib_get_rh(struct fib_data *fd) 1748 { 1749 1750 return (fd->fd_rh); 1751 } 1752 1753 /* 1754 * Accessor to export idx->nhop array 1755 */ 1756 struct nhop_object ** 1757 fib_get_nhop_array(struct fib_data *fd) 1758 { 1759 1760 return (fd->nh_idx); 1761 } 1762 1763 static uint32_t 1764 get_nhop_idx(struct nhop_object *nh) 1765 { 1766 #ifdef ROUTE_MPATH 1767 if (NH_IS_NHGRP(nh)) 1768 return (nhgrp_get_idx((struct nhgrp_object *)nh)); 1769 else 1770 #endif 1771 return (nhop_get_idx(nh)); 1772 } 1773 1774 uint32_t 1775 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh) 1776 { 1777 1778 return (get_nhop_idx(nh)); 1779 } 1780 1781 static bool 1782 is_idx_free(struct fib_data *fd, uint32_t index) 1783 { 1784 1785 return (fd->nh_ref_table->refcnt[index] == 0); 1786 } 1787 1788 static uint32_t 1789 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh) 1790 { 1791 uint32_t idx = get_nhop_idx(nh); 1792 1793 if (idx >= fd->number_nhops) { 1794 fd->hit_nhops = 1; 1795 return (0); 1796 } 1797 1798 if (is_idx_free(fd, idx)) { 1799 nhop_ref_any(nh); 1800 fd->nh_idx[idx] = nh; 1801 fd->nh_ref_table->count++; 1802 FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]); 1803 } 1804 fd->nh_ref_table->refcnt[idx]++; 1805 1806 return (idx); 1807 } 1808 1809 struct nhop_release_data { 1810 struct nhop_object *nh; 1811 struct epoch_context ctx; 1812 }; 1813 1814 static void 1815 release_nhop_epoch(epoch_context_t ctx) 1816 { 1817 struct nhop_release_data *nrd; 1818 1819 nrd = __containerof(ctx, struct nhop_release_data, ctx); 1820 nhop_free_any(nrd->nh); 1821 free(nrd, M_TEMP); 1822 } 1823 1824 /* 1825 * Delays nexthop refcount release. 1826 * Datapath may have the datastructures not updated yet, so the old 1827 * nexthop may still be returned till the end of current epoch. Delay 1828 * refcount removal, as we may be removing the last instance, which will 1829 * trigger nexthop deletion, rendering returned nexthop invalid. 1830 */ 1831 static void 1832 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh) 1833 { 1834 struct nhop_release_data *nrd; 1835 1836 nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO); 1837 if (nrd != NULL) { 1838 nrd->nh = nh; 1839 fib_epoch_call(release_nhop_epoch, &nrd->ctx); 1840 } else { 1841 /* 1842 * Unable to allocate memory. Leak nexthop to maintain guarantee 1843 * that each nhop can be referenced. 1844 */ 1845 FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh); 1846 } 1847 } 1848 1849 static void 1850 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh) 1851 { 1852 uint32_t idx = get_nhop_idx(nh); 1853 1854 KASSERT((idx < fd->number_nhops), ("invalid nhop index")); 1855 KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh")); 1856 1857 fd->nh_ref_table->refcnt[idx]--; 1858 if (fd->nh_ref_table->refcnt[idx] == 0) { 1859 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]); 1860 fib_schedule_release_nhop(fd, fd->nh_idx[idx]); 1861 } 1862 } 1863 1864 static void 1865 set_algo_fixed(struct rib_head *rh) 1866 { 1867 switch (rh->rib_family) { 1868 #ifdef INET 1869 case AF_INET: 1870 V_algo_fixed_inet = true; 1871 break; 1872 #endif 1873 #ifdef INET6 1874 case AF_INET6: 1875 V_algo_fixed_inet6 = true; 1876 break; 1877 #endif 1878 } 1879 } 1880 1881 static bool 1882 is_algo_fixed(struct rib_head *rh) 1883 { 1884 1885 switch (rh->rib_family) { 1886 #ifdef INET 1887 case AF_INET: 1888 return (V_algo_fixed_inet); 1889 #endif 1890 #ifdef INET6 1891 case AF_INET6: 1892 return (V_algo_fixed_inet6); 1893 #endif 1894 } 1895 return (false); 1896 } 1897 1898 /* 1899 * Runs the check on what would be the best algo for rib @rh, assuming 1900 * that the current algo is the one specified by @orig_flm. Note that 1901 * it can be NULL for initial selection. 1902 * 1903 * Returns referenced new algo or NULL if the current one is the best. 1904 */ 1905 static struct fib_lookup_module * 1906 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm) 1907 { 1908 uint8_t preference, curr_preference = 0, best_preference = 0; 1909 struct fib_lookup_module *flm, *best_flm = NULL; 1910 struct rib_rtable_info rinfo; 1911 int candidate_algos = 0; 1912 1913 fib_get_rtable_info(rh, &rinfo); 1914 1915 FIB_MOD_LOCK(); 1916 TAILQ_FOREACH(flm, &all_algo_list, entries) { 1917 if (flm->flm_family != rh->rib_family) 1918 continue; 1919 candidate_algos++; 1920 preference = flm->flm_get_pref(&rinfo); 1921 if (preference > best_preference) { 1922 if (!flm_error_check(flm, rh->rib_fibnum)) { 1923 best_preference = preference; 1924 best_flm = flm; 1925 } 1926 } 1927 if (flm == orig_flm) 1928 curr_preference = preference; 1929 } 1930 if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference)) 1931 best_flm->flm_refcount++; 1932 else 1933 best_flm = NULL; 1934 FIB_MOD_UNLOCK(); 1935 1936 RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)", 1937 candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference, 1938 best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"), 1939 best_preference); 1940 1941 return (best_flm); 1942 } 1943 1944 /* 1945 * Called when new route table is created. 1946 * Selects, allocates and attaches fib algo for the table. 1947 */ 1948 static bool 1949 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp) 1950 { 1951 struct fib_lookup_module *flm; 1952 struct fib_data *fd = NULL; 1953 enum flm_op_result result; 1954 struct epoch_tracker et; 1955 1956 flm = fib_check_best_algo(rh, NULL); 1957 if (flm == NULL) { 1958 RH_PRINTF(LOG_CRIT, rh, "no algo selected"); 1959 return (false); 1960 } 1961 RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name); 1962 1963 NET_EPOCH_ENTER(et); 1964 RIB_WLOCK(rh); 1965 result = setup_fd_instance(flm, rh, NULL, &fd, false); 1966 RIB_WUNLOCK(rh); 1967 NET_EPOCH_EXIT(et); 1968 1969 RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd); 1970 if (result == FLM_SUCCESS) 1971 *dp = fd->fd_dp; 1972 else 1973 RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name); 1974 1975 fib_unref_algo(flm); 1976 1977 return (result == FLM_SUCCESS); 1978 } 1979 1980 /* 1981 * Sets up fib algo instances for the non-initialized RIBs in the @family. 1982 * Allocates temporary datapath index to amortize datapaint index updates 1983 * with large @num_tables. 1984 */ 1985 void 1986 fib_setup_family(int family, uint32_t num_tables) 1987 { 1988 struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false); 1989 if (new_fdh == NULL) { 1990 ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family)); 1991 return; 1992 } 1993 1994 for (int i = 0; i < num_tables; i++) { 1995 struct rib_head *rh = rt_tables_get_rnh(i, family); 1996 if (rh->rib_algo_init) 1997 continue; 1998 if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i])) 1999 continue; 2000 2001 rh->rib_algo_init = true; 2002 } 2003 2004 FIB_MOD_LOCK(); 2005 struct fib_dp **pdp = get_family_dp_ptr(family); 2006 struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp); 2007 2008 /* Update the items not touched by the new init, from the old data pointer */ 2009 for (int i = 0; i < num_tables; i++) { 2010 if (new_fdh->fdh_idx[i].f == dummy_lookup) 2011 new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i]; 2012 } 2013 2014 /* Ensure all index writes have completed */ 2015 atomic_thread_fence_rel(); 2016 /* Set new datapath pointer */ 2017 *pdp = &new_fdh->fdh_idx[0]; 2018 2019 FIB_MOD_UNLOCK(); 2020 2021 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx); 2022 } 2023 2024 /* 2025 * Registers fib lookup module within the subsystem. 2026 */ 2027 int 2028 fib_module_register(struct fib_lookup_module *flm) 2029 { 2030 2031 FIB_MOD_LOCK(); 2032 ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name, 2033 print_family(flm->flm_family)); 2034 TAILQ_INSERT_TAIL(&all_algo_list, flm, entries); 2035 FIB_MOD_UNLOCK(); 2036 2037 return (0); 2038 } 2039 2040 /* 2041 * Tries to unregister fib lookup module. 2042 * 2043 * Returns 0 on success, EBUSY if module is still used 2044 * by some of the tables. 2045 */ 2046 int 2047 fib_module_unregister(struct fib_lookup_module *flm) 2048 { 2049 2050 FIB_MOD_LOCK(); 2051 if (flm->flm_refcount > 0) { 2052 FIB_MOD_UNLOCK(); 2053 return (EBUSY); 2054 } 2055 fib_error_clear_flm(flm); 2056 ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name, 2057 print_family(flm->flm_family)); 2058 TAILQ_REMOVE(&all_algo_list, flm, entries); 2059 FIB_MOD_UNLOCK(); 2060 2061 return (0); 2062 } 2063 2064 void 2065 vnet_fib_init(void) 2066 { 2067 2068 TAILQ_INIT(&V_fib_data_list); 2069 } 2070 2071 void 2072 vnet_fib_destroy(void) 2073 { 2074 2075 FIB_MOD_LOCK(); 2076 fib_error_clear(); 2077 FIB_MOD_UNLOCK(); 2078 } 2079